1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #ifndef SQREAD_H
19 #define SQREAD_H
20 
21 //  DO NOT INCLUDE THIS FILE DIRECTLY, include sqStore.H.
22 
23 #include "arrays.H"
24 #include "sequence.H"
25 
26 class sqRead;
27 class sqLibrary;
28 
29 class sqStore;
30 class sqStoreBlobWriter;
31 
32 class sqCache;
33 
34 //  Even though we can store up to 4GB blob files, we artificially limit it to 1 GB
35 //  for (presumed) better caching on object storage systems.  Though there are
36 //  65k files allowed, pieces that stream through the store (correction, RED, OEA)
37 //  run out of file handles well before that.
38 
39 const uint64 AS_BLOBFILE_MAX_SIZE  = 1024 * 1024 * 1024;
40 
41 
42 
43 //  The default version is set either by the user explicitly, or by the store
44 //  when it is opened.  It should never be unset.
45 //
46 //  If the store is created (new sqStore(...)) with no default set, it will
47 //  examine what reads are present and return the 'latest' available (raw,
48 //  corrected, trimmed, in that order).  If the homopolymer compression flag
49 //  exists, compressed reads will be returned.
50 //
51 //  If the default version is set before the store is opened, exactly those
52 //  reads will be returned.
53 //
54 //  The need for sqRead_normal arises when the store is set to return
55 //  homopoly compressed reads by default (if file 'homopolymerCompression'
56 //  exists).  sqRead_normal prevents sqStore_loadMetadata from enabling
57 //  homopoly compression.  It is otherwise not used (and so gets a quite
58 //  bogus value).
59 //
60 //  The 'exclude' function works ONLY AFTER the store is opened - since there
61 //  is no default set we can't remove a flag yet.
62 //
63 
64 typedef uint32  sqRead_which;
65 
66 const sqRead_which sqRead_unset      = 0x0000;   //  Nothing specified, query the store to decide what to return.
67 const sqRead_which sqRead_raw        = 0x0001;   //  Use noisy sequence.
68 const sqRead_which sqRead_corrected  = 0x0002;   //  Use corrected sequence.
69 const sqRead_which sqRead_normal     = 0x0080;   //  Return normal uncompressed bases.
70 const sqRead_which sqRead_compressed = 0x0004;   //  Return compressed bases.
71 const sqRead_which sqRead_trimmed    = 0x0008;   //  Return trimmed bases.
72 const sqRead_which sqRead_largest    = 0x0010;   //  Used to size an array.
73 
74 extern
75 sqRead_which  sqRead_defaultVersion;
76 
77 static
78 const
79 char *
toString(sqRead_which w)80 toString(sqRead_which w) {
81   const sqRead_which  c  = sqRead_compressed;
82   const sqRead_which  t  = sqRead_trimmed;
83   const sqRead_which  ct = sqRead_compressed | sqRead_trimmed;
84 
85   w &= ~sqRead_normal;  //  Disable flag so we report the normal uncompressed string.
86 
87   switch (w) {
88     case sqRead_unset:            return("unset");                           break;
89 
90     case sqRead_raw:              return("raw");                             break;
91     case sqRead_raw | c:          return("raw-compressed");                  break;
92     case sqRead_raw | t:          return("raw-trimmed");                     break;
93     case sqRead_raw | ct:         return("raw-compressed-trimmed");          break;
94 
95     case sqRead_corrected:        return("corrected");                       break;
96     case sqRead_corrected | c:    return("corrected-compressed");            break;
97     case sqRead_corrected | t:    return("corrected-trimmed");               break;
98     case sqRead_corrected | ct:   return("corrected-compressed-trimmed");    break;
99 
100     case sqRead_compressed:       return("compressed");                      break;
101     case sqRead_trimmed:          return("trimmed");                         break;
102 
103     default:                      return("undefined-mode");                  break;
104   }
105 
106   return("undefined-mode");
107 }
108 
109 static
110 void
sqRead_setDefaultVersion(sqRead_which v)111 sqRead_setDefaultVersion(sqRead_which v) {
112   sqRead_defaultVersion = v;
113 }
114 
115 static
116 void
sqRead_setDefaultVersionInclude(sqRead_which v)117 sqRead_setDefaultVersionInclude(sqRead_which v) {
118   sqRead_defaultVersion |= v;
119 }
120 
121 static
122 void
sqRead_setDefaultVersionExclude(sqRead_which v)123 sqRead_setDefaultVersionExclude(sqRead_which v) {
124   sqRead_defaultVersion &= ~v;
125 }
126 
127 static    //  Return true of the default version has mode 'v' set.
128 bool
sqRead_defaultIs(sqRead_which v)129 sqRead_defaultIs(sqRead_which v) {
130   return((sqRead_defaultVersion & v) == v);
131 }
132 
133 static    //  Return true of the default version has mode 'v' not set.
134 bool
sqRead_defaultIsNot(sqRead_which v)135 sqRead_defaultIsNot(sqRead_which v) {
136   return((sqRead_defaultVersion & v) == sqRead_unset);
137 }
138 
139 static
140 const
141 char *
sqRead_getDefaultVersion(void)142 sqRead_getDefaultVersion(void) {
143   return(toString(sqRead_defaultVersion));
144 }
145 
146 
147 
148 
149 
150 
151 
152 
153 //  On disk sequence metadata.  Sequence data itself is in the blobs.
154 //
155 //  In general, you should not be directly using this class:
156 //    sqReadSeq_length() ALWAYS returns the untrimmed length of the read.
157 //
158 //    sqReadSeq_clearBgn() and sqReadSeq_clearEnd() will both return zero for
159 //    a read with no clear range set.
160 //
161 //  There are too many places where encapsulation needs to be broken to
162 //  make it truly private.  For example:
163 //    Marking reads as ignore in sqStoreCreate (to get rid of extra coverage).
164 //    Loading clear ranges.
165 //
166 class sqReadSeq {
167 public:
sqReadSeq_initialize(void)168   void        sqReadSeq_initialize(void) {
169     _seqValid  = 0;
170     _unused1   = 0;
171     _seqLength = 0;
172 
173     _ignoreU   = 0;
174     _ignoreT   = 0;
175     _clearBgn  = 0;
176 
177     _trimmed   = 0;
178     _unused2   = 0;
179     _clearEnd  = 0;
180   };
181 
182 private:
sqReadSeq_length(void)183   uint32      sqReadSeq_length(void)          { return(_seqLength);  };   //  ALWAYS untrimmed length.
184 
sqReadSeq_clearBgn(void)185   uint32      sqReadSeq_clearBgn(void)        { assert(_trimmed);  return(_clearBgn);   };   //  NOT valid unless trimmed.
sqReadSeq_clearEnd(void)186   uint32      sqReadSeq_clearEnd(void)        { assert(_trimmed);  return(_clearEnd);   };
187 
188 private:
sqReadSeq_valid(void)189   bool        sqReadSeq_valid(void)           { return(_seqValid);   };   //  True if there is data.
sqReadSeq_trimmed(void)190   bool        sqReadSeq_trimmed(void)         { return(_trimmed);    };   //  True if the clear range is set.
191 
sqReadSeq_ignoreU(void)192   bool        sqReadSeq_ignoreU(void)         { return(_ignoreU);               };   //  True if this read should be ignored.
sqReadSeq_ignoreT(void)193   bool        sqReadSeq_ignoreT(void)         { return(_ignoreU | _ignoreT);    };   //  True if the trimmed version should be ignored.
194 
195 
196   //  Call ONLY for initializing with a newly added sequence.
197   //  Do NOT call for trimmed reads.
198   //  The only caller should be sqReadDataWriter::sqReadDataWriter_writeBlob().
199   //  If you're thinking you want to call this, think again.
200 private:
sqReadSeq_setLength(char * bases,uint32 basesLen,bool doCompress)201   void        sqReadSeq_setLength(char *bases, uint32 basesLen, bool doCompress) {
202     uint32  sl = 0;
203 
204     assert(bases[basesLen] == 0);
205 
206     if (doCompress == false)
207       sl = basesLen;
208     else
209       sl = homopolyCompress(bases, basesLen);
210 
211     assert(_seqValid == 0);
212 
213     _seqValid  = 1;
214     _unused1   = 0;
215     _seqLength = sl;
216 
217     _ignoreU   = 0;
218     _ignoreT   = 0;
219     _clearBgn  = 0;
220 
221     _trimmed   = 0;
222     _unused2   = 0;
223     _clearEnd  = sl;
224   };
225 
226 
227   //  These are public, but unless the store is opened for appending, changes
228   //  will be lost.
229 public:
sqReadSeq_setAllClear(void)230   void        sqReadSeq_setAllClear(void) {
231     _clearBgn = 0;
232     _clearEnd = _seqLength;
233     _trimmed  = true;
234 
235     _ignoreT |= _ignoreU;   //  If the untrimmed is ignored, ignore the trimmed too.
236   };
237 
238   void        sqReadSeq_setClearRange(uint32 bgn, uint32 end, bool set=true) {
239     _clearBgn = bgn;
240     _clearEnd = end;
241     _trimmed  = set;
242 
243     _ignoreT |= _ignoreU;   //  If the untrimmed is ignored, ignore the trimmed too.
244   };
245 
246 private:
247   //  Only access from sqStore_setIgnored().
sqReadSeq_setIgnoreU(void)248   void        sqReadSeq_setIgnoreU(void) {
249     _ignoreU  = true;
250     _ignoreT |= true;   //  Also set ignoreT if untrimmed is ignored.
251   };
252 
sqReadSeq_setIgnoreT(void)253   void        sqReadSeq_setIgnoreT(void) {
254     _ignoreT = true;
255   };
256 
257 
258 private:
259   //  The data are logically out of order, so we can fit things into three 32-bit words.
260 
261   uint32  _seqValid  : 1;      //  The sequence for this record is present in the blob.
262   uint32  _unused1   : 1;      //  (unused)
263   uint32  _seqLength : 30;     //  The length of the sequence stored in the blob.
264 
265   uint32  _ignoreU   : 1;      //  Ignore both the untrimmed and trimmed versions.
266   uint32  _ignoreT   : 1;      //  Ignore only the trimmed version.
267   uint32  _clearBgn  : 30;     //
268 
269   uint32  _trimmed   : 1;      //  The trim points are valid.
270   uint32  _unused2   : 1;      //  (unused)
271   uint32  _clearEnd  : 30;     //
272 
273   //  Friends are only allowed to access the methods, not directly the data!
274 
275   friend class sqRead;
276   friend class sqStore;
277   friend class sqStoreInfo;
278   friend class sqReadDataWriter;
279 };
280 
281 
282 
283 //  On disk read metadata, in particular, the pointer to the blob data.
284 class sqReadMeta {
285 public:
286   void        sqReadMeta_initialize(uint32 readID    = 0,
287                                     uint32 libraryID = 0) {
288     _readID            = readID;      assert(_readID    == readID);
289     _libraryID         = libraryID;   assert(_libraryID == libraryID);
290     _assignment        = 0;
291     _assignmentScore   = 0;
292 
293     _unused            = 0;
294     _mSegm             = 0;
295     _mByte             = 0;
296   };
297 
sqRead_readID(void)298   uint32      sqRead_readID(void)           { return(_readID);          };
sqRead_libraryID(void)299   uint32      sqRead_libraryID(void)        { return(_libraryID);       };
300 
sqRead_assignment(void)301   uint32      sqRead_assignment(void)       { return(_assignment);      };
sqRead_assignmentScore(void)302   uint32      sqRead_assignmentScore(void)  { return(_assignmentScore); };
303 
sqRead_mSegm(void)304   uint64      sqRead_mSegm(void)            { return(_mSegm);           };
sqRead_mByte(void)305   uint64      sqRead_mByte(void)            { return(_mByte);           };
306 
sqRead_setPosition(uint64 mSegm,uint64 mByte)307   void        sqRead_setPosition(uint64  mSegm,
308                                  uint64  mByte) {
309     _mSegm = mSegm;   assert(_mSegm == mSegm);   //  Check that mSegm and mByte are in
310     _mByte = mByte;   assert(_mByte == mByte);   //  the range that can be stored.
311   };
312 
313 private:
314   uint64           _readID          : 30;   //    1 billion
315   uint64           _libraryID       : 12;   //    4 thousand
316   uint64           _assignment      : 15;   //   32 thousand
317   uint64           _assignmentScore : 7;    //  128
318 #if 30 + 12 + 15 + 7 != 64
319 #error sqReadMeta bits #1 are the wrong size.
320 #endif
321 
322   uint64           _unused          :  8;   //   -- FOR FUTURE USE
323   uint64           _mSegm           : 16;   //  64 thousand files       - Pointer to the blobs file we are in.
324   uint64           _mByte           : 40;   //    1 TB                  - Pointer to the blob data in the blobs file.
325 #if 8 + 16 + 40 != 64
326 #error sqReadMeta bits #2 are the wrong size.
327 #endif
328 };
329 
330 
331 
332 //  In core read representation.  Only instantiated as needed, and sequence data is only loaded
333 //  as requested.
334 //
335 //  This is really the old sqReadData structure.  It just got promoted.
336 //
337 class sqRead {
338 public:
~sqRead()339   ~sqRead() {
340     delete [] _metaA;
341     delete [] _rseqA;
342 
343     delete [] _blob;
344 
345     delete [] _name;
346     delete [] _rawBases;
347     delete [] _corBases;
348     delete [] _retBases;
349   };
350 
sqRead_readID(void)351   uint32      sqRead_readID(void)       { return(_meta->sqRead_readID());    };
sqRead_libraryID(void)352   uint32      sqRead_libraryID(void)    { return(_meta->sqRead_libraryID()); };
sqRead_library(void)353   sqLibrary  *sqRead_library(void)      { return(_library);                  };
sqRead_name(void)354   char       *sqRead_name(void)         { return(_name);                     };
355 
356   //  Like sqStore, return 0 for reads we shouldn't use.
357   uint32      sqRead_length  (sqRead_which w=sqRead_defaultVersion) {
358     sqReadSeq  *seq = sqRead_getSeq(w);
359 
360     if (w & sqRead_trimmed)
361       return(((seq == NULL) ||
362               (seq->sqReadSeq_trimmed() == false) ||
363               (seq->sqReadSeq_valid()   == false) ||
364               (seq->sqReadSeq_ignoreT() == true)) ? 0 : (seq->sqReadSeq_clearEnd() - seq->sqReadSeq_clearBgn()));
365 
366     else
367       return(((seq == NULL) ||
368               (seq->sqReadSeq_valid()   == false) ||
369               (seq->sqReadSeq_ignoreU() == true)) ? 0 : (seq->sqReadSeq_length()));
370   };
371 
372   uint32      sqRead_clearBgn(sqRead_which w=sqRead_defaultVersion) {
373     sqReadSeq  *seq = sqRead_getSeq(w);
374 
375     if (seq == NULL)
376       return(0);
377 
378     if (w & sqRead_trimmed)
379       return(seq->sqReadSeq_clearBgn());
380     else
381       return(0);
382   };
383 
384   uint32      sqRead_clearEnd(sqRead_which w=sqRead_defaultVersion) {
385     sqReadSeq  *seq = sqRead_getSeq(w);
386 
387     if (seq == NULL)
388       return(0);
389 
390     if (w & sqRead_trimmed)
391       return(seq->sqReadSeq_clearBgn());
392     else
393       return(seq->sqReadSeq_length());
394   };
395 
396   char       *sqRead_sequence(sqRead_which w=sqRead_defaultVersion) {
397     bool  comp = ((w & sqRead_compressed) == sqRead_unset) ? false : true;
398     bool  trim = ((w & sqRead_trimmed)    == sqRead_unset) ? false : true;
399 
400     //  Figure out which bases to return, either raw or corrected, and the
401     //  length of that sequence.  'bases' must be a valid pointer (we're
402     //  either loading raw or corrected bases, but the string can be length
403     //  zero.  In all cases, though, it must be NUL terminated.
404 
405     sqReadSeq   *seq      = sqRead_getSeq(w);
406     char        *bases    = NULL;
407     uint32       basesLen = 0;
408 
409     if (w & sqRead_raw) {
410       bases    = _rawBases;
411       basesLen = _rawU->sqReadSeq_length();
412     }
413 
414     if (w & sqRead_corrected) {
415       bases    = _corBases;
416       basesLen = _corU->sqReadSeq_length();
417     }
418 
419     assert(bases           != NULL);
420     assert(bases[basesLen] == 0);
421 
422     //  If neither compressed or trimmed, just return the sequence we already have.
423     //
424     if ((comp == false) &&
425         (trim == false)) {
426       return(bases);
427     }
428 
429     //  If not compressed but trimmed, copy the trimmed bases into
430     //  temporary space and return that.
431     //
432     if ((comp == false) &&
433         (trim == true)) {
434       uint32  bgn = seq->sqReadSeq_clearBgn();   //  Only valid if trimmed, do not make global!
435       uint32  end = seq->sqReadSeq_clearEnd();
436 
437       resizeArray(_retBases, 0, _retBasesAlloc, end - bgn + 1, _raAct::doNothing);
438 
439       memmove(_retBases, bases + bgn, end - bgn);
440       _retBases[end-bgn] = 0;
441 
442       return(_retBases);
443     }
444 
445     //  Otherwise, we need to homopolymer compress the sequence.  It'll be no
446     //  longer than the uncompressed sequence, so we can allocate that much,
447     //  instead of tracking down the actual length.
448 
449     resizeArray(_retBases, 0, _retBasesAlloc, basesLen + 1, _raAct::doNothing);
450 
451     homopolyCompress(bases, basesLen, _retBases);
452 
453     //  Trim the read if needed.
454     //
455     if (trim == true) {
456       uint32  bgn = seq->sqReadSeq_clearBgn();   //  Only valid if trimmed, do not make global!
457       uint32  end = seq->sqReadSeq_clearEnd();
458 
459       memmove(_retBases, _retBases + bgn, end - bgn);
460       _retBases[end-bgn] = 0;
461     }
462 
463     //  But either way, we return the same thing.
464     return(_retBases);
465   };
466 
467 private:
468   void        sqRead_fetchBlob(readBuffer *B);
469   void        sqRead_decodeBlob(void);
470 
471 private:
sqRead_getSeq(sqRead_which w)472   sqReadSeq  *sqRead_getSeq(sqRead_which w) {
473 
474     if (w == sqRead_unset)         //  If w is somehow set to the unset state,
475       w = sqRead_defaultVersion;   //  go back to using the global default.
476 
477     bool  isRaw = ((w & sqRead_raw)        == sqRead_raw);         //  Return the sqReadSeq object
478     bool  isCor = ((w & sqRead_corrected)  == sqRead_corrected);   //  appropriate for the supplied
479     bool  isTrm = ((w & sqRead_trimmed)    == sqRead_trimmed);     //  sqRead_which.
480     bool  isCmp = ((w & sqRead_compressed) == sqRead_compressed);
481 
482     if ((isRaw == true) && (isCmp == false))   { return(_rawU); };
483     if ((isRaw == true) && (isCmp == true))    { return(_rawC); };
484 
485     if ((isCor == true) && (isCmp == false))   { return(_corU); };
486     if ((isCor == true) && (isCmp == true))    { return(_corC); };
487 
488     fprintf(stderr, "sqRead_getSeq()-- Unknown which '%s'\n", toString(w));
489 
490     assert(0);
491     return(NULL);
492   };
493 
494 private:
495   sqReadMeta   *_meta = nullptr;                 //  Pointers to arrays owned by sqStore.
496   sqReadSeq    *_rawU = nullptr;
497   sqReadSeq    *_rawC = nullptr;
498   sqReadSeq    *_corU = nullptr;
499   sqReadSeq    *_corC = nullptr;
500 
501   sqReadMeta   *_metaA = nullptr;                //  When loading without a store, these
502   sqReadSeq    *_rseqA = nullptr;                //  hold the data for the above pointers.
503 
504   sqLibrary    *_library = nullptr;
505 
506   bool          _blobLoaded = false;
507 
508   char          _blobName[4] = {0};
509   uint32        _blobLen     =  0;
510   uint32        _blobMax     =  0;
511   uint8        *_blob        =  nullptr;
512 
513   uint32        _nameAlloc = 0;
514   char         *_name      = nullptr;
515 
516   uint32        _rawBasesAlloc = 0;              //  The raw sequence, as loaded from disk.
517   char         *_rawBases      = nullptr;
518 
519   uint32        _corBasesAlloc = 0;              //  The corrected sequence, as loaded from disk.
520   char         *_corBases      = nullptr;
521 
522   sqRead_which  _retFlags      = sqRead_unset;   //  Remember what the returned sequence is.
523   uint32        _retBasesAlloc = 0;              //  Scratch space for computing trimmed and
524   char         *_retBases      = nullptr;        //  compressed sequences to return to the user.
525 
526   friend class sqStore;
527   friend class sqCache;
528   friend class sqReadDataWriter;
529 };
530 
531 
532 
533 
534 class sqReadDataWriter {
535 public:
536   sqReadDataWriter(sqReadMeta *meta=NULL,
537                    sqReadSeq  *rawu=NULL,
538                    sqReadSeq  *rawc=NULL,
539                    sqReadSeq  *coru=NULL,
540                    sqReadSeq  *corc=NULL) {
541     _meta          = meta;
542     _rawU          = rawu;
543     _rawC          = rawc;
544     _corU          = coru;
545     _corC          = corc;
546 
547     _charMap['A'] = _charMap['a'] = 'A';
548     _charMap['C'] = _charMap['c'] = 'C';
549     _charMap['G'] = _charMap['g'] = 'G';
550     _charMap['T'] = _charMap['t'] = 'T';
551     _charMap['U'] = _charMap['u'] = 'T';   //  Map U to T.
552     _charMap['N'] = _charMap['n'] = 'N';
553   };
554 
~sqReadDataWriter()555   ~sqReadDataWriter() {
556     delete [] _name;
557     delete [] _rawBases;
558     delete [] _corBases;
559   };
560 
561 public:
562   void        sqReadDataWriter_importData(sqRead *read);
563 
564   void        sqReadDataWriter_setName(const char *N);
565   void        sqReadDataWriter_setRawBases(const char *S, uint32 Slen);
566   void        sqReadDataWriter_setCorrectedBases(const char *S, uint32 Slen);
567 
568   void        sqReadDataWriter_writeBlob(writeBuffer *buffer);
569 
sqReadDataWriter_getRawLength(bool compressed)570   uint32      sqReadDataWriter_getRawLength(bool compressed) {
571     return((compressed == false) ? _rawU->sqReadSeq_length() : _rawC->sqReadSeq_length());
572   };
573 
sqReadDataWriter_getCorrectedLength(bool compressed)574   uint32      sqReadDataWriter_getCorrectedLength(bool compressed) {
575     return((compressed == false) ? _corU->sqReadSeq_length() : _corC->sqReadSeq_length());
576   };
577 
578 private:
579   sqReadMeta  *_meta = nullptr;           //  Pointer to the read metadata.
580   sqReadSeq   *_rawU = nullptr;           //    In sqStoreCreate, these pointers are
581   sqReadSeq   *_rawC = nullptr;           //    back to the data in sqStore itself.
582   sqReadSeq   *_corU = nullptr;
583   sqReadSeq   *_corC = nullptr;
584 
585   uint32       _nameAlloc     = 0;
586   uint32       _nameLen       = 0;
587   char        *_name          = nullptr;
588 
589   uint32       _rawBasesAlloc = 0;        //  The raw sequence.  The length is the
590   uint32       _rawBasesLen   = 0;        //  length of the array, not of the string.
591   char        *_rawBases      = nullptr;
592 
593   uint32       _corBasesAlloc = 0;        //  The corrected sequence.  The length is the
594   uint32       _corBasesLen   = 0;        //  length of the array, not of the string.
595   char        *_corBases      = nullptr;
596 
597   char         _charMap[256]  = { 0 };
598 
599   friend class sqStore;
600   friend class sqStoreBlobWriter;
601 };
602 
603 
604 
605 #endif  //  SQREAD_H
606