1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #ifndef SQSTORE_H
19 #define SQSTORE_H
20 
21 #include "runtime.H"
22 #include "files.H"
23 
24 //  Versions:
25 //
26 //   4 - first to be called seqStore.               18 APR 2018.  01d182304a951846677a641197bb761674a5e662
27 //   5 - tracking of the last blob used.            01 MAY 2018.  1e1ca668fbe16763ef22d7d69a04413cc2b95fb1
28 //   6 - unlimited blobs, segs and parts.           04 MAY 2018.  5654d275dfe096465eedd36d0dc316a6c79c59ec
29 //   7 - skipped, used during development.
30 //   8 - allow clear/compr for any sequence type.   05 JUL 2019.  40763d8b21bfc7f65530c8d768abffbc64e777c3
31 //   9 - add, again, numBlobs.                      18 OCT 2019.  (unknown)
32 
33 #define SQ_MAGIC8  0x504b473a756e6162lu      //  banu:GKP (for versions before 9)
34 #define SQ_MAGIC   0x5145533a756e6163lu      //  canu:SEQ
35 #define SQ_VERSION 0x0000000000000009lu
36 
37 
38 //  The number of library IIDs we can handle.
39 //
40 #define AS_MAX_LIBRARIES_BITS      6
41 #define AS_MAX_LIBRARIES           (((uint32)1 << AS_MAX_LIBRARIES_BITS) - 1)
42 
43 #define LIBRARY_NAME_SIZE          128
44 
45 //  Maximum length of reads.
46 //
47 //  If 16, an overlap is only 20 bytes.  (5x 32 bit words)
48 //  If 17-21, an overlap is 24 bytes.    (3x 64 bit words)
49 //  If 22-32, an overlap is 32 bytes.    (4x 64 bit words)
50 //
51 //  if 26, bogart has issues with storing the error rate
52 //  If 28, alignment/alignment-drivers.C won't link
53 //  If 29, alignment/alignment-drivers.C won't link
54 //  If 30, alignment/alignment-drivers.C won't link
55 //  If 31, alignment/alignment-drivers.C won't compile, len+len+2 == 0
56 //  If 32, it won't compile because of shifting (uint32)1 << 32 == 0.
57 //
58 #define AS_MAX_READLEN_BITS        21
59 #define AS_MAX_READLEN             (((uint32)1 << AS_MAX_READLEN_BITS) - 1)
60 
61 //  The number of read IDs we can handle.  Longer reads implies fewer reads.
62 //    readLen 32 + numLibs 6 -> numReads 26 ( 64  million)
63 //    readLen 30 + numLibs 6 -> numReads 28 (256  million)
64 //    readLen 28 + numLibs 6 -> numReads 30 (1024 million)
65 //    readLen 26 + numLibs 6 -> numReads 32 (4096 million)  //  limited elsewhere!
66 //    readLen 24 + numLibs 6 -> numReads 34 (4096 million)  //  limited elsewhere!
67 //    readLen 22 + numLibs 6 -> numReads 36 (4096 million)  //  limited elsewhere!
68 //    readLen 21 + numLibs 6 -> numReads 37 (4096 million)  //  limited elsewhere!
69 //    readLen 20 + numLibs 6 -> numReads 38 (4096 million)  //  limited elsewhere!
70 //
71 #define AS_MAX_READS_BITS          64 - AS_MAX_READLEN_BITS - AS_MAX_LIBRARIES_BITS
72 #define AS_MAX_READS               (((uint64)1 << AS_MAX_READS_BITS) - 1)
73 
74 
75 #include "sqLibrary.H"
76 #include "sqRead.H"
77 
78 
79 //  The default behavior is to open the store for read only, and to load
80 //  all the metadata into memory.
81 
82 typedef enum {
83   sqStore_create      = 0x00,  //  Open for creating, will fail if files exist already
84   sqStore_extend      = 0x01,  //  Open for modification and appending new reads/libraries
85   sqStore_readOnly    = 0x02,  //  Open read only
86 } sqStore_mode;
87 
88 
89 static
90 const
91 char *
toString(sqStore_mode m)92 toString(sqStore_mode m) {
93   switch (m) {
94     case sqStore_create:       return("sqStore_create");       break;
95     case sqStore_extend:       return("sqStore_extend");       break;
96     case sqStore_readOnly:     return("sqStore_readOnly");     break;
97   }
98 
99   return("undefined-mode");
100 }
101 
102 
103 static
104 void
makeBlobName(char const * storePath,uint32 blobNumber,char * blobName)105 makeBlobName(char const *storePath, uint32 blobNumber, char *blobName) {
106   snprintf(blobName, FILENAME_MAX, "%s/blobs.%04" F_U32P , storePath, blobNumber);
107 }
108 
109 
110 class sqStoreInfo {
111 public:
112   sqStoreInfo();
113   ~sqStoreInfo();
114 
115 private:
116   bool      readInfo8(char const *metaPath);
117 public:
118   void      readInfo(char const *metaPath);
119   void      writeInfo(char const *metaPath);
120 
121   void      writeInfoAsText(FILE *F);
122 
sqInfo_lastLibraryID(void)123   uint32    sqInfo_lastLibraryID(void)          { return(_numLibraries); };
sqInfo_lastReadID(void)124   uint32    sqInfo_lastReadID(void)             { return(_numReads);     };
125 
126   //  Well, shoot.  The on-disk metadata is storing _reads as a 64-bit int
127   //  for some reason, although the store cannot handle more than 4-billion
128   //  reads.  I can't change the 64-bit int to a 32-bit int without breaking
129   //  combatibility, and am not sure what havoc would result if I return a
130   //  64-bit int here (probably none, but it's be a glaring inconsistency all
131   //  over the place, instead of just here).
132   //
sqInfo_numReads(sqRead_which w)133   uint32    sqInfo_numReads(sqRead_which w)     { return((uint32)_reads[w]); };
sqInfo_numBases(sqRead_which w)134   uint64    sqInfo_numBases(sqRead_which w)     { return(        _bases[w]); };
135 
136 public:
137   bool      examineRead(uint32 ii, sqReadSeq *seq, sqRead_which w);
138 
139 private:
140   bool      checkInfo(void);
141   void      update(sqReadSeq  *rawU,
142                    sqReadSeq  *rawC,
143                    sqReadSeq  *corU,
144                    sqReadSeq  *corC);
145 
146 public:
147   //  For info in the store, these control the number of objects
148   //  that are allocated in the metadata arrays.
149   //
150   //  For info in dumpMetaData, these are used to count the number
151   //  of objects seen in the scan.
152   //
sqInfo_addLibrary(void)153   void      sqInfo_addLibrary(void)             { _numLibraries++;       };
sqInfo_addRead(void)154   void      sqInfo_addRead(void)                { _numReads++;           };
155 
156 private:
157   uint64    _sqMagic;
158   uint64    _sqVersion;
159 
160   uint32    _sqLibrarySize;      //  Sanity checks that this code can load the data properly.
161   uint32    _sqReadSize;
162   uint32    _sqMaxLibrariesBits;
163   uint32    _sqLibraryNameSize;
164   uint32    _sqMaxReadBits;
165   uint32    _sqMaxReadLenBits;
166 
167   uint32    _numLibraries;       //  Counts of types of things we have loaded (next
168   uint32    _numReads;           //    available index into _libraries and _reads in sqStore)
169   uint32    _numBlobs;
170 
171   uint64    _reads[sqRead_largest];
172   uint64    _bases[sqRead_largest];
173 
174   friend class sqStore;
175   friend class sqStoreBlobWriter;
176 };
177 
178 
179 
180 
181 
182 class sqStoreBlobWriter {
183 public:
184   sqStoreBlobWriter(const char *storePath, sqStoreInfo *info);
185   ~sqStoreBlobWriter();
186 
187   void           writeData(sqReadDataWriter *readData);
188 
189 private:
190   char          _storePath[FILENAME_MAX+1];        //  Path to the seqStore.
191   char          _blobName[FILENAME_MAX+1];         //  A temporary to make life easier.
192 
193   sqStoreInfo  *_info;
194   writeBuffer  *_buffer;
195 };
196 
197 
198 
199 //  Manages access to blob data.  You need one of these per thread.
200 //
201 class sqStoreBlobReader {
202 public:
203   sqStoreBlobReader(const char *storePath);
204   ~sqStoreBlobReader();
205 
206   readBuffer    *getBuffer(sqReadMeta *meta);
getBuffer(sqReadMeta & meta)207   readBuffer    *getBuffer(sqReadMeta &meta)   { return(getBuffer(&meta)); };
208 
209 private:
210   char          _storePath[FILENAME_MAX+1];        //  Path to the seqStore.
211   char          _blobName[FILENAME_MAX+1];         //  A temporary to make life easier.
212 
213   uint32        _buffersMax;
214   readBuffer  **_buffers;   //  One per blob file.
215 };
216 
217 
218 
219 
220 
221 class sqStore {
222 public:
223   sqStore(char const *storePath, sqStore_mode mode=sqStore_readOnly, uint32 version=0);
224   ~sqStore();
225 private:
226   void         sqStore_loadMetadata(void);
227 
228 public:
sqStore_path(void)229   const char  *sqStore_path(void) { return(_storePath); };  //  Returns the path to the store
230 
231   static
232   uint32       sqStore_lastVersion(char const *storePath);
233 
234   static
235   void         sqStore_revertVersion(char const *storePath, uint32 version);
236 
sqStore_lastLibraryID(void)237   uint32       sqStore_lastLibraryID(void)            { return(_info.sqInfo_lastLibraryID()); };
sqStore_lastReadID(void)238   uint32       sqStore_lastReadID(void)               { return(_info.sqInfo_lastReadID());    };
239 
240   //  If iterating over reads, use sqStore_lastReadID(), not sqStore_getNumReads().
sqStore_getNumReads(sqRead_which w)241   uint32       sqStore_getNumReads(sqRead_which w)    { return(_info.sqInfo_numReads(w));     };
242 
sqStore_getLibrary(uint32 id)243   sqLibrary   *sqStore_getLibrary(uint32 id)          { return(&_libraries[id]); };
244 
sqStore_getLibraryIDForRead(uint32 id)245   uint32       sqStore_getLibraryIDForRead(uint32 id) { return(_meta[id].sqRead_libraryID()); };
sqStore_getLibraryForRead(uint32 id)246   sqLibrary   *sqStore_getLibraryForRead(uint32 id)   { return(&_libraries[_meta[id].sqRead_libraryID()]); };
247 
248 public:
249   readBuffer  *sqStore_getReadBuffer(uint32 readID);
250   sqRead      *sqStore_getRead(uint32 readID, sqRead *read);
251 
252 public:
253   static
254   bool         sqStore_loadReadFromBuffer(readBuffer *B, sqRead *read);
255   void         sqStore_saveReadToBuffer(writeBuffer *B, uint32 id, sqRead *rd, sqReadDataWriter *wr);
256 
257 private:
sqStore_getSeq(sqRead_which w)258   sqReadSeq   *sqStore_getSeq(sqRead_which w) {
259 
260     if (w == sqRead_unset)         //  If w is somehow set to the unset state,
261       w = sqRead_defaultVersion;   //  go back to using the global default.
262 
263     bool  isRaw = ((w & sqRead_raw)        == sqRead_raw);
264     bool  isCor = ((w & sqRead_corrected)  == sqRead_corrected);
265     bool  isTrm = ((w & sqRead_trimmed)    == sqRead_trimmed);
266     bool  isCmp = ((w & sqRead_compressed) == sqRead_compressed);
267 
268     if ((isRaw == true) && (isCmp == false))          { return(_rawU); };
269     if ((isRaw == true) && (isCmp == true))           { return(_rawC); };
270 
271     if ((isCor == true) && (isCmp == false))          { return(_corU); };
272     if ((isCor == true) && (isCmp == true))           { return(_corC); };
273 
274     fprintf(stderr, "sqStore_getSeq()-- Unknown which '%s'\n", toString(w));
275 
276     assert(0);
277     return(NULL);
278   };
279 
280   //  Accessors to read data.
281 public:
282   uint64             sqStore_getReadSegm(uint32 id);
283   uint64             sqStore_getReadByte(uint32 id);
284 
285   sqReadSeq         *sqStore_getReadSeq(uint32 id, sqRead_which w=sqRead_defaultVersion);
286 
287   //  Read length is zero if the read isn't present, isn't valid, is ignored,
288   //  or is trimmed out.  Ideally, those cases should generate an error -- we
289   //  should be testing for those cases before deciding to use a read -- but
290   //  too many places are expecting length==0 in thse cases.
291   //
292   uint32             sqStore_getReadLength(uint32 id, sqRead_which w=sqRead_defaultVersion);
293 
294   //  Unlike read length, we'll insist that accessing clear ranges only occur
295   //  for trimmed reads (enforced by sqReadSeq).
296   //
297   uint32             sqStore_getClearBgn  (uint32 id, sqRead_which w=sqRead_defaultVersion);
298   uint32             sqStore_getClearEnd  (uint32 id, sqRead_which w=sqRead_defaultVersion);
299 
300   bool               sqStore_isValidRead(uint32 id, sqRead_which w=sqRead_defaultVersion);
301   bool               sqStore_isIgnoredRead(uint32 id, sqRead_which w=sqRead_defaultVersion);
302   bool               sqStore_isTrimmedRead(uint32 id, sqRead_which w=sqRead_defaultVersion);
303 
304   //  For use ONLY by sqStoreCreate, to add new libraries and reads to a
305   //  store.
306   //
307   //    addEmptyLibrary() adds a new library to the store.
308   //
309   //    createEmptyRead() allocates a sRDW, but does not add a new read to
310   //    the store.  It must be followed by addRead() to load the data to the
311   //    store.  Of note, the sRDW is assigned a read ID for the next -
312   //    currently non-existent - read in the store.  Calling
313   //    createEmptyRead() twice with no addRead() between will create two
314   //    reads with the same ID and Bad Things will result.
315   //
316 public:
317   sqLibrary         *sqStore_addEmptyLibrary(char const *name, sqLibrary_tech techType);
318 
319   sqReadDataWriter  *sqStore_createEmptyRead(sqLibrary *lib, const char *name);
320   void               sqStore_addRead(sqReadDataWriter *rdw);
321 
322   //  Used when initially loading reads into seqStore, and when loading
323   //  trimmed reads.  It sets the ignore flag in both the normal and
324   //  compressed metadata for a given read.  Select 'raw' or 'corrected' with
325   //  aqRead_which, then select untrimmed or trimmed with the flags.
326 public:
327   void               sqStore_setIgnored(uint32       id,
328                                         bool         untrimmed,
329                                         bool         trimmed,
330                                         sqRead_which w=sqRead_defaultVersion);
331 
332   //  This loads a read, computes the compressed sequence, and sets both the
333   //  normal and compressed clear ranges.
334 public:
335   void               sqStore_setClearRange(uint32 id,
336                                            uint32 bgn, uint32 end, bool bogus,
337                                            sqRead_which w=sqRead_defaultVersion);
338 
339 private:
340   sqStoreInfo          _info;  //  All the stuff stored on disk.
341 
342   char                 _storePath[FILENAME_MAX+1];    //  Needed to create files
343   char                 _metaPath[FILENAME_MAX+1];
344 
345   sqStore_mode         _mode;  //  What mode this store is opened as, sanity checking
346   uint32               _version;
347 
348   uint32               _librariesAlloc;  //  Size of allocation
349   sqLibrary           *_libraries;       //  In core data
350 
351   uint32               _readsAlloc;      //  Size of allocation
352   sqReadMeta          *_meta;            //  In core data
353   sqReadSeq           *_rawU;            //  Metadata for raw sequence
354   sqReadSeq           *_rawC;            //  Metadata for raw compressed sequence
355   sqReadSeq           *_corU;
356   sqReadSeq           *_corC;
357 
358   sqStoreBlobReader   *_blobReader;
359   sqStoreBlobWriter   *_blobWriter;
360 };
361 
362 
363 
364 
365 //  The rest are just accessors to reads.
366 
367 
368 inline
369 uint64
sqStore_getReadSegm(uint32 id)370 sqStore::sqStore_getReadSegm(uint32 id) {
371   return(_meta[id].sqRead_mSegm());
372 }
373 
374 
375 inline
376 uint64
sqStore_getReadByte(uint32 id)377 sqStore::sqStore_getReadByte(uint32 id) {
378   return(_meta[id].sqRead_mByte());
379 }
380 
381 
382 inline
383 sqReadSeq *
sqStore_getReadSeq(uint32 id,sqRead_which w)384 sqStore::sqStore_getReadSeq(uint32 id, sqRead_which w) {
385   sqReadSeq  *seq = sqStore_getSeq(w);
386 
387   assert(id > 0);
388 
389   return((seq == NULL) ? NULL : &seq[id]);
390 }
391 
392 
393 //  Read length is zero if the read isn't present, isn't valid, is ignored,
394 //  or is trimmed out.  Ideally, those cases should generate an error -- we
395 //  should be testing for those cases before deciding to use a read -- but
396 //  too many places are expecting length==0 in thse cases.
397 //
398 inline
399 uint32
sqStore_getReadLength(uint32 id,sqRead_which w)400 sqStore::sqStore_getReadLength(uint32 id, sqRead_which w) {
401   sqReadSeq  *seq = sqStore_getSeq(w);
402 
403   assert(id > 0);
404 
405   if (w & sqRead_trimmed)
406     return(((seq == NULL) ||
407             (seq[id].sqReadSeq_trimmed() == false) ||
408             (seq[id].sqReadSeq_valid()   == false) ||
409             (seq[id].sqReadSeq_ignoreT() == true)) ? 0 : (seq[id].sqReadSeq_clearEnd() - seq[id].sqReadSeq_clearBgn()));
410 
411   else
412     return(((seq == NULL) ||
413             (seq[id].sqReadSeq_valid()   == false) ||
414             (seq[id].sqReadSeq_ignoreU() == true)) ? 0 : (seq[id].sqReadSeq_length()));
415 }
416 
417 
418 //  Unlike read length, we'll insist that accessing clear ranges only occur
419 //  for trimmed reads (enforced by sqReadSeq).
420 //
421 inline
422 uint32
sqStore_getClearBgn(uint32 id,sqRead_which w)423 sqStore::sqStore_getClearBgn  (uint32 id, sqRead_which w) {
424   sqReadSeq  *seq = sqStore_getSeq(w);
425 
426   assert(id > 0);
427 
428   if (seq == NULL)
429     return(0);
430 
431   if (w & sqRead_trimmed)
432     return(seq[id].sqReadSeq_clearBgn());
433   else
434     return(0);
435 }
436 
437 
438 inline
439 uint32
sqStore_getClearEnd(uint32 id,sqRead_which w)440 sqStore::sqStore_getClearEnd  (uint32 id, sqRead_which w) {
441   sqReadSeq  *seq = sqStore_getSeq(w);
442 
443   assert(id > 0);
444 
445   if (seq == NULL)
446     return(0);
447 
448   if (w & sqRead_trimmed)
449     return(seq[id].sqReadSeq_clearEnd());
450   else
451     return(seq[id].sqReadSeq_length());
452 }
453 
454 
455 inline
456 bool
sqStore_isValidRead(uint32 id,sqRead_which w)457 sqStore::sqStore_isValidRead(uint32 id, sqRead_which w) {
458   sqReadSeq  *seq = sqStore_getSeq(w);
459 
460   assert(id > 0);
461 
462   if (seq == NULL)
463     return(false);
464 
465   if (w & sqRead_trimmed)
466     return(seq[id].sqReadSeq_valid() && seq[id].sqReadSeq_trimmed());
467   else
468     return(seq[id].sqReadSeq_valid());
469 }
470 
471 
472 inline
473 bool
sqStore_isIgnoredRead(uint32 id,sqRead_which w)474 sqStore::sqStore_isIgnoredRead(uint32 id, sqRead_which w) {
475   sqReadSeq  *seq = sqStore_getSeq(w);
476 
477   assert(id > 0);
478 
479   if (seq == NULL)
480     return(false);
481 
482   if (sqStore_isValidRead(id, w) == false)
483     return(true);
484 
485   if (w & sqRead_trimmed)
486     return(seq[id].sqReadSeq_ignoreT());
487   else
488     return(seq[id].sqReadSeq_ignoreU());
489 }
490 
491 
492 inline
493 bool
sqStore_isTrimmedRead(uint32 id,sqRead_which w)494 sqStore::sqStore_isTrimmedRead(uint32 id, sqRead_which w) {
495   sqReadSeq  *seq = sqStore_getSeq(w);
496 
497   assert(id > 0);
498 
499   return((seq == NULL) ? false : seq[id].sqReadSeq_trimmed());
500 }
501 
502 
503 
504 
505 
506 
507 
508 #endif  //  SQSTORE_H
509