1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #ifndef SQSTORE_H
19 #define SQSTORE_H
20
21 #include "runtime.H"
22 #include "files.H"
23
24 // Versions:
25 //
26 // 4 - first to be called seqStore. 18 APR 2018. 01d182304a951846677a641197bb761674a5e662
27 // 5 - tracking of the last blob used. 01 MAY 2018. 1e1ca668fbe16763ef22d7d69a04413cc2b95fb1
28 // 6 - unlimited blobs, segs and parts. 04 MAY 2018. 5654d275dfe096465eedd36d0dc316a6c79c59ec
29 // 7 - skipped, used during development.
30 // 8 - allow clear/compr for any sequence type. 05 JUL 2019. 40763d8b21bfc7f65530c8d768abffbc64e777c3
31 // 9 - add, again, numBlobs. 18 OCT 2019. (unknown)
32
33 #define SQ_MAGIC8 0x504b473a756e6162lu // banu:GKP (for versions before 9)
34 #define SQ_MAGIC 0x5145533a756e6163lu // canu:SEQ
35 #define SQ_VERSION 0x0000000000000009lu
36
37
38 // The number of library IIDs we can handle.
39 //
40 #define AS_MAX_LIBRARIES_BITS 6
41 #define AS_MAX_LIBRARIES (((uint32)1 << AS_MAX_LIBRARIES_BITS) - 1)
42
43 #define LIBRARY_NAME_SIZE 128
44
45 // Maximum length of reads.
46 //
47 // If 16, an overlap is only 20 bytes. (5x 32 bit words)
48 // If 17-21, an overlap is 24 bytes. (3x 64 bit words)
49 // If 22-32, an overlap is 32 bytes. (4x 64 bit words)
50 //
51 // if 26, bogart has issues with storing the error rate
52 // If 28, alignment/alignment-drivers.C won't link
53 // If 29, alignment/alignment-drivers.C won't link
54 // If 30, alignment/alignment-drivers.C won't link
55 // If 31, alignment/alignment-drivers.C won't compile, len+len+2 == 0
56 // If 32, it won't compile because of shifting (uint32)1 << 32 == 0.
57 //
58 #define AS_MAX_READLEN_BITS 21
59 #define AS_MAX_READLEN (((uint32)1 << AS_MAX_READLEN_BITS) - 1)
60
61 // The number of read IDs we can handle. Longer reads implies fewer reads.
62 // readLen 32 + numLibs 6 -> numReads 26 ( 64 million)
63 // readLen 30 + numLibs 6 -> numReads 28 (256 million)
64 // readLen 28 + numLibs 6 -> numReads 30 (1024 million)
65 // readLen 26 + numLibs 6 -> numReads 32 (4096 million) // limited elsewhere!
66 // readLen 24 + numLibs 6 -> numReads 34 (4096 million) // limited elsewhere!
67 // readLen 22 + numLibs 6 -> numReads 36 (4096 million) // limited elsewhere!
68 // readLen 21 + numLibs 6 -> numReads 37 (4096 million) // limited elsewhere!
69 // readLen 20 + numLibs 6 -> numReads 38 (4096 million) // limited elsewhere!
70 //
71 #define AS_MAX_READS_BITS 64 - AS_MAX_READLEN_BITS - AS_MAX_LIBRARIES_BITS
72 #define AS_MAX_READS (((uint64)1 << AS_MAX_READS_BITS) - 1)
73
74
75 #include "sqLibrary.H"
76 #include "sqRead.H"
77
78
79 // The default behavior is to open the store for read only, and to load
80 // all the metadata into memory.
81
82 typedef enum {
83 sqStore_create = 0x00, // Open for creating, will fail if files exist already
84 sqStore_extend = 0x01, // Open for modification and appending new reads/libraries
85 sqStore_readOnly = 0x02, // Open read only
86 } sqStore_mode;
87
88
89 static
90 const
91 char *
toString(sqStore_mode m)92 toString(sqStore_mode m) {
93 switch (m) {
94 case sqStore_create: return("sqStore_create"); break;
95 case sqStore_extend: return("sqStore_extend"); break;
96 case sqStore_readOnly: return("sqStore_readOnly"); break;
97 }
98
99 return("undefined-mode");
100 }
101
102
103 static
104 void
makeBlobName(char const * storePath,uint32 blobNumber,char * blobName)105 makeBlobName(char const *storePath, uint32 blobNumber, char *blobName) {
106 snprintf(blobName, FILENAME_MAX, "%s/blobs.%04" F_U32P , storePath, blobNumber);
107 }
108
109
110 class sqStoreInfo {
111 public:
112 sqStoreInfo();
113 ~sqStoreInfo();
114
115 private:
116 bool readInfo8(char const *metaPath);
117 public:
118 void readInfo(char const *metaPath);
119 void writeInfo(char const *metaPath);
120
121 void writeInfoAsText(FILE *F);
122
sqInfo_lastLibraryID(void)123 uint32 sqInfo_lastLibraryID(void) { return(_numLibraries); };
sqInfo_lastReadID(void)124 uint32 sqInfo_lastReadID(void) { return(_numReads); };
125
126 // Well, shoot. The on-disk metadata is storing _reads as a 64-bit int
127 // for some reason, although the store cannot handle more than 4-billion
128 // reads. I can't change the 64-bit int to a 32-bit int without breaking
129 // combatibility, and am not sure what havoc would result if I return a
130 // 64-bit int here (probably none, but it's be a glaring inconsistency all
131 // over the place, instead of just here).
132 //
sqInfo_numReads(sqRead_which w)133 uint32 sqInfo_numReads(sqRead_which w) { return((uint32)_reads[w]); };
sqInfo_numBases(sqRead_which w)134 uint64 sqInfo_numBases(sqRead_which w) { return( _bases[w]); };
135
136 public:
137 bool examineRead(uint32 ii, sqReadSeq *seq, sqRead_which w);
138
139 private:
140 bool checkInfo(void);
141 void update(sqReadSeq *rawU,
142 sqReadSeq *rawC,
143 sqReadSeq *corU,
144 sqReadSeq *corC);
145
146 public:
147 // For info in the store, these control the number of objects
148 // that are allocated in the metadata arrays.
149 //
150 // For info in dumpMetaData, these are used to count the number
151 // of objects seen in the scan.
152 //
sqInfo_addLibrary(void)153 void sqInfo_addLibrary(void) { _numLibraries++; };
sqInfo_addRead(void)154 void sqInfo_addRead(void) { _numReads++; };
155
156 private:
157 uint64 _sqMagic;
158 uint64 _sqVersion;
159
160 uint32 _sqLibrarySize; // Sanity checks that this code can load the data properly.
161 uint32 _sqReadSize;
162 uint32 _sqMaxLibrariesBits;
163 uint32 _sqLibraryNameSize;
164 uint32 _sqMaxReadBits;
165 uint32 _sqMaxReadLenBits;
166
167 uint32 _numLibraries; // Counts of types of things we have loaded (next
168 uint32 _numReads; // available index into _libraries and _reads in sqStore)
169 uint32 _numBlobs;
170
171 uint64 _reads[sqRead_largest];
172 uint64 _bases[sqRead_largest];
173
174 friend class sqStore;
175 friend class sqStoreBlobWriter;
176 };
177
178
179
180
181
182 class sqStoreBlobWriter {
183 public:
184 sqStoreBlobWriter(const char *storePath, sqStoreInfo *info);
185 ~sqStoreBlobWriter();
186
187 void writeData(sqReadDataWriter *readData);
188
189 private:
190 char _storePath[FILENAME_MAX+1]; // Path to the seqStore.
191 char _blobName[FILENAME_MAX+1]; // A temporary to make life easier.
192
193 sqStoreInfo *_info;
194 writeBuffer *_buffer;
195 };
196
197
198
199 // Manages access to blob data. You need one of these per thread.
200 //
201 class sqStoreBlobReader {
202 public:
203 sqStoreBlobReader(const char *storePath);
204 ~sqStoreBlobReader();
205
206 readBuffer *getBuffer(sqReadMeta *meta);
getBuffer(sqReadMeta & meta)207 readBuffer *getBuffer(sqReadMeta &meta) { return(getBuffer(&meta)); };
208
209 private:
210 char _storePath[FILENAME_MAX+1]; // Path to the seqStore.
211 char _blobName[FILENAME_MAX+1]; // A temporary to make life easier.
212
213 uint32 _buffersMax;
214 readBuffer **_buffers; // One per blob file.
215 };
216
217
218
219
220
221 class sqStore {
222 public:
223 sqStore(char const *storePath, sqStore_mode mode=sqStore_readOnly, uint32 version=0);
224 ~sqStore();
225 private:
226 void sqStore_loadMetadata(void);
227
228 public:
sqStore_path(void)229 const char *sqStore_path(void) { return(_storePath); }; // Returns the path to the store
230
231 static
232 uint32 sqStore_lastVersion(char const *storePath);
233
234 static
235 void sqStore_revertVersion(char const *storePath, uint32 version);
236
sqStore_lastLibraryID(void)237 uint32 sqStore_lastLibraryID(void) { return(_info.sqInfo_lastLibraryID()); };
sqStore_lastReadID(void)238 uint32 sqStore_lastReadID(void) { return(_info.sqInfo_lastReadID()); };
239
240 // If iterating over reads, use sqStore_lastReadID(), not sqStore_getNumReads().
sqStore_getNumReads(sqRead_which w)241 uint32 sqStore_getNumReads(sqRead_which w) { return(_info.sqInfo_numReads(w)); };
242
sqStore_getLibrary(uint32 id)243 sqLibrary *sqStore_getLibrary(uint32 id) { return(&_libraries[id]); };
244
sqStore_getLibraryIDForRead(uint32 id)245 uint32 sqStore_getLibraryIDForRead(uint32 id) { return(_meta[id].sqRead_libraryID()); };
sqStore_getLibraryForRead(uint32 id)246 sqLibrary *sqStore_getLibraryForRead(uint32 id) { return(&_libraries[_meta[id].sqRead_libraryID()]); };
247
248 public:
249 readBuffer *sqStore_getReadBuffer(uint32 readID);
250 sqRead *sqStore_getRead(uint32 readID, sqRead *read);
251
252 public:
253 static
254 bool sqStore_loadReadFromBuffer(readBuffer *B, sqRead *read);
255 void sqStore_saveReadToBuffer(writeBuffer *B, uint32 id, sqRead *rd, sqReadDataWriter *wr);
256
257 private:
sqStore_getSeq(sqRead_which w)258 sqReadSeq *sqStore_getSeq(sqRead_which w) {
259
260 if (w == sqRead_unset) // If w is somehow set to the unset state,
261 w = sqRead_defaultVersion; // go back to using the global default.
262
263 bool isRaw = ((w & sqRead_raw) == sqRead_raw);
264 bool isCor = ((w & sqRead_corrected) == sqRead_corrected);
265 bool isTrm = ((w & sqRead_trimmed) == sqRead_trimmed);
266 bool isCmp = ((w & sqRead_compressed) == sqRead_compressed);
267
268 if ((isRaw == true) && (isCmp == false)) { return(_rawU); };
269 if ((isRaw == true) && (isCmp == true)) { return(_rawC); };
270
271 if ((isCor == true) && (isCmp == false)) { return(_corU); };
272 if ((isCor == true) && (isCmp == true)) { return(_corC); };
273
274 fprintf(stderr, "sqStore_getSeq()-- Unknown which '%s'\n", toString(w));
275
276 assert(0);
277 return(NULL);
278 };
279
280 // Accessors to read data.
281 public:
282 uint64 sqStore_getReadSegm(uint32 id);
283 uint64 sqStore_getReadByte(uint32 id);
284
285 sqReadSeq *sqStore_getReadSeq(uint32 id, sqRead_which w=sqRead_defaultVersion);
286
287 // Read length is zero if the read isn't present, isn't valid, is ignored,
288 // or is trimmed out. Ideally, those cases should generate an error -- we
289 // should be testing for those cases before deciding to use a read -- but
290 // too many places are expecting length==0 in thse cases.
291 //
292 uint32 sqStore_getReadLength(uint32 id, sqRead_which w=sqRead_defaultVersion);
293
294 // Unlike read length, we'll insist that accessing clear ranges only occur
295 // for trimmed reads (enforced by sqReadSeq).
296 //
297 uint32 sqStore_getClearBgn (uint32 id, sqRead_which w=sqRead_defaultVersion);
298 uint32 sqStore_getClearEnd (uint32 id, sqRead_which w=sqRead_defaultVersion);
299
300 bool sqStore_isValidRead(uint32 id, sqRead_which w=sqRead_defaultVersion);
301 bool sqStore_isIgnoredRead(uint32 id, sqRead_which w=sqRead_defaultVersion);
302 bool sqStore_isTrimmedRead(uint32 id, sqRead_which w=sqRead_defaultVersion);
303
304 // For use ONLY by sqStoreCreate, to add new libraries and reads to a
305 // store.
306 //
307 // addEmptyLibrary() adds a new library to the store.
308 //
309 // createEmptyRead() allocates a sRDW, but does not add a new read to
310 // the store. It must be followed by addRead() to load the data to the
311 // store. Of note, the sRDW is assigned a read ID for the next -
312 // currently non-existent - read in the store. Calling
313 // createEmptyRead() twice with no addRead() between will create two
314 // reads with the same ID and Bad Things will result.
315 //
316 public:
317 sqLibrary *sqStore_addEmptyLibrary(char const *name, sqLibrary_tech techType);
318
319 sqReadDataWriter *sqStore_createEmptyRead(sqLibrary *lib, const char *name);
320 void sqStore_addRead(sqReadDataWriter *rdw);
321
322 // Used when initially loading reads into seqStore, and when loading
323 // trimmed reads. It sets the ignore flag in both the normal and
324 // compressed metadata for a given read. Select 'raw' or 'corrected' with
325 // aqRead_which, then select untrimmed or trimmed with the flags.
326 public:
327 void sqStore_setIgnored(uint32 id,
328 bool untrimmed,
329 bool trimmed,
330 sqRead_which w=sqRead_defaultVersion);
331
332 // This loads a read, computes the compressed sequence, and sets both the
333 // normal and compressed clear ranges.
334 public:
335 void sqStore_setClearRange(uint32 id,
336 uint32 bgn, uint32 end, bool bogus,
337 sqRead_which w=sqRead_defaultVersion);
338
339 private:
340 sqStoreInfo _info; // All the stuff stored on disk.
341
342 char _storePath[FILENAME_MAX+1]; // Needed to create files
343 char _metaPath[FILENAME_MAX+1];
344
345 sqStore_mode _mode; // What mode this store is opened as, sanity checking
346 uint32 _version;
347
348 uint32 _librariesAlloc; // Size of allocation
349 sqLibrary *_libraries; // In core data
350
351 uint32 _readsAlloc; // Size of allocation
352 sqReadMeta *_meta; // In core data
353 sqReadSeq *_rawU; // Metadata for raw sequence
354 sqReadSeq *_rawC; // Metadata for raw compressed sequence
355 sqReadSeq *_corU;
356 sqReadSeq *_corC;
357
358 sqStoreBlobReader *_blobReader;
359 sqStoreBlobWriter *_blobWriter;
360 };
361
362
363
364
365 // The rest are just accessors to reads.
366
367
368 inline
369 uint64
sqStore_getReadSegm(uint32 id)370 sqStore::sqStore_getReadSegm(uint32 id) {
371 return(_meta[id].sqRead_mSegm());
372 }
373
374
375 inline
376 uint64
sqStore_getReadByte(uint32 id)377 sqStore::sqStore_getReadByte(uint32 id) {
378 return(_meta[id].sqRead_mByte());
379 }
380
381
382 inline
383 sqReadSeq *
sqStore_getReadSeq(uint32 id,sqRead_which w)384 sqStore::sqStore_getReadSeq(uint32 id, sqRead_which w) {
385 sqReadSeq *seq = sqStore_getSeq(w);
386
387 assert(id > 0);
388
389 return((seq == NULL) ? NULL : &seq[id]);
390 }
391
392
393 // Read length is zero if the read isn't present, isn't valid, is ignored,
394 // or is trimmed out. Ideally, those cases should generate an error -- we
395 // should be testing for those cases before deciding to use a read -- but
396 // too many places are expecting length==0 in thse cases.
397 //
398 inline
399 uint32
sqStore_getReadLength(uint32 id,sqRead_which w)400 sqStore::sqStore_getReadLength(uint32 id, sqRead_which w) {
401 sqReadSeq *seq = sqStore_getSeq(w);
402
403 assert(id > 0);
404
405 if (w & sqRead_trimmed)
406 return(((seq == NULL) ||
407 (seq[id].sqReadSeq_trimmed() == false) ||
408 (seq[id].sqReadSeq_valid() == false) ||
409 (seq[id].sqReadSeq_ignoreT() == true)) ? 0 : (seq[id].sqReadSeq_clearEnd() - seq[id].sqReadSeq_clearBgn()));
410
411 else
412 return(((seq == NULL) ||
413 (seq[id].sqReadSeq_valid() == false) ||
414 (seq[id].sqReadSeq_ignoreU() == true)) ? 0 : (seq[id].sqReadSeq_length()));
415 }
416
417
418 // Unlike read length, we'll insist that accessing clear ranges only occur
419 // for trimmed reads (enforced by sqReadSeq).
420 //
421 inline
422 uint32
sqStore_getClearBgn(uint32 id,sqRead_which w)423 sqStore::sqStore_getClearBgn (uint32 id, sqRead_which w) {
424 sqReadSeq *seq = sqStore_getSeq(w);
425
426 assert(id > 0);
427
428 if (seq == NULL)
429 return(0);
430
431 if (w & sqRead_trimmed)
432 return(seq[id].sqReadSeq_clearBgn());
433 else
434 return(0);
435 }
436
437
438 inline
439 uint32
sqStore_getClearEnd(uint32 id,sqRead_which w)440 sqStore::sqStore_getClearEnd (uint32 id, sqRead_which w) {
441 sqReadSeq *seq = sqStore_getSeq(w);
442
443 assert(id > 0);
444
445 if (seq == NULL)
446 return(0);
447
448 if (w & sqRead_trimmed)
449 return(seq[id].sqReadSeq_clearEnd());
450 else
451 return(seq[id].sqReadSeq_length());
452 }
453
454
455 inline
456 bool
sqStore_isValidRead(uint32 id,sqRead_which w)457 sqStore::sqStore_isValidRead(uint32 id, sqRead_which w) {
458 sqReadSeq *seq = sqStore_getSeq(w);
459
460 assert(id > 0);
461
462 if (seq == NULL)
463 return(false);
464
465 if (w & sqRead_trimmed)
466 return(seq[id].sqReadSeq_valid() && seq[id].sqReadSeq_trimmed());
467 else
468 return(seq[id].sqReadSeq_valid());
469 }
470
471
472 inline
473 bool
sqStore_isIgnoredRead(uint32 id,sqRead_which w)474 sqStore::sqStore_isIgnoredRead(uint32 id, sqRead_which w) {
475 sqReadSeq *seq = sqStore_getSeq(w);
476
477 assert(id > 0);
478
479 if (seq == NULL)
480 return(false);
481
482 if (sqStore_isValidRead(id, w) == false)
483 return(true);
484
485 if (w & sqRead_trimmed)
486 return(seq[id].sqReadSeq_ignoreT());
487 else
488 return(seq[id].sqReadSeq_ignoreU());
489 }
490
491
492 inline
493 bool
sqStore_isTrimmedRead(uint32 id,sqRead_which w)494 sqStore::sqStore_isTrimmedRead(uint32 id, sqRead_which w) {
495 sqReadSeq *seq = sqStore_getSeq(w);
496
497 assert(id > 0);
498
499 return((seq == NULL) ? false : seq[id].sqReadSeq_trimmed());
500 }
501
502
503
504
505
506
507
508 #endif // SQSTORE_H
509