1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "ovStore.H"
19 
20 
21 ////////////////////////////////////////
22 //
23 //  SEQUENTIAL STORE - only two functions.
24 //
25 
ovStoreWriter(const char * path,sqStore * seq)26 ovStoreWriter::ovStoreWriter(const char *path, sqStore *seq) {
27   char name[FILENAME_MAX+1];
28 
29   memset(_storePath, 0, FILENAME_MAX);
30   strncpy(_storePath, path, FILENAME_MAX);
31 
32   //  Fail if this is a valid ovStore.
33 #warning not failing if a valid store
34   //if (_info.test(_storePath) == true)
35   //  fprintf(stderr, "ERROR:  '%s' is a valid ovStore; cannot create a new one.\n", _storePath), exit(1);
36 
37   //  Create the new store
38 
39   AS_UTL_mkdir(_storePath);
40 
41   _info.clear(seq->sqStore_lastReadID());
42   //_info.save(_storePath);   Used to save this as a sentinel, but now fails asserts I like
43 
44   _seq       = seq;
45 
46   _index     = new ovStoreOfft [_info.maxID() + 1];
47 
48   _bof       = NULL;   //  Open the file on the first overlap.
49   _bofSlice  = 1;      //  Constant, never changes.
50   _bofPiece  = 1;      //  Incremented whenever a file is closed.
51 
52   _histogram = new ovStoreHistogram(_seq);  //  Only used for merging in results from output files.
53 }
54 
55 
56 
~ovStoreWriter()57 ovStoreWriter::~ovStoreWriter() {
58 
59   //  Write the index
60 
61   AS_UTL_saveFile(_storePath, '/', "index", _index, _info.maxID()+1);
62 
63   delete [] _index;
64 
65   //  Update our copy of the histogram from the last open file, and close it.
66 
67   if (_bof) {
68     _histogram->mergeHistogram(_bof->getHistogram());
69     _bof->removeHistogram();
70   }
71 
72   delete _bof;
73 
74   //  Save the histogram data.
75 
76   _histogram->saveHistogram(_storePath);
77 
78   delete _histogram;
79 
80   //  Update the on-disk info with the results and real magic number
81 
82   _info.save(_storePath);
83 
84   //  Report a nice success message.
85 
86   fprintf(stderr, "Created ovStore '%s' with " F_U64 " overlaps for reads from " F_U32 " to " F_U32 ".\n",
87           _storePath, _info.numOverlaps(), _info.bgnID(), _info.endID());
88 }
89 
90 
91 
92 void
writeOverlap(ovOverlap * overlap)93 ovStoreWriter::writeOverlap(ovOverlap *overlap) {
94 
95   //  Close the current output file if it's too big.
96   //    The current output file must exist.
97   //    The current output file must be too big.
98   //    The current output file must NOT contain overlaps for the same read this overlap does.
99   //
100 
101   //fprintf(stderr, "writeOverlap() maxID %u endID %u thisID %u\n", _info.maxID(), _info.endID(), overlap->a_iid);
102 
103   //  If an output file and it's too big (and we're not in the middle of a set of overlaps
104   //  for some read), make a new output file.  Save the histogram from the previous
105   //  file in our master list, then delete it so it isn't saved to disk.
106 
107   if ((_bof != NULL) &&
108       (_bof->fileTooBig() == true) &&
109       (_info.endID() < overlap->a_iid)) {
110     _histogram->mergeHistogram(_bof->getHistogram());
111 
112     _bof->removeHistogram();
113 
114     delete _bof;
115 
116     _bof = NULL;
117     _bofPiece++;
118   }
119 
120   //  Open a new output file if there isn't one.
121 
122   if (_bof == NULL)
123     _bof = new ovFile(_seq, _storePath, _bofSlice, _bofPiece, ovFileNormalWrite);
124 
125   //  Make sure the overlaps are sorted, and add the overlap to the info file.
126 
127   if (overlap->a_iid > _info.maxID()) {
128     assert(0);
129   }
130 
131   //  Add the overlap to the index and info.
132 
133   _index[overlap->a_iid].addOverlap(_bofSlice, _bofPiece, _bof->filePosition(), _info.numOverlaps());
134 
135   _info.addOverlaps(overlap->a_iid, 1);
136 
137   //  Write the overlap.
138 
139   _bof->writeOverlap(overlap);
140 }
141 
142 
143 
144 
145 ////////////////////////////////////////
146 //
147 //  PARALLEL STORE - many functions, all the rest.
148 //
149 
ovStoreSliceWriter(const char * path,sqStore * seq,uint32 sliceNum,uint32 numSlices,uint32 numBuckets)150 ovStoreSliceWriter::ovStoreSliceWriter(const char *path,
151                                        sqStore    *seq,
152                                        uint32      sliceNum,
153                                        uint32      numSlices,
154                                        uint32      numBuckets) {
155 
156   memset(_storePath, 0, FILENAME_MAX);
157   strncpy(_storePath, path, FILENAME_MAX);
158 
159   _seq                 = seq;
160 
161   _sliceNum            = sliceNum;
162   _pieceNum            = 1;
163   _numSlices           = numSlices;
164   _numBuckets          = numBuckets;
165 };
166 
167 
168 
~ovStoreSliceWriter()169 ovStoreSliceWriter::~ovStoreSliceWriter() {
170 
171   //  Report a nice success message.
172 
173   //fprintf(stderr, "Created ovStore '%s' with " F_U64 " overlaps for reads from " F_U32 " to " F_U32 ".\n",
174   //        _storePath, _info.numOverlaps(), _info.bgnID(), _info.endID());
175 }
176 
177 
178 
179 
180 uint64
loadBucketSizes(uint64 * bucketSizes)181 ovStoreSliceWriter::loadBucketSizes(uint64 *bucketSizes) {
182   char      name[FILENAME_MAX+1];
183 
184   uint64   *sliceSizes = new uint64 [_numSlices + 1];  //  For each overlap job, number of overlaps per bucket
185   uint64    totOvl     = 0;
186 
187   for (uint32 i=0; i<=_numBuckets; i++) {
188     bucketSizes[i] = 0;
189 
190     snprintf(name, FILENAME_MAX, "%s/bucket%04u/slice%04u", _storePath, i, _sliceNum);
191 
192     //  If no file, there are no overlaps, so nothing to load.
193 
194     if (fileExists(name) == false)
195       continue;
196 
197     //  Load the slice sizes, and save the number of overlaps in this slice.
198 
199     snprintf(name, FILENAME_MAX, "%s/bucket%04u/sliceSizes", _storePath, i);
200     AS_UTL_loadFile(name, sliceSizes, _numSlices + 1);  //  Checks that all data is loaded, too.
201 
202     fprintf(stderr, "  found %10" F_U64P " overlaps in '%s'.\n", sliceSizes[_sliceNum], name);
203 
204     bucketSizes[i] = sliceSizes[_sliceNum];
205     totOvl        += sliceSizes[_sliceNum];
206   }
207 
208   delete [] sliceSizes;
209 
210   return(totOvl);
211 }
212 
213 
214 
215 void
loadOverlapsFromBucket(uint32 bucket,uint64 expectedLen,ovOverlap * ovls,uint64 & ovlsLen)216 ovStoreSliceWriter::loadOverlapsFromBucket(uint32 bucket, uint64 expectedLen, ovOverlap *ovls, uint64& ovlsLen) {
217   char name[FILENAME_MAX+1];
218 
219   if (expectedLen == 0)
220     return;
221 
222   snprintf(name, FILENAME_MAX, "%s/bucket%04u/slice%04u", _storePath, bucket, _sliceNum);
223 
224   if (fileExists(name) == false)
225     fprintf(stderr, "ERROR: " F_U64 " overlaps claim to exist in bucket '%s', but file not found.\n",
226             expectedLen, name), exit(1);
227 
228   fprintf(stderr, "  loading %10" F_U64P " overlaps from '%s'.\n", expectedLen, name);
229 
230   ovFile   *bof    = new ovFile(_seq, name, ovFileFull);
231   uint64    before = ovlsLen;
232 
233   while (bof->readOverlap(ovls + ovlsLen))
234     ovlsLen++;
235 
236   delete bof;
237 
238   if (ovlsLen - before != expectedLen)
239     fprintf(stderr, "ERROR: expected " F_U64 " overlaps, found " F_U64 " overlaps.\n",
240             expectedLen, ovlsLen - before), exit(1);
241 }
242 
243 
244 
245 void
writeOverlaps(ovOverlap * ovls,uint64 ovlsLen)246 ovStoreSliceWriter::writeOverlaps(ovOverlap  *ovls,
247                                   uint64      ovlsLen) {
248   ovStoreInfo    info(_seq->sqStore_lastReadID());
249 
250   //  Probably wouldn't be too hard to make this take all overlaps for one read.
251   //  But would need to track the open files in the class, not only in this function.
252   assert(info.numOverlaps() == 0);
253 
254   //  Check that overlaps are sorted.
255 
256   uint64  nUnsorted = 0;
257 
258   for (uint64 oo=1; oo<ovlsLen; oo++)
259     if (ovls[oo-1].a_iid > ovls[oo].a_iid)
260       nUnsorted++;
261 
262   if (nUnsorted > 0) {
263     fprintf(stderr, "ERROR: Overlaps aren't sorted.\n");
264     exit(1);
265   }
266 
267   //  Create the index and overlaps files
268 
269   ovStoreOfft  *index     = new ovStoreOfft [_seq->sqStore_lastReadID() + 1];
270   ovFile       *olapFile  = new ovFile(_seq, _storePath, _sliceNum, _pieceNum, ovFileNormalWrite);
271 
272   //  Dump the overlaps
273 
274   for (uint64 oo=0; oo<ovlsLen; oo++ ) {
275 
276     //  If this overlap is for a new read, and we've written too many overlaps
277     //  to the current piece, start a new piece.
278 
279     if ((olapFile->fileTooBig() == true) &&
280         (ovls[oo].a_iid          > info.endID())) {
281       delete olapFile;
282 
283       _pieceNum++;
284 
285       olapFile  = new ovFile(_seq, _storePath, _sliceNum, _pieceNum, ovFileNormalWrite);
286     }
287 
288     //  Add the overlap to the index.
289 
290     index[ovls[oo].a_iid].addOverlap(_sliceNum, _pieceNum, olapFile->filePosition(), oo);
291 
292     //  Add the overlap to the file.
293 
294     olapFile->writeOverlap(ovls + oo);
295 
296     //  Add the overlap to the info
297 
298     info.addOverlaps(ovls[oo].a_iid, 1);
299   }
300 
301   //  Close the output file, write the index, write the info.
302 
303   delete    olapFile;
304 
305   char indexName[FILENAME_MAX+1];
306   snprintf(indexName, FILENAME_MAX, "%s/%04u.index", _storePath, _sliceNum);
307   AS_UTL_saveFile(indexName, index, info.maxID()+1);
308 
309   delete [] index;
310 
311   info.save(_storePath, _sliceNum, true);
312 
313   //  And done.
314 
315   fprintf(stderr, "  created '%s/%04u' with " F_U64 " overlaps for reads " F_U32 " to " F_U32 ".\n",
316           _storePath, _sliceNum, info.numOverlaps(), info.bgnID(), info.endID());
317 }
318 
319 
320 
321 
322 
323 void
mergeInfoFiles(void)324 ovStoreSliceWriter::mergeInfoFiles(void) {
325   char           indexName[FILENAME_MAX+1];
326 
327   ovStoreInfo    infopiece[_numSlices + 1];
328 
329   //  Load the info files.  We need to load at least one to figure out maxID.
330 
331   fprintf(stderr, " - Loading slice information.\n");
332   fprintf(stderr, " -\n");
333   fprintf(stderr, " - slice     bgnID     endID    maxID   numOlaps\n");
334   fprintf(stderr, " - ----- --------- --------- -------- ----------\n");
335 
336   for (uint32 ss=1; ss<=_numSlices; ss++) {
337     infopiece[ss].load(_storePath, ss, true);
338 
339     fprintf(stderr, " - %5" F_U32P " %9" F_U32P " %9" F_U32P " %8" F_U32P " %10" F_U64P "\n",
340             ss,
341             infopiece[ss].bgnID(),
342             infopiece[ss].endID(),
343             infopiece[ss].maxID(),
344             infopiece[ss].numOverlaps());
345   }
346 
347   fprintf(stderr, " - ----- --------- --------- -------- ----------\n");
348 
349   //  Set us up and allocate some space.
350 
351   ovStoreInfo    info(infopiece[1].maxID());
352 
353   ovStoreOfft   *indexpiece = new ovStoreOfft [infopiece[1].maxID() + 1];
354   ovStoreOfft   *index      = new ovStoreOfft [infopiece[1].maxID() + 1];
355 
356   //  Merge in indexes.
357 
358   fprintf(stderr, " -\n");
359   fprintf(stderr, " - Merging indexes.\n");
360   fprintf(stderr, " -\n");
361 
362   for (uint32 ss=1; ss<=_numSlices; ss++) {
363 
364     //  Load the index for this piece.
365 
366     snprintf(indexName, FILENAME_MAX, "%s/%04u.index", _storePath, ss);
367     AS_UTL_loadFile(indexName, indexpiece, info.maxID()+1);
368 
369     //  Copy index elements to the master index, updating overlapID.
370 
371     for (uint32 ii=infopiece[ss].bgnID(); ii<=infopiece[ss].endID(); ii++) {
372       index[ii]             = indexpiece[ii];
373       index[ii]._overlapID += info.numOverlaps();
374     }
375 
376     //  Update the master info.
377 
378     info.addOverlaps(infopiece[ss].bgnID(), 0);
379     info.addOverlaps(infopiece[ss].endID(), infopiece[ss].numOverlaps());
380 
381     //  Keep users entertained.
382 
383     //fprintf(stderr, " -  Now finished with fragments " F_U32 " to " F_U32 " -- " F_U64 " overlaps.\n",
384     //        info.bgnID(), info.endID(), info.numOverlaps());
385   }
386 
387   //  Dump the new store info file
388 
389   info.save(_storePath, _numSlices);
390 
391   //  And the new index
392 
393   AS_UTL_saveFile(_storePath, '/', "index", index, info.maxID()+1);
394 
395   //  Cleanup and done!
396 
397   delete [] indexpiece;
398   delete [] index;
399 
400   fprintf(stderr, " - Finished.  " F_U32 " reads with " F_U64 " overlaps.\n",
401           info.endID(), info.numOverlaps());
402   fprintf(stderr, " -\n");
403 }
404 
405 
406 
407 void
mergeHistogram(void)408 ovStoreSliceWriter::mergeHistogram(void) {
409   char               dataname[FILENAME_MAX+1] = {0};
410 
411   fprintf(stderr, " - Merging histograms.\n");
412   fprintf(stderr, " -\n");
413   fprintf(stderr, " -\n");
414   fprintf(stderr, " - slice piece     bgnID     endID\n");
415   fprintf(stderr, " - ----- ----- --------- ---------\n");
416 
417   ovStoreHistogram  *merged = new ovStoreHistogram(_seq);
418 
419   for (uint32 ss=1; ss <= _numSlices; ss++) {
420     for (uint32 pp=1; pp < 1000; pp++) {
421       ovStoreHistogram  piece(ovFile::createDataName(dataname, _storePath, ss, pp));
422 
423       if (piece.overlapScoresLastID() > 0) {
424         fprintf(stderr, " - %5u %5u %9u %9u\n",
425                 ss, pp, piece.overlapScoresBaseID(), piece.overlapScoresLastID());
426 
427         merged->mergeHistogram(&piece);
428       } else {
429         break;
430       }
431     }
432   }
433 
434   merged->saveHistogram(_storePath);
435 
436   fprintf(stderr, " - ----- ----- --------- ---------\n");
437 
438   delete merged;
439 }
440 
441 
442 
443 void
removeOverlapSlice(void)444 ovStoreSliceWriter::removeOverlapSlice(void) {
445   char name[FILENAME_MAX+1];
446 
447   for (uint32 bb=0; bb<=_numBuckets; bb++) {
448     snprintf(name, FILENAME_MAX, "%s/bucket%04u/slice%04u", _storePath, bb, _sliceNum);
449     AS_UTL_unlink(name);
450   }
451 }
452 
453 
454 
455 void
checkSortingIsComplete(void)456 ovStoreSliceWriter::checkSortingIsComplete(void) {
457   char    nameF[FILENAME_MAX+1];
458   char    nameI[FILENAME_MAX+1];
459 
460   fprintf(stderr, " - Checking that sorting is complete (every slice should have an info file).\n");
461   fprintf(stderr, " -\n");
462   fprintf(stderr, "\n");
463 
464   uint32  failedJobs = 0;
465 
466   for (uint32 i=1; i<=_numSlices; i++) {
467     snprintf(nameF, FILENAME_MAX, "%s/%04u.info",  _storePath, i);
468     snprintf(nameI, FILENAME_MAX, "%s/%04u.index", _storePath, i);
469 
470     bool existF = fileExists(nameF);
471     bool existI = fileExists(nameI);
472 
473     if (existF && existI)
474       continue;
475 
476     failedJobs++;
477 
478     if (existF == false)    fprintf(stderr, "ERROR: Segment " F_U32 " info  not present (%s)\n", i, nameF);
479     if (existI == false)    fprintf(stderr, "ERROR: Segment " F_U32 " index not present (%s)\n", i, nameI);
480   }
481 
482   if (failedJobs > 0)
483     fprintf(stderr, "ERROR: " F_U32 " segments, out of " F_U32 ", failed.\n", _numSlices, failedJobs), exit(1);
484 }
485 
486 
487 
488 void
removeAllIntermediateFiles(void)489 ovStoreSliceWriter::removeAllIntermediateFiles(void) {
490   char name[FILENAME_MAX+1];
491   char nomo[FILENAME_MAX+1];   //  Esperanto, in case you were wondering.
492 
493   //  Remove sorting (slices) intermediates.
494 
495   fprintf(stderr, " - Removing intermediate files.\n");
496   fprintf(stderr, " -\n");
497 
498   for (uint32 ss=1; ss <= _numSlices; ss++) {
499     snprintf(name, FILENAME_MAX, "%s/%04u.index", _storePath, ss);
500     AS_UTL_unlink(name);
501 
502     snprintf(name, FILENAME_MAX, "%s/%04u.info",  _storePath, ss);
503     AS_UTL_unlink(name);
504 
505     for (uint32 pp=1; pp < 1000; pp++) {
506       ovFile::createDataName(name, _storePath, ss, pp);
507       ovStoreHistogram::createDataName(nomo, name);
508 
509       if (fileExists(nomo) == false)
510         break;
511 
512       AS_UTL_unlink(nomo);
513     }
514   }
515 
516   //  Remove buckets.
517 
518   for (uint32 bb=1; bb <= _numBuckets; bb++) {
519     snprintf(name, FILENAME_MAX, "%s/bucket%04u/sliceSizes", _storePath, bb);
520     AS_UTL_unlink(name);
521 
522     for (uint32 ss=1; ss <= _numSlices; ss++) {
523       snprintf(name, FILENAME_MAX, "%s/bucket%04u/slice%04u", _storePath, bb, ss);
524       AS_UTL_unlink(name);
525     }
526 
527     snprintf(name, FILENAME_MAX, "%s/bucket%04u",            _storePath, bb);
528     AS_UTL_rmdir(name);
529   }
530 }
531