1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "ovStore.H"
19
20
21 ////////////////////////////////////////
22 //
23 // SEQUENTIAL STORE - only two functions.
24 //
25
ovStoreWriter(const char * path,sqStore * seq)26 ovStoreWriter::ovStoreWriter(const char *path, sqStore *seq) {
27 char name[FILENAME_MAX+1];
28
29 memset(_storePath, 0, FILENAME_MAX);
30 strncpy(_storePath, path, FILENAME_MAX);
31
32 // Fail if this is a valid ovStore.
33 #warning not failing if a valid store
34 //if (_info.test(_storePath) == true)
35 // fprintf(stderr, "ERROR: '%s' is a valid ovStore; cannot create a new one.\n", _storePath), exit(1);
36
37 // Create the new store
38
39 AS_UTL_mkdir(_storePath);
40
41 _info.clear(seq->sqStore_lastReadID());
42 //_info.save(_storePath); Used to save this as a sentinel, but now fails asserts I like
43
44 _seq = seq;
45
46 _index = new ovStoreOfft [_info.maxID() + 1];
47
48 _bof = NULL; // Open the file on the first overlap.
49 _bofSlice = 1; // Constant, never changes.
50 _bofPiece = 1; // Incremented whenever a file is closed.
51
52 _histogram = new ovStoreHistogram(_seq); // Only used for merging in results from output files.
53 }
54
55
56
~ovStoreWriter()57 ovStoreWriter::~ovStoreWriter() {
58
59 // Write the index
60
61 AS_UTL_saveFile(_storePath, '/', "index", _index, _info.maxID()+1);
62
63 delete [] _index;
64
65 // Update our copy of the histogram from the last open file, and close it.
66
67 if (_bof) {
68 _histogram->mergeHistogram(_bof->getHistogram());
69 _bof->removeHistogram();
70 }
71
72 delete _bof;
73
74 // Save the histogram data.
75
76 _histogram->saveHistogram(_storePath);
77
78 delete _histogram;
79
80 // Update the on-disk info with the results and real magic number
81
82 _info.save(_storePath);
83
84 // Report a nice success message.
85
86 fprintf(stderr, "Created ovStore '%s' with " F_U64 " overlaps for reads from " F_U32 " to " F_U32 ".\n",
87 _storePath, _info.numOverlaps(), _info.bgnID(), _info.endID());
88 }
89
90
91
92 void
writeOverlap(ovOverlap * overlap)93 ovStoreWriter::writeOverlap(ovOverlap *overlap) {
94
95 // Close the current output file if it's too big.
96 // The current output file must exist.
97 // The current output file must be too big.
98 // The current output file must NOT contain overlaps for the same read this overlap does.
99 //
100
101 //fprintf(stderr, "writeOverlap() maxID %u endID %u thisID %u\n", _info.maxID(), _info.endID(), overlap->a_iid);
102
103 // If an output file and it's too big (and we're not in the middle of a set of overlaps
104 // for some read), make a new output file. Save the histogram from the previous
105 // file in our master list, then delete it so it isn't saved to disk.
106
107 if ((_bof != NULL) &&
108 (_bof->fileTooBig() == true) &&
109 (_info.endID() < overlap->a_iid)) {
110 _histogram->mergeHistogram(_bof->getHistogram());
111
112 _bof->removeHistogram();
113
114 delete _bof;
115
116 _bof = NULL;
117 _bofPiece++;
118 }
119
120 // Open a new output file if there isn't one.
121
122 if (_bof == NULL)
123 _bof = new ovFile(_seq, _storePath, _bofSlice, _bofPiece, ovFileNormalWrite);
124
125 // Make sure the overlaps are sorted, and add the overlap to the info file.
126
127 if (overlap->a_iid > _info.maxID()) {
128 assert(0);
129 }
130
131 // Add the overlap to the index and info.
132
133 _index[overlap->a_iid].addOverlap(_bofSlice, _bofPiece, _bof->filePosition(), _info.numOverlaps());
134
135 _info.addOverlaps(overlap->a_iid, 1);
136
137 // Write the overlap.
138
139 _bof->writeOverlap(overlap);
140 }
141
142
143
144
145 ////////////////////////////////////////
146 //
147 // PARALLEL STORE - many functions, all the rest.
148 //
149
ovStoreSliceWriter(const char * path,sqStore * seq,uint32 sliceNum,uint32 numSlices,uint32 numBuckets)150 ovStoreSliceWriter::ovStoreSliceWriter(const char *path,
151 sqStore *seq,
152 uint32 sliceNum,
153 uint32 numSlices,
154 uint32 numBuckets) {
155
156 memset(_storePath, 0, FILENAME_MAX);
157 strncpy(_storePath, path, FILENAME_MAX);
158
159 _seq = seq;
160
161 _sliceNum = sliceNum;
162 _pieceNum = 1;
163 _numSlices = numSlices;
164 _numBuckets = numBuckets;
165 };
166
167
168
~ovStoreSliceWriter()169 ovStoreSliceWriter::~ovStoreSliceWriter() {
170
171 // Report a nice success message.
172
173 //fprintf(stderr, "Created ovStore '%s' with " F_U64 " overlaps for reads from " F_U32 " to " F_U32 ".\n",
174 // _storePath, _info.numOverlaps(), _info.bgnID(), _info.endID());
175 }
176
177
178
179
180 uint64
loadBucketSizes(uint64 * bucketSizes)181 ovStoreSliceWriter::loadBucketSizes(uint64 *bucketSizes) {
182 char name[FILENAME_MAX+1];
183
184 uint64 *sliceSizes = new uint64 [_numSlices + 1]; // For each overlap job, number of overlaps per bucket
185 uint64 totOvl = 0;
186
187 for (uint32 i=0; i<=_numBuckets; i++) {
188 bucketSizes[i] = 0;
189
190 snprintf(name, FILENAME_MAX, "%s/bucket%04u/slice%04u", _storePath, i, _sliceNum);
191
192 // If no file, there are no overlaps, so nothing to load.
193
194 if (fileExists(name) == false)
195 continue;
196
197 // Load the slice sizes, and save the number of overlaps in this slice.
198
199 snprintf(name, FILENAME_MAX, "%s/bucket%04u/sliceSizes", _storePath, i);
200 AS_UTL_loadFile(name, sliceSizes, _numSlices + 1); // Checks that all data is loaded, too.
201
202 fprintf(stderr, " found %10" F_U64P " overlaps in '%s'.\n", sliceSizes[_sliceNum], name);
203
204 bucketSizes[i] = sliceSizes[_sliceNum];
205 totOvl += sliceSizes[_sliceNum];
206 }
207
208 delete [] sliceSizes;
209
210 return(totOvl);
211 }
212
213
214
215 void
loadOverlapsFromBucket(uint32 bucket,uint64 expectedLen,ovOverlap * ovls,uint64 & ovlsLen)216 ovStoreSliceWriter::loadOverlapsFromBucket(uint32 bucket, uint64 expectedLen, ovOverlap *ovls, uint64& ovlsLen) {
217 char name[FILENAME_MAX+1];
218
219 if (expectedLen == 0)
220 return;
221
222 snprintf(name, FILENAME_MAX, "%s/bucket%04u/slice%04u", _storePath, bucket, _sliceNum);
223
224 if (fileExists(name) == false)
225 fprintf(stderr, "ERROR: " F_U64 " overlaps claim to exist in bucket '%s', but file not found.\n",
226 expectedLen, name), exit(1);
227
228 fprintf(stderr, " loading %10" F_U64P " overlaps from '%s'.\n", expectedLen, name);
229
230 ovFile *bof = new ovFile(_seq, name, ovFileFull);
231 uint64 before = ovlsLen;
232
233 while (bof->readOverlap(ovls + ovlsLen))
234 ovlsLen++;
235
236 delete bof;
237
238 if (ovlsLen - before != expectedLen)
239 fprintf(stderr, "ERROR: expected " F_U64 " overlaps, found " F_U64 " overlaps.\n",
240 expectedLen, ovlsLen - before), exit(1);
241 }
242
243
244
245 void
writeOverlaps(ovOverlap * ovls,uint64 ovlsLen)246 ovStoreSliceWriter::writeOverlaps(ovOverlap *ovls,
247 uint64 ovlsLen) {
248 ovStoreInfo info(_seq->sqStore_lastReadID());
249
250 // Probably wouldn't be too hard to make this take all overlaps for one read.
251 // But would need to track the open files in the class, not only in this function.
252 assert(info.numOverlaps() == 0);
253
254 // Check that overlaps are sorted.
255
256 uint64 nUnsorted = 0;
257
258 for (uint64 oo=1; oo<ovlsLen; oo++)
259 if (ovls[oo-1].a_iid > ovls[oo].a_iid)
260 nUnsorted++;
261
262 if (nUnsorted > 0) {
263 fprintf(stderr, "ERROR: Overlaps aren't sorted.\n");
264 exit(1);
265 }
266
267 // Create the index and overlaps files
268
269 ovStoreOfft *index = new ovStoreOfft [_seq->sqStore_lastReadID() + 1];
270 ovFile *olapFile = new ovFile(_seq, _storePath, _sliceNum, _pieceNum, ovFileNormalWrite);
271
272 // Dump the overlaps
273
274 for (uint64 oo=0; oo<ovlsLen; oo++ ) {
275
276 // If this overlap is for a new read, and we've written too many overlaps
277 // to the current piece, start a new piece.
278
279 if ((olapFile->fileTooBig() == true) &&
280 (ovls[oo].a_iid > info.endID())) {
281 delete olapFile;
282
283 _pieceNum++;
284
285 olapFile = new ovFile(_seq, _storePath, _sliceNum, _pieceNum, ovFileNormalWrite);
286 }
287
288 // Add the overlap to the index.
289
290 index[ovls[oo].a_iid].addOverlap(_sliceNum, _pieceNum, olapFile->filePosition(), oo);
291
292 // Add the overlap to the file.
293
294 olapFile->writeOverlap(ovls + oo);
295
296 // Add the overlap to the info
297
298 info.addOverlaps(ovls[oo].a_iid, 1);
299 }
300
301 // Close the output file, write the index, write the info.
302
303 delete olapFile;
304
305 char indexName[FILENAME_MAX+1];
306 snprintf(indexName, FILENAME_MAX, "%s/%04u.index", _storePath, _sliceNum);
307 AS_UTL_saveFile(indexName, index, info.maxID()+1);
308
309 delete [] index;
310
311 info.save(_storePath, _sliceNum, true);
312
313 // And done.
314
315 fprintf(stderr, " created '%s/%04u' with " F_U64 " overlaps for reads " F_U32 " to " F_U32 ".\n",
316 _storePath, _sliceNum, info.numOverlaps(), info.bgnID(), info.endID());
317 }
318
319
320
321
322
323 void
mergeInfoFiles(void)324 ovStoreSliceWriter::mergeInfoFiles(void) {
325 char indexName[FILENAME_MAX+1];
326
327 ovStoreInfo infopiece[_numSlices + 1];
328
329 // Load the info files. We need to load at least one to figure out maxID.
330
331 fprintf(stderr, " - Loading slice information.\n");
332 fprintf(stderr, " -\n");
333 fprintf(stderr, " - slice bgnID endID maxID numOlaps\n");
334 fprintf(stderr, " - ----- --------- --------- -------- ----------\n");
335
336 for (uint32 ss=1; ss<=_numSlices; ss++) {
337 infopiece[ss].load(_storePath, ss, true);
338
339 fprintf(stderr, " - %5" F_U32P " %9" F_U32P " %9" F_U32P " %8" F_U32P " %10" F_U64P "\n",
340 ss,
341 infopiece[ss].bgnID(),
342 infopiece[ss].endID(),
343 infopiece[ss].maxID(),
344 infopiece[ss].numOverlaps());
345 }
346
347 fprintf(stderr, " - ----- --------- --------- -------- ----------\n");
348
349 // Set us up and allocate some space.
350
351 ovStoreInfo info(infopiece[1].maxID());
352
353 ovStoreOfft *indexpiece = new ovStoreOfft [infopiece[1].maxID() + 1];
354 ovStoreOfft *index = new ovStoreOfft [infopiece[1].maxID() + 1];
355
356 // Merge in indexes.
357
358 fprintf(stderr, " -\n");
359 fprintf(stderr, " - Merging indexes.\n");
360 fprintf(stderr, " -\n");
361
362 for (uint32 ss=1; ss<=_numSlices; ss++) {
363
364 // Load the index for this piece.
365
366 snprintf(indexName, FILENAME_MAX, "%s/%04u.index", _storePath, ss);
367 AS_UTL_loadFile(indexName, indexpiece, info.maxID()+1);
368
369 // Copy index elements to the master index, updating overlapID.
370
371 for (uint32 ii=infopiece[ss].bgnID(); ii<=infopiece[ss].endID(); ii++) {
372 index[ii] = indexpiece[ii];
373 index[ii]._overlapID += info.numOverlaps();
374 }
375
376 // Update the master info.
377
378 info.addOverlaps(infopiece[ss].bgnID(), 0);
379 info.addOverlaps(infopiece[ss].endID(), infopiece[ss].numOverlaps());
380
381 // Keep users entertained.
382
383 //fprintf(stderr, " - Now finished with fragments " F_U32 " to " F_U32 " -- " F_U64 " overlaps.\n",
384 // info.bgnID(), info.endID(), info.numOverlaps());
385 }
386
387 // Dump the new store info file
388
389 info.save(_storePath, _numSlices);
390
391 // And the new index
392
393 AS_UTL_saveFile(_storePath, '/', "index", index, info.maxID()+1);
394
395 // Cleanup and done!
396
397 delete [] indexpiece;
398 delete [] index;
399
400 fprintf(stderr, " - Finished. " F_U32 " reads with " F_U64 " overlaps.\n",
401 info.endID(), info.numOverlaps());
402 fprintf(stderr, " -\n");
403 }
404
405
406
407 void
mergeHistogram(void)408 ovStoreSliceWriter::mergeHistogram(void) {
409 char dataname[FILENAME_MAX+1] = {0};
410
411 fprintf(stderr, " - Merging histograms.\n");
412 fprintf(stderr, " -\n");
413 fprintf(stderr, " -\n");
414 fprintf(stderr, " - slice piece bgnID endID\n");
415 fprintf(stderr, " - ----- ----- --------- ---------\n");
416
417 ovStoreHistogram *merged = new ovStoreHistogram(_seq);
418
419 for (uint32 ss=1; ss <= _numSlices; ss++) {
420 for (uint32 pp=1; pp < 1000; pp++) {
421 ovStoreHistogram piece(ovFile::createDataName(dataname, _storePath, ss, pp));
422
423 if (piece.overlapScoresLastID() > 0) {
424 fprintf(stderr, " - %5u %5u %9u %9u\n",
425 ss, pp, piece.overlapScoresBaseID(), piece.overlapScoresLastID());
426
427 merged->mergeHistogram(&piece);
428 } else {
429 break;
430 }
431 }
432 }
433
434 merged->saveHistogram(_storePath);
435
436 fprintf(stderr, " - ----- ----- --------- ---------\n");
437
438 delete merged;
439 }
440
441
442
443 void
removeOverlapSlice(void)444 ovStoreSliceWriter::removeOverlapSlice(void) {
445 char name[FILENAME_MAX+1];
446
447 for (uint32 bb=0; bb<=_numBuckets; bb++) {
448 snprintf(name, FILENAME_MAX, "%s/bucket%04u/slice%04u", _storePath, bb, _sliceNum);
449 AS_UTL_unlink(name);
450 }
451 }
452
453
454
455 void
checkSortingIsComplete(void)456 ovStoreSliceWriter::checkSortingIsComplete(void) {
457 char nameF[FILENAME_MAX+1];
458 char nameI[FILENAME_MAX+1];
459
460 fprintf(stderr, " - Checking that sorting is complete (every slice should have an info file).\n");
461 fprintf(stderr, " -\n");
462 fprintf(stderr, "\n");
463
464 uint32 failedJobs = 0;
465
466 for (uint32 i=1; i<=_numSlices; i++) {
467 snprintf(nameF, FILENAME_MAX, "%s/%04u.info", _storePath, i);
468 snprintf(nameI, FILENAME_MAX, "%s/%04u.index", _storePath, i);
469
470 bool existF = fileExists(nameF);
471 bool existI = fileExists(nameI);
472
473 if (existF && existI)
474 continue;
475
476 failedJobs++;
477
478 if (existF == false) fprintf(stderr, "ERROR: Segment " F_U32 " info not present (%s)\n", i, nameF);
479 if (existI == false) fprintf(stderr, "ERROR: Segment " F_U32 " index not present (%s)\n", i, nameI);
480 }
481
482 if (failedJobs > 0)
483 fprintf(stderr, "ERROR: " F_U32 " segments, out of " F_U32 ", failed.\n", _numSlices, failedJobs), exit(1);
484 }
485
486
487
488 void
removeAllIntermediateFiles(void)489 ovStoreSliceWriter::removeAllIntermediateFiles(void) {
490 char name[FILENAME_MAX+1];
491 char nomo[FILENAME_MAX+1]; // Esperanto, in case you were wondering.
492
493 // Remove sorting (slices) intermediates.
494
495 fprintf(stderr, " - Removing intermediate files.\n");
496 fprintf(stderr, " -\n");
497
498 for (uint32 ss=1; ss <= _numSlices; ss++) {
499 snprintf(name, FILENAME_MAX, "%s/%04u.index", _storePath, ss);
500 AS_UTL_unlink(name);
501
502 snprintf(name, FILENAME_MAX, "%s/%04u.info", _storePath, ss);
503 AS_UTL_unlink(name);
504
505 for (uint32 pp=1; pp < 1000; pp++) {
506 ovFile::createDataName(name, _storePath, ss, pp);
507 ovStoreHistogram::createDataName(nomo, name);
508
509 if (fileExists(nomo) == false)
510 break;
511
512 AS_UTL_unlink(nomo);
513 }
514 }
515
516 // Remove buckets.
517
518 for (uint32 bb=1; bb <= _numBuckets; bb++) {
519 snprintf(name, FILENAME_MAX, "%s/bucket%04u/sliceSizes", _storePath, bb);
520 AS_UTL_unlink(name);
521
522 for (uint32 ss=1; ss <= _numSlices; ss++) {
523 snprintf(name, FILENAME_MAX, "%s/bucket%04u/slice%04u", _storePath, bb, ss);
524 AS_UTL_unlink(name);
525 }
526
527 snprintf(name, FILENAME_MAX, "%s/bucket%04u", _storePath, bb);
528 AS_UTL_rmdir(name);
529 }
530 }
531