1 // ==========================================================================
2 // RABEMA Read Alignment Benchmark
3 // ==========================================================================
4 // Copyright (C) 2010-1012 Manuel Holtgrewe, FU Berlin
5 //
6 // This program is free software: you can redistribute it and/or modify it
7 // under the terms of the GNU General Public License as published by the Free
8 // Software Foundation, either version 3 of the License, or (at your option)
9 // any later version.
10 //
11 // This program is distributed in the hope that it will be useful, but WITHOUT
12 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 // more details.
15 //
16 // You should have received a copy of the GNU General Public License along
17 // with this program. If not, see <http://www.gnu.org/licenses/>.
18 //
19 // ==========================================================================
20 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
21 // ==========================================================================
22 // RABEMA is a read mapping benchmark tool. See README for an overview.
23 //
24 // This is the program for building the RABEMA gold standard. The input to
25 // this program is the reference sequence as a FASTA file and a "perfect
26 // mapping" SAM/BAM file (see README on how to obtain such a gold standard).
27 // ==========================================================================
28
29 // TODO(holtgrew): Another possible way to save memory is not to store the first, always identical part of the read id.
30
31 #include <fstream>
32 #include <iostream>
33 #include <map>
34
35 #include <seqan/arg_parse.h>
36 #include <seqan/bam_io.h>
37 #include <seqan/basic.h>
38 #include <seqan/sequence.h>
39 #include <seqan/store.h>
40 #include <seqan/seq_io.h>
41
42 #include "curve_smoothing.h"
43 #include "find_hamming_simple_ext.h"
44 #include "find_myers_ukkonen_reads.h"
45 #include "io_gsi.h"
46 #include "sorting.h"
47
48 // ============================================================================
49 // Enums, Tags, Classes, Typedefs.
50 // ============================================================================
51
52 typedef std::map<int, TWeightedMatches> TErrorCurves;
53
54 // ----------------------------------------------------------------------------
55 // Helper Class IntervalizeCmp
56 // ----------------------------------------------------------------------------
57
58 // Comparison functor used in intervalizeErrorCurves().
59
60 struct IntervalizeCmp
61 {
62 StringSet<CharString> const & readNameStore;
63
IntervalizeCmpIntervalizeCmp64 IntervalizeCmp(StringSet<CharString> const & readNameStore) :
65 readNameStore(readNameStore)
66 {}
67
operator ()IntervalizeCmp68 bool operator()(unsigned lhs, unsigned rhs) const
69 {
70 return lessThanSamtoolsQueryName(readNameStore[lhs], readNameStore[rhs]);
71 }
72
73 };
74
75 // ----------------------------------------------------------------------------
76 // Class ContigInterval
77 // ----------------------------------------------------------------------------
78
79 // Represents an interval in a contig.
80 struct ContigInterval
81 {
82 // Contig the interval is in.
83 size_t contigId;
84
85 // true iff the interval is on the forward strand.
86 bool isForward;
87
88 // First value of interval
89 size_t first;
90
91 // Last value of interval.
92 size_t last;
93
94 // Default constructor so it can be used in containers.
ContigIntervalContigInterval95 ContigInterval() : contigId(0), isForward(false), first(0), last(0)
96 {}
97
98 // Constructor for the record.
ContigIntervalContigInterval99 ContigInterval(size_t _contigId, bool _isForward, size_t _first, size_t _last) :
100 contigId(_contigId), isForward(_isForward), first(_first), last(_last)
101 {}
102 };
103
104 // ---------------------------------------------------------------------------
105 // Enum DistanceMetric
106 // ---------------------------------------------------------------------------
107
108 enum DistanceMetric
109 {
110 HAMMING_DISTANCE,
111 EDIT_DISTANCE
112 };
113
114 // ---------------------------------------------------------------------------
115 // Class BuildGoldStandardOptions
116 // ---------------------------------------------------------------------------
117
118 struct BuildGoldStandardOptions
119 {
120 // Verbosity level, quiet: 0, normal: 1, verbose: 2, very verbose: 3.
121 int verbosity;
122
123 // Whether N should match all characters without penalty.
124 bool matchN;
125
126 // Whether or not to run in oracle mode. The input SAM/BAM file must only contain the sample location for each read
127 // in this case. This is used when building a GSI file for reads simulated by Mason where the original locations
128 // are written out in a SAM/BAM file.
129 bool oracleMode;
130
131 // Maximal error rate in percent, relative to the read length.
132 int maxError;
133
134 // Indicates whether a maximal error was set.
135 bool maxErrorSet;
136
137 // Distance metric to use.
138 DistanceMetric distanceMetric;
139
140 // Path to the output file that we will write the GSI to.
141 seqan::CharString outGsiPath;
142
143 // Path to the reference contigs file.
144 seqan::CharString referencePath;
145
146 // Exactly one of the following two has to be given.
147 //
148 // Path to the perfect input SAM/BAM file.
149 seqan::CharString inBamPath;
150
BuildGoldStandardOptionsBuildGoldStandardOptions151 BuildGoldStandardOptions() :
152 verbosity(1),
153 matchN(false),
154 oracleMode(false),
155 maxError(0),
156 maxErrorSet(false),
157 distanceMetric(EDIT_DISTANCE)
158 {}
159 };
160
161 // ============================================================================
162 // Metafunctions
163 // ============================================================================
164
165 // ============================================================================
166 // Functions
167 // ============================================================================
168
169 // ---------------------------------------------------------------------------
170 // Function metricName()
171 // ---------------------------------------------------------------------------
172
metricName(DistanceMetric met)173 char const * metricName(DistanceMetric met)
174 {
175 if (met == EDIT_DISTANCE)
176 return "edit";
177 else // if (met == HAMMING_DISTANCE)
178 return "hamming";
179 }
180
181 // ---------------------------------------------------------------------------
182 // Function yesNo()
183 // ---------------------------------------------------------------------------
184
yesNo(bool b)185 char const * yesNo(bool b)
186 {
187 if (b)
188 return "yes";
189 else
190 return "no";
191 }
192
193 // ----------------------------------------------------------------------------
194 // Function trimSeqHeaderToId()
195 // ----------------------------------------------------------------------------
196
197 // TODO(holtgrew): Such a method is already in the SeqAn library, but in extras. Remove here when it is in core.
trimSeqHeaderToId(seqan::CharString & header)198 void trimSeqHeaderToId(seqan::CharString & header)
199 {
200 unsigned i = 0;
201 for (; i < length(header); ++i)
202 if (isspace(header[i]))
203 break;
204 resize(header, i);
205 }
206
207 // ----------------------------------------------------------------------------
208 // Function intervalizeAndDumpErrorCurves()
209 // ----------------------------------------------------------------------------
210
211 // Build intervals from the error curves and print them to stream.
212
213 template <typename TStream>
intervalizeAndDumpErrorCurves(TStream & stream,TErrorCurves const & errorCurves,String<int> const & readAlignmentDistances,StringSet<CharString> const & readNameStore,StringSet<CharString> const & contigNameStore,BuildGoldStandardOptions const & options,bool toStdout)214 int intervalizeAndDumpErrorCurves(TStream & stream,
215 TErrorCurves const & errorCurves,
216 String<int> const & readAlignmentDistances,
217 StringSet<CharString> const & readNameStore,
218 StringSet<CharString> const & contigNameStore,
219 BuildGoldStandardOptions const & options,
220 bool toStdout)
221 {
222 if (!toStdout)
223 std::cerr << "\n____POINT TO INTERVAL CONVERSION______________________________________________\n\n"
224 << "Progress: ";
225
226 // Write out the header.
227 GsiHeader header;
228 writeHeader(stream, header, Gsi());
229 writeRecord(stream, GSI_COLUMN_NAMES, Gsi());
230
231 // Get a list of read ids, sorted by their read name.
232 String<unsigned> sortedReadIds;
233 reserve(sortedReadIds, length(readNameStore));
234 for (unsigned i = 0; i < length(readNameStore); ++i)
235 appendValue(sortedReadIds, i);
236 IntervalizeCmp cmp(readNameStore);
237 std::sort(begin(sortedReadIds, Standard()), end(sortedReadIds, Standard()), cmp);
238
239 unsigned tenPercent = errorCurves.size() / 10 + 1;
240 typedef TErrorCurves::const_iterator TErrorCurvesIter;
241 for (unsigned i = 0; i < length(sortedReadIds); ++i)
242 {
243 TErrorCurvesIter it = errorCurves.find(sortedReadIds[i]);
244 if (it == errorCurves.end())
245 {
246 if (!toStdout)
247 std::cerr << "WARNING: Something went wrong with read ids! This should not happen.\n";
248 continue;
249 }
250
251 if (!toStdout)
252 {
253 if (tenPercent > 0u && i % tenPercent == 0u)
254 std::cerr << i / tenPercent * 10 << '%';
255 else if (tenPercent > 5u && i % (tenPercent / 5) == 0u)
256 std::cerr << '.';
257 }
258
259 size_t readId = it->first;
260 TWeightedMatches const & matches = it->second;
261
262 // Sort the matches. Matches with high scores (negative score and low absolute value) come first.
263 TWeightedMatches sortedMatches(matches);
264
265 // intervals[e] holds the intervals for error e of the current read.
266 String<String<ContigInterval> > intervals;
267 int maxError = options.oracleMode ? 0 : (int)options.maxError;
268 if (options.oracleMode)
269 for (unsigned j = 0; j < length(matches); ++j)
270 maxError = std::max(maxError, matches[j].distance);
271 resize(intervals, maxError + 1);
272
273 // Join the intervals stored in sortedMatches.
274 //
275 // The position of the previous match, so we can consider only the ones with the smallest error.
276 //
277 // The following two vars should be != first pos and contigId.
278 size_t previousPos = std::numeric_limits<size_t>::max();
279 size_t previousContigId = std::numeric_limits<size_t>::max();
280 typedef Iterator<TWeightedMatches>::Type TWeightedMatchesIter;
281 for (TWeightedMatchesIter it = begin(sortedMatches);
282 it != end(sortedMatches); ++it)
283 {
284 // Skip it if (it - 1) pointed to same pos (and must point to
285 // one with smaller absolute distance.
286 if (it->pos == previousPos && it->contigId == previousContigId)
287 continue;
288 // Consider all currently open intervals with a greater error than the error in *it and extend them or
289 // create a new one.
290 int error = options.oracleMode ? 0 : abs(it->distance);
291 SEQAN_ASSERT_LEQ(error, maxError);
292 for (int e = error; e <= maxError; ++e)
293 {
294 // Handle base case of no open interval: Create new one.
295 if (length(intervals[e]) == 0)
296 {
297 appendValue(intervals[e], ContigInterval(it->contigId, it->isForward, it->pos, it->pos));
298 continue;
299 }
300 ContigInterval & interval = back(intervals[e]);
301 // Either extend the interval or create a new one.
302 if (interval.contigId == it->contigId && interval.isForward == it->isForward)
303 SEQAN_ASSERT_LEQ(interval.last, it->pos);
304 if (interval.contigId == it->contigId && interval.isForward == it->isForward &&
305 interval.last + 1 == it->pos)
306 back(intervals[e]).last += 1;
307 else
308 appendValue(intervals[e], ContigInterval(it->contigId, it->isForward, it->pos, it->pos));
309 }
310 // Book-keeping.
311 previousPos = it->pos;
312 previousContigId = it->contigId;
313 }
314
315 // Print the resulting intervals.
316 typedef Iterator<String<String<ContigInterval> > >::Type TIntervalContainerIter;
317 int distance = 0;
318 for (TIntervalContainerIter it = begin(intervals);
319 it != end(intervals); ++it, ++distance)
320 {
321 typedef Iterator<String<ContigInterval> >::Type TIntervalIter;
322 for (TIntervalIter it2 = begin(*it); it2 != end(*it); ++it2)
323 {
324 int flags = 0;
325 // We appended custom prefixes to the read ids. "/S" means single-end, "/0" means left mate, "/1" means
326 // right mate.
327 int mateNo = -1;
328 if (back(readNameStore[readId]) == '0')
329 mateNo = 0;
330 if (back(readNameStore[readId]) == '1')
331 mateNo = 1;
332 SEQAN_ASSERT_EQ(readNameStore[readId][length(readNameStore[readId]) - 2], '/');
333 if (mateNo == 0)
334 flags = GsiRecord::FLAG_PAIRED | GsiRecord::FLAG_FIRST_MATE;
335 else if (mateNo == 1)
336 flags = GsiRecord::FLAG_PAIRED | GsiRecord::FLAG_SECOND_MATE;
337
338 int gsiDistance = options.oracleMode ? readAlignmentDistances[readId] : distance;
339 if (options.oracleMode && options.maxErrorSet && gsiDistance > options.maxError)
340 continue; // Skip if cut off in Rabema oracle mode.
341 GsiRecord gsiRecord(CharString(prefix(readNameStore[readId], length(readNameStore[readId]) - 2)), flags,
342 gsiDistance, contigNameStore[it2->contigId],
343 it2->isForward, it2->first, it2->last);
344 writeRecord(stream, gsiRecord, Gsi());
345 }
346 }
347 }
348 if (!toStdout)
349 std::cerr << "100% DONE\n";
350
351 return 0;
352 }
353
354 // ----------------------------------------------------------------------------
355 // Function ceilAwayFromZero()
356 // ----------------------------------------------------------------------------
357
358 // Ceil away from Zero.
359 //
360 // ceilAwayFromZero(-1.5) == -2
361 // ceilAwayFromZero(1.5) == 2
362 template <typename T>
ceilAwayFromZero(T x)363 inline T ceilAwayFromZero(T x)
364 {
365 if (x < 0)
366 return floor(x);
367
368 return ceil(x);
369 }
370
371 // ----------------------------------------------------------------------------
372 // Function buildErrorCurvePoint()
373 // ----------------------------------------------------------------------------
374
375 // TODO(holtgrew): This function has still a lot of cleanup potential.
376
377 // Build the error curve points around the end position of the given contig.
378 //
379 // This is pretty involved and this function easily is the most complex one.
380 //
381 // Returns rightmost border of the added points.
382 template <typename TContigSeq, typename TReadSeq, typename TPatternSpec, typename TReadNames>
buildErrorCurvePoints(String<WeightedMatch> & errorCurve,int & maxError,TContigSeq & contig,size_t contigId,bool isForward,TReadSeq & read,size_t readId,size_t endPos,TReadNames const & readNames,bool matchN,TPatternSpec const &)383 void buildErrorCurvePoints(String<WeightedMatch> & errorCurve,
384 int & maxError,
385 TContigSeq /*const*/ & contig,
386 size_t contigId,
387 bool isForward,
388 TReadSeq /*const*/ & read,
389 size_t readId,
390 size_t endPos,
391 TReadNames const & readNames,
392 bool matchN,
393 TPatternSpec const &)
394 {
395 typedef typename Position<TContigSeq>::Type TPosition;
396
397 // In oracle Sam mode, the maximum error is the error at the position given in the Sam alignment.
398 bool oracleMode = false;
399 if (maxError == std::numeric_limits<int>::max()) // We are in oracle mode.
400 {
401 oracleMode = true;
402 Finder<TContigSeq> finder(contig);
403 Pattern<TReadSeq, TPatternSpec> pattern(read, -(int)length(read) * 40);
404 bool ret = setEndPosition(finder, pattern, endPos);
405 (void) ret; // If compiled without assertions.
406 SEQAN_ASSERT(ret);
407 maxError = -getScore(pattern);
408 }
409
410 // Debug-adjustments.
411 #define ENABLE 0
412 #define ALL 1
413 #define READID 1
414
415 //if (readNames[readId] == CharString("rabema_synthetic_ecoli_illumina_l100_100k_e3.000000376")) {
416 // std::cerr << "**************** read id = " << readId << " readname = " << readNames[readId] << " read = " << read << " maxError == " << maxError << std::endl;
417 //}
418
419 // The read maps with less than the given number of errors to [left, right]
420 // to the contig sequence, is forward strand iff is Forwar is true.
421 TPosition /*left = endPos,*/ right = endPos;
422
423 // Skip this alignment position if not right of previously
424 // right border of the interval.
425 // TODO(holtgrew): Remove this piece, it's unused right now. We could/should also fix this to work for streaming over SAM files.
426 //if (readId == previousReadId && contigId == previousContigId && left <= previousRightBorder) {
427 // //std::cerr << __FILE__ << ":" << __LINE__ << " Skipping because " << left << " <= " << previousRightBorder << std::endl;
428 // return previousRightBorder;
429 //}
430
431 bool ret; // Flag used for assertions below.
432 (void) ret; // If run without assertions.
433 int relativeMinScore = (int)ceilAwayFromZero(100.0 * -maxError / length(read));
434
435 // Setup the finder and pattern.
436 Finder<TContigSeq> finder(contig);
437 Pattern<TReadSeq, TPatternSpec> pattern(read, -(int)length(read) * 40);
438 // If configured so, match N against all other values, otherwise match
439 // against none.
440 _patternMatchNOfPattern(pattern, matchN);
441 _patternMatchNOfFinder(pattern, matchN);
442 TPosition hitBeginPosition;
443
444 if (ENABLE && (ALL || readId == READID))
445 {
446 std::cerr << "**************** read id = " << readId << " readname = " << readNames[readId]
447 << " read = " << read << std::endl;
448 std::cerr << __FILE__ << ":" << __LINE__ << " maxError = " << maxError << std::endl;
449 std::cerr << __FILE__ << ":" << __LINE__ << " relative min score = " << relativeMinScore << std::endl;
450 std::cerr << __FILE__ << ":" << __LINE__ << " extending to the right." << std::endl;
451 std::cerr << __FILE__ << ":" << __LINE__ << " contig id == " << contigId << std::endl;
452 std::cerr << __FILE__ << ":" << __LINE__ << " is forward strand? " << isForward << std::endl;
453 std::cerr << __FILE__ << ":" << __LINE__ << " contig length = " << length(contig) << std::endl;
454 std::cerr << __FILE__ << ":" << __LINE__ << " endPos = " << endPos << std::endl;
455 std::cerr << __FILE__ << ":" << __LINE__ << " infix(contig, endPos - length(read), endPos) == "
456 << infix(contig, endPos - length(read), endPos) << std::endl;
457 }
458
459 // We will first gather all results in tempMatches. Below, we
460 // will smooth the curve in these points and throw out too bad
461 // entries. Then, we will append tempMatches to errorCurve.
462 String<WeightedMatch> tempMatches;
463
464 // First, extend the interval to the right.
465 {
466 // Skip to original hit.
467 ret = setEndPosition(finder, pattern, endPos);
468 SEQAN_ASSERT(ret);
469 SEQAN_ASSERT_EQ(endPos, endPosition(finder));
470 SEQAN_ASSERT_GEQ(getScore(pattern), -maxError);
471 while (findBegin(finder, pattern, getScore(pattern)))
472 continue; // Find leftmost begin position.
473 SEQAN_ASSERT_GT(endPos, beginPosition(finder));
474 SEQAN_ASSERT_EQ(getScore(pattern), getBeginScore(pattern));
475
476 // Add original hit to the error curve points.
477 int relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
478 appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1, relativeScore,
479 beginPosition(finder)));
480 hitBeginPosition = beginPosition(finder);
481 if (ENABLE && (ALL || readId == READID))
482 {
483 std::cerr << __FILE__ << ":" << __LINE__ << " -- getScore(pattern) == " << getScore(pattern) << std::endl;
484 std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
485 << " for read id " << readId << " (FIRST HIT)" << std::endl;
486 std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder) << " read " << read
487 << " endPos = " << endPos << std::endl;
488 }
489
490 // Now extend to the first hit with too low score.
491 bool foundWithTooLowScore = false;
492 while (find(finder, pattern))
493 {
494 while (findBegin(finder, pattern, getScore(pattern)))
495 continue; // Find leftmost begin position.
496 if (getScore(pattern) < -maxError && beginPosition(finder) != back(tempMatches).beginPos)
497 {
498 foundWithTooLowScore = true;
499 if (ENABLE && (ALL || readId == READID))
500 {
501 std::cerr << __FILE__ << ":" << __LINE__ << " -- Found too low score." << std::endl;
502 std::cerr << __FILE__ << ":" << __LINE__ << " -- low scoring match was "
503 << WeightedMatch(contigId, isForward, endPosition(finder) - 1, relativeScore,
504 beginPosition(finder)) << std::endl;
505 }
506 break;
507 }
508 int relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
509 appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1, relativeScore,
510 beginPosition(finder)));
511 if (ENABLE && (ALL || readId == READID))
512 {
513 std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
514 << " for read id " << readId << std::endl;
515 std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder)
516 << " read " << read << std::endl;
517 }
518 right += 1;
519 }
520 // If we broke because of the score limit then collect the last not
521 // yet added hit and the ones right of it until the beginPosition
522 // changes.
523 if (foundWithTooLowScore)
524 {
525 if (beginPosition(finder) == hitBeginPosition)
526 {
527 relativeScore = static_cast<int>(ceilAwayFromZero(100.0 * getScore(pattern) / length(read)));
528 appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1,
529 relativeScore, beginPosition(finder)));
530 TPosition currentBeginPosition = beginPosition(finder);
531 // Add the rest until we hit one with a different begin position.
532 //
533 // Loop at most length(read) times (this limit is here
534 // for the quality based DP algorithm which can suffer
535 // from "infinite inserts").
536 for (unsigned i = 0; find(finder, pattern) && i < length(read); ++i)
537 {
538 while (findBegin(finder, pattern, getScore(pattern)))
539 continue; // Find leftmost begin position.
540 SEQAN_ASSERT(ret);
541 if (beginPosition(finder) != currentBeginPosition)
542 break;
543 relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
544 appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1,
545 relativeScore, beginPosition(finder)));
546 if (ENABLE && (ALL || readId == READID))
547 {
548 std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
549 << " for read id " << readId << std::endl;
550 std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder)
551 << " read " << read << std::endl;
552 }
553 right += 1;
554 }
555 }
556 }
557
558 if (ENABLE && (ALL || readId == READID))
559 {
560 std::cerr << __FILE__ << ":" << __LINE__ << " extending to the left." << std::endl;
561 }
562 // Then, extend the interval to the left.
563 {
564 // Length by which we extend the interval. Note that this must be
565 // at least the length of the read + max error count so no special
566 // treatment of islands on the left side must be done.
567 TPosition kIntervalLen = length(read) + maxError;
568 // Tentatively extend the interval to the left. We will
569 // extend until we hit a position with too low score.
570 TPosition tentativeLeft = endPos;
571 // Flag that indicates whether we found an entry with too low score.
572 bool foundTooLowScore = false;
573 // Flag that indicates whether we performed a loop iteration after having found a too low score.
574 bool loopedAfterFoundTooLowScore = false;
575 // Flag for breaking out of loop if we hit the right border of the last interval.
576 bool hitLastRight = false;
577 while (tentativeLeft > 0 && !loopedAfterFoundTooLowScore && !hitLastRight)
578 {
579 // Stop if went went to the rightmost position of the
580 // last interval with the previous loop iteration.
581 //if (readId == previousReadId && contigId == previousContigId && tentativeLeft == previousRightBorder)
582 // break;
583 loopedAfterFoundTooLowScore = foundTooLowScore;
584 TPosition oldTentativeLeft = tentativeLeft;
585 // Move the tentative left position left by kIntervalLen but not
586 // further left than 0.
587 if (ENABLE && (ALL || readId == READID))
588 std::cerr << "tentativeLeft before = " << tentativeLeft << std::endl;
589 tentativeLeft -= _min(kIntervalLen, tentativeLeft);
590 if (ENABLE && (ALL || readId == READID))
591 std::cerr << "tentativeLeft after = " << tentativeLeft << std::endl;
592 // Do not go further than the previous right position.
593 //if (readId == previousReadId && contigId == previousContigId && tentativeLeft <= previousRightBorder) {
594 // tentativeLeft = previousRightBorder;
595 // hitLastRight = true;
596 //}
597 if (ENABLE && (ALL || readId == READID))
598 {
599 std::cerr << __FILE__ << ":" << __LINE__ << " -- tentative left = " << tentativeLeft << std::endl;
600 }
601 // Search from tentative left position to the previous tentative left position.
602 ret = setEndPosition(finder, pattern, tentativeLeft);
603 SEQAN_ASSERT(ret);
604 if (ENABLE && (ALL || readId == READID))
605 {
606 std::cerr << __FILE__ << ":" << __LINE__ << " -- endPosition(finder) = "
607 << endPosition(finder) << std::endl;
608 }
609 if (endPosition(finder) > oldTentativeLeft)
610 break; // Could not set position of the finder left of old tentative left.
611 while (findBegin(finder, pattern, getScore(pattern)))
612 continue; // Find leftmost begin position.
613 int relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
614 appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1, relativeScore,
615 beginPosition(finder)));
616 if (ENABLE && (ALL || readId == READID))
617 {
618 std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
619 << " for read id " << readId << std::endl;
620 std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder) << " read "
621 << read << std::endl;
622 }
623 foundTooLowScore = foundTooLowScore || (relativeScore < relativeMinScore);
624 while (find(finder, pattern) && endPosition(finder) != oldTentativeLeft)
625 {
626 while (findBegin(finder, pattern, getScore(pattern)))
627 continue; // Find leftmost begin position.
628 SEQAN_ASSERT(ret);
629 relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
630 appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1,
631 relativeScore, beginPosition(finder)));
632 if (ENABLE && (ALL || readId == READID))
633 {
634 std::cerr << __FILE__ << ":" << __LINE__ << " -- raw score is " << getScore(pattern)
635 << std::endl;
636 std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
637 << " for read id " << readId << std::endl;
638 std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder) << " read "
639 << read << std::endl;
640 }
641 foundTooLowScore = foundTooLowScore || (relativeScore < relativeMinScore);
642 }
643 }
644 if (ENABLE && (ALL || readId == READID))
645 {
646 std::cerr << __FILE__ << ":" << __LINE__ << " -- after loop" << std::endl;
647 }
648 /*
649 // Now we can be sure that the temporary matches contain an entry
650 // left of the first one with a too low score that has a different
651 // begin position than hitBeginPosition.
652 std::sort(begin(tempMatches, Standard()), end(tempMatches, Standard()));
653 ModifiedString<String<WeightedMatch>, ModReverse> rTempMatches(tempMatches);
654 TPosition prefixLength = 0;
655 TPosition i;
656 // Search up to the first too low score.
657 for (i = 0; i < length(rTempMatches); ++i) {
658 if (ENABLE && (ALL || readId == READID)) {
659 std::cerr << "Considering " << rTempMatches[i] << std::endl;
660 }
661 if (rTempMatches[i].distance < relativeMinScore) {
662 if (rTempMatches[i].beginPos == hitBeginPosition) {
663 prefixLength += 1;
664 left -= 1;
665 i += 1;
666 }
667 break;
668 }
669 prefixLength += 1;
670 left -= 1;
671 }
672 if (ENABLE && (ALL || readId == READID)) {
673 std::cerr << "prefixLength == " << prefixLength << std::endl;
674 std::cerr << "i == " << i << std::endl;
675 }
676 // Then, append until we find a position with a different begin position.
677 for (; i < length(rTempMatches); ++i) {
678 if (rTempMatches[i].beginPos == hitBeginPosition) {
679 prefixLength += 1;
680 left -= 1;
681 } else {
682 break;
683 }
684 }
685 // Finally, append the prefix of the given length.
686 if (ENABLE && (ALL || readId == READID)) {
687 std::cerr << "appending prefix(rTempMatches, " << prefixLength << ")" << std::endl;
688 }
689 */
690 }
691 }
692 // Postprocessing: Sorting, smoothing, filtering.
693 std::sort(begin(tempMatches, Standard()), end(tempMatches, Standard()));
694 smoothErrorCurve(tempMatches);
695 appendValue(tempMatches, WeightedMatch(0, 0, 0, relativeMinScore - 1, 0)); // Sentinel.
696 if (oracleMode)
697 {
698 // In oracle Sam mode, we only want the lake with last pos endPos-1.
699 String<WeightedMatch> buffer;
700 bool flag = false;
701 for (size_t i = 0; i < length(tempMatches); ++i)
702 {
703 if (tempMatches[i].distance < relativeMinScore)
704 {
705 if (flag)
706 {
707 append(errorCurve, buffer);
708 break;
709 }
710 else
711 {
712 clear(buffer);
713 }
714 }
715 else
716 {
717 appendValue(buffer, tempMatches[i]);
718 if (tempMatches[i].pos == endPos - 1)
719 flag = true;
720 }
721 }
722 }
723 else
724 {
725 for (size_t i = 0; i < length(tempMatches); ++i)
726 {
727 if (tempMatches[i].distance >= relativeMinScore)
728 appendValue(errorCurve, tempMatches[i]);
729 }
730 }
731
732 // if (readId == READID) {
733 // std::cerr << ",-- errorCurve is (read id = " << readId << " readname = " << readNames[readId] << ")" << std::endl;
734 // for (unsigned i = 0; i < length(errorCurve); ++i) {
735 // std::cerr << "| " << errorCurve[i] << std::endl;
736 // }
737 // std::cerr << "`--" << std::endl;
738 // }
739
740 // std::cerr << __FILE__ << ":" << __LINE__ << " return " << right << std::endl;
741 }
742
743 // ----------------------------------------------------------------------------
744 // Function matchesToErrorFunction()
745 // ----------------------------------------------------------------------------
746
747 // Compute error curve from the reads and reference sequences.
748
749 template <typename TPatternSpec>
matchesToErrorFunction(TErrorCurves & errorCurves,String<int> & readAlignmentDistances,BamFileIn & inBam,StringSet<CharString> & readNameStore,StringSet<CharString> const & contigNameStore,FaiIndex const & faiIndex,BuildGoldStandardOptions const & options,TPatternSpec const &)750 int matchesToErrorFunction(TErrorCurves & errorCurves,
751 String<int> & readAlignmentDistances, // only used in case of oracle mode
752 BamFileIn & inBam,
753 StringSet<CharString> & readNameStore,
754 StringSet<CharString> const & contigNameStore,
755 FaiIndex const & faiIndex,
756 BuildGoldStandardOptions const & options,
757 TPatternSpec const & /*patternTag*/)
758 {
759 double startTime = 0;
760
761 if (atEnd(inBam))
762 return 0; // Do nothing if there are no more alignments in the file.
763
764 // We store the read names and append "/0" and "/1", depending on their flag value ("/0" for first, "/1" for last
765 // flag). Sadly, there is no easy way out here. Single-end reads are stored as "/S".
766 NameStoreCache<StringSet<CharString> > readNameStoreCache(readNameStore);
767 String<unsigned> readLengthStore;
768
769 std::cerr << "\n____EXTENDING INTERVALS_______________________________________________________\n\n"
770 << "Each dot represents work for 100k nucleotides in the genome.\n";
771
772 // Stream over SAM/BAM file. The main assumption here is that the reads are sorted by coordinate, which is check
773 // when loading the header.
774 //
775 // For each read alignments:
776 // If aligned on forward strand:
777 // Build error curve for this read alignment on forward strand.
778 // Else:
779 // Build error curve for this read alignment on backward strand.
780
781 startTime = sysTime(); // Time at beginning for total time display at end.
782 int prevRefId = -1; // Previous contig id.
783 int prevPos = -1;
784 int posOnContig = 0;
785 BamAlignmentRecord record; // Current read record.
786 Dna5String contig;
787 Dna5String rcContig;
788 Dna5String readSeq;
789 CharString readName;
790 while (!atEnd(inBam))
791 {
792 // -------------------------------------------------------------------
793 // Read SAM/BAM Record and retrieve name and sequence.
794 // -------------------------------------------------------------------
795
796 // Read SAM/BAM record.
797 try
798 {
799 readRecord(record, inBam);
800 }
801 catch (seqan::ParseError const & ioErr)
802 {
803 std::cerr << "ERROR reading SAM/BAM record(" << ioErr.what() << ")!\n";
804 return 1;
805 }
806 // Break if we have an unaligned SAM/BAM record.
807 if (record.rID == -1)
808 break; // Done!
809 // Check that the file is actually sorted by coordinate.
810 if (prevRefId != -1 && (prevRefId > record.rID || (prevRefId == record.rID && prevPos > record.beginPos)))
811 {
812 std::cerr << "ERROR: File was not sorted by coordinate!\n";
813 return 1;
814 }
815 // Get read name and sequence from record.
816 readSeq = record.seq; // Convert read sequence to Dna5.
817 // Compute reverse complement since we align against reverse strand, SAM has aligned sequence against forward
818 // strand.
819 if (hasFlagRC(record))
820 reverseComplement(readSeq);
821 trimSeqHeaderToId(record.qName); // Remove everything after the first whitespace.
822 if (!hasFlagMultiple(record))
823 append(record.qName, "/S");
824 if (hasFlagMultiple(record) && hasFlagFirst(record))
825 append(record.qName, "/0");
826 if (hasFlagMultiple(record) && hasFlagLast(record))
827 append(record.qName, "/1");
828 // Translate read to read id.
829 unsigned readId = 0;
830 if (!getIdByName(readId, readNameStoreCache, record.qName))
831 {
832 readId = length(readNameStore);
833 appendName(readNameStoreCache, record.qName);
834 appendValue(readLengthStore, length(record.seq));
835 if (options.oracleMode)
836 appendValue(readAlignmentDistances, -1);
837 }
838
839 // -------------------------------------------------------------------
840 // Handle Progress
841 // -------------------------------------------------------------------
842
843 // Handle progress: Change of contig.
844 SEQAN_ASSERT_LEQ(prevRefId, record.rID);
845 if (prevRefId != record.rID)
846 {
847 for (int i = prevRefId + 1; i <= record.rID; ++i)
848 {
849 if (i != prevRefId)
850 std::cerr << "\n";
851 std::cerr << contigNameStore[record.rID] << " (" << record.rID + 1 << "/"
852 << length(contigNameStore) << ") ";
853 }
854 posOnContig = 0;
855
856 // Load reference sequence on the fly using FAI index to save memory.
857 unsigned faiRefId = 0;
858 if (!getIdByName(faiRefId, faiIndex, contigNameStore[record.rID]))
859 {
860 std::cerr << "Reference sequence " << contigNameStore[record.rID] << " not known in FAI file.\n";
861 return 1;
862 }
863 try
864 {
865 readSequence(contig, faiIndex, faiRefId);
866 }
867 catch (seqan::ParseError const & ioErr)
868 {
869 std::cerr << "Could not load sequence " << contigNameStore[record.rID]
870 << " from FASTA file with FAI (" << ioErr.what() << ").\n";
871 return 1;
872 }
873 rcContig = contig;
874 reverseComplement(rcContig);
875 }
876 // Handle progress: Position on contig.
877 for (; posOnContig < record.beginPos; posOnContig += 100 * 1000)
878 {
879 if (posOnContig % (1000 * 1000) == 0 && posOnContig > 0)
880 std::cerr << posOnContig / 1000 / 1000 << "M";
881 else
882 std::cerr << ".";
883 }
884
885 // -------------------------------------------------------------------
886 // Extend error curve points for read.
887 // -------------------------------------------------------------------
888
889 // In oracle mode, set max error to -1, buildErrorCurvePoints() will use the error at the alignment position
890 // from the SAM/BAM file. In normal mode, convert from error rate from options to error count.
891 int maxError = std::numeric_limits<int>::max();
892 if (!options.oracleMode)
893 maxError = static_cast<int>(floor(0.01 * options.maxError * length(record.seq)));
894
895 // Compute end position of alignment.
896 int endPos = record.beginPos + getAlignmentLengthInRef(record) - countPaddings(record.cigar);
897
898 if (!hasFlagRC(record))
899 buildErrorCurvePoints(errorCurves[readId], maxError, contig, record.rID, !hasFlagRC(record),
900 readSeq, readId, endPos, readNameStore, options.matchN, TPatternSpec());
901 else
902 buildErrorCurvePoints(errorCurves[readId], maxError, rcContig, record.rID, !hasFlagRC(record),
903 readSeq, readId, length(rcContig) - record.beginPos, readNameStore, options.matchN,
904 TPatternSpec());
905 #if ENABLE
906 std::cerr << "AFTER BUILD, readId == " << readId << ", read name == " << readNameStore[readId] << ", maxError == " << maxError << "\n";
907 #endif // #if ENABLE
908 if (options.oracleMode)
909 readAlignmentDistances[readId] = maxError;
910
911 // Update variables storing the previous read/contig id and position.
912 prevRefId = record.rID;
913 prevPos = record.beginPos;
914 }
915 std::cerr << "\n\nTook " << sysTime() - startTime << " s\n";
916
917 // For all reads:
918 // Sort all error curve points.
919 // Fill gaps.
920 // Smooth them.
921 // Filter out low scoring ones.
922
923 std::cerr << "\n____SMOOTHING ERROR CURVES____________________________________________________\n\n";
924 startTime = sysTime();
925 unsigned tenPercent = length(readLengthStore) / 10 + 1;
926 std::cerr << "Progress: ";
927 for (unsigned readId = 0; readId < length(readLengthStore); ++readId)
928 {
929 if (tenPercent > 0u && readId % tenPercent == 0u)
930 std::cerr << readId / tenPercent * 10 << '%';
931 else if (tenPercent > 5u && readId % (tenPercent / 5) == 0u)
932 std::cerr << '.';
933
934 std::sort(begin(errorCurves[readId], Standard()), end(errorCurves[readId], Standard()));
935 fillGaps(errorCurves[readId]);
936 smoothErrorCurve(errorCurves[readId]);
937
938 // Compute relative min score for the read.
939 String<WeightedMatch> filtered;
940 int maxError = (int)floor(options.maxError / 100.0 * readLengthStore[readId]);
941 if (options.oracleMode)
942 {
943 SEQAN_ASSERT_NEQ(readAlignmentDistances[readId], -1);
944 maxError = readAlignmentDistances[readId];
945 if (options.maxErrorSet && maxError > options.maxError)
946 maxError = options.maxError;
947 }
948 int relativeMinScore = (int)ceilAwayFromZero(100.0 * -maxError / readLengthStore[readId]);
949 #if ENABLE
950 std::cerr << "AFTER SMOOTHING, readId == " << readId << ", read name == " << readNameStore[readId] << ", maxError == " << maxError << "\n";
951 #endif // #if ENABLE
952
953 // Filter out low scoring ones.
954 typedef typename Iterator<String<WeightedMatch> >::Type TIterator;
955 for (TIterator it = begin(errorCurves[readId]); it != end(errorCurves[readId]); ++it)
956 {
957 if (value(it).distance >= relativeMinScore)
958 appendValue(filtered, value(it));
959 }
960 move(errorCurves[readId], filtered);
961 }
962 std::cerr << "100% DONE\n"
963 << "\nTook: " << sysTime() - startTime << " s\n";
964
965 return 0;
966 }
967
968 // ---------------------------------------------------------------------------
969 // Function parseCommandLine()
970 // ---------------------------------------------------------------------------
971
972 seqan::ArgumentParser::ParseResult
parseCommandLine(BuildGoldStandardOptions & options,int argc,char const ** argv)973 parseCommandLine(BuildGoldStandardOptions & options, int argc, char const ** argv)
974 {
975 // -----------------------------------------------------------------------
976 // Parse Command Line Using ArgumentParser
977 // -----------------------------------------------------------------------
978
979 seqan::ArgumentParser parser("rabema_build_gold_standard");
980 setShortDescription(parser, "RABEMA Gold Standard Builder");
981 setVersion(parser, SEQAN_APP_VERSION " [" SEQAN_REVISION "]");
982 setDate(parser, SEQAN_DATE);
983 setCategory(parser, "Benchmarking");
984
985 addUsageLine(parser,
986 "[\\fIOPTIONS\\fP] \\fB--out-gsi\\fP \\fIOUT.gsi\\fP \\fB--reference\\fP \\fIREF.fa\\fP "
987 "\\fB--in-bam\\fP \\fIPERFECT.{sam,bam}\\fP");
988 addDescription(parser,
989 "This program allows one to build a RABEMA gold standard. The input is a reference FASTA file "
990 "and a perfect SAM/BAM map (e.g. created using RazerS 3 in full-sensitivity mode).");
991 addDescription(parser,
992 "The input SAM/BAM file must be \\fIsorted by coordinate\\fP. The program will create a "
993 "FASTA index file \\fIREF.fa.fai\\fP for fast random access to the reference.");
994
995 addOption(parser, seqan::ArgParseOption("v", "verbose", "Enable verbose output."));
996 addOption(parser, seqan::ArgParseOption("vv", "very-verbose", "Enable even more verbose output."));
997
998 addSection(parser, "Input / Output");
999 addOption(parser, seqan::ArgParseOption("o", "out-gsi", "Path to write the resulting GSI file to.",
1000 seqan::ArgParseArgument::OUTPUT_FILE, "GSI"));
1001 setValidValues(parser, "out-gsi", "gsi gsi.gz");
1002 setRequired(parser, "out-gsi", true);
1003 addOption(parser, seqan::ArgParseOption("r", "reference", "Path to load reference FASTA from.",
1004 seqan::ArgParseArgument::INPUT_FILE, "FASTA"));
1005 setRequired(parser, "reference", true);
1006 setValidValues(parser, "reference", seqan::SeqFileIn::getFileExtensions());
1007 addOption(parser, seqan::ArgParseOption("b", "in-bam", "Path to load the \"perfect\" SAM/BAM file from.",
1008 seqan::ArgParseArgument::INPUT_FILE, "BAM"));
1009 setValidValues(parser, "in-bam", BamFileIn::getFileExtensions());
1010
1011 addSection(parser, "Gold Standard Parameters");
1012 addOption(parser, seqan::ArgParseOption("", "oracle-mode",
1013 "Enable oracle mode. This is used for simulated data when the input "
1014 "SAM/BAM file gives exactly one position that is considered as the true "
1015 "sample position."));
1016 addOption(parser, seqan::ArgParseOption("", "match-N", "When set, N matches all characters without penalty."));
1017 addOption(parser, seqan::ArgParseOption("", "distance-metric",
1018 "Set distance metric. Valid values: hamming, edit. Default: edit.",
1019 seqan::ArgParseOption::STRING, "METRIC"));
1020 setValidValues(parser, "distance-metric", "hamming edit");
1021 setDefaultValue(parser, "distance-metric", "edit");
1022
1023 addOption(parser, seqan::ArgParseOption("e", "max-error",
1024 "Maximal error rate to build gold standard for in percent. This "
1025 "parameter is an integer and relative to the read length. "
1026 "In case of oracle mode, the error rate for the read at the "
1027 "sampling position is used and \\fIRATE\\fP is used as a cutoff "
1028 "threshold.",
1029 seqan::ArgParseArgument::INTEGER, "RATE"));
1030 setDefaultValue(parser, "max-error", 0);
1031
1032 addTextSection(parser, "Return Values");
1033 addText(parser, "A return value of 0 indicates success, any other value indicates an error.");
1034
1035 addTextSection(parser, "Examples");
1036
1037 addListItem(parser,
1038 "\\fBrabema_build_gold_standard\\fP \\fB-e\\fP \\fI4\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-s\\fP "
1039 "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1040 "Build gold standard from a SAM file \\fIIN.sam\\fP with all mapping locations and a FASTA "
1041 "reference \\fIREF.fa\\fP to GSI file \\fIOUT.gsi\\fP with a maximal error rate of \\fI4\\fP.");
1042 addListItem(parser,
1043 "\\fBrabema_build_gold_standard\\fP \\fB--distance-metric\\fP \\fIedit\\fP \\fB-e\\fP \\fI4\\fP "
1044 "\\fB-o\\fP \\fIOUT.gsi\\fP \\fB-b\\fP \\fIIN.bam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1045 "Same as above, but using Hamming instead of edit distance and BAM as the input.");
1046 addListItem(parser,
1047 "\\fBrabema_build_gold_standard\\fP \\fB--oracle-mode\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-s\\fP "
1048 "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1049 "Build gold standard from a SAM file \\fIIN.sam\\fP with the original sample position, e.g. "
1050 "as exported by read simulator Mason.");
1051
1052 addTextSection(parser, "Memory Requirements");
1053 addText(parser,
1054 "From version 1.1, great care has been taken to keep the memory requirements as low as possible. "
1055 "There memory required is two times the size of the largest chromosome plus some constant memory "
1056 "for each match.");
1057 addText(parser,
1058 "For example, the memory usage for 100bp human genome reads at 5% error rate was 1.7GB. Of this, "
1059 "roughly 400GB came from the chromosome and 1.3GB from the matches.");
1060
1061 addTextSection(parser, "References");
1062 addText(parser,
1063 "M. Holtgrewe, A.-K. Emde, D. Weese and K. Reinert. A Novel And Well-Defined Benchmarking Method "
1064 "For Second Generation Read Mapping, BMC Bioinformatics 2011, 12:210.");
1065 addListItem(parser, "\\fIhttp://www.seqan.de/rabema\\fP", "RABEMA Homepage");
1066 addListItem(parser, "\\fIhttp://www.seqan.de/mason\\fP", "Mason Homepage");
1067
1068 // Actually do the parsing and exit on error, help display etc.
1069 seqan::ArgumentParser::ParseResult res = parse(parser, argc, argv);
1070 if (res != seqan::ArgumentParser::PARSE_OK)
1071 return res;
1072
1073 // -----------------------------------------------------------------------
1074 // Fill BuildGoldStandardOptions Object
1075 // -----------------------------------------------------------------------
1076
1077 if (isSet(parser, "verbose"))
1078 options.verbosity = 2;
1079 if (isSet(parser, "very-verbose"))
1080 options.verbosity = 3;
1081
1082 if (isSet(parser, "match-N"))
1083 options.matchN = true;
1084 if (isSet(parser, "oracle-mode"))
1085 options.oracleMode = true;
1086 if (isSet(parser, "max-error"))
1087 {
1088 getOptionValue(options.maxError, parser, "max-error");
1089 options.maxErrorSet = true;
1090 }
1091 CharString distanceMetric;
1092 getOptionValue(distanceMetric, parser, "distance-metric");
1093 if (distanceMetric == "hamming")
1094 options.distanceMetric = HAMMING_DISTANCE;
1095
1096 if (isSet(parser, "out-gsi"))
1097 getOptionValue(options.outGsiPath, parser, "out-gsi");
1098 if (isSet(parser, "reference"))
1099 getOptionValue(options.referencePath, parser, "reference");
1100 if (isSet(parser, "in-bam"))
1101 getOptionValue(options.inBamPath, parser, "in-bam");
1102
1103 return res;
1104 }
1105
1106 // ----------------------------------------------------------------------------
1107 // Function main()
1108 // ----------------------------------------------------------------------------
1109
main(int argc,char const ** argv)1110 int main(int argc, char const ** argv)
1111 {
1112 // Parse command line and store results in options.
1113 BuildGoldStandardOptions options;
1114 seqan::ArgumentParser::ParseResult parseRes = parseCommandLine(options, argc, argv);
1115 if (parseRes != seqan::ArgumentParser::PARSE_OK)
1116 return parseRes == seqan::ArgumentParser::PARSE_ERROR;
1117
1118 double startTime = 0; // For measuring time below.
1119
1120 std::cerr << "==============================================================================\n"
1121 << " RABEMA - Read Alignment BEnchMArk\n"
1122 << "==============================================================================\n"
1123 << " Building Gold Standard\n"
1124 << "==============================================================================\n"
1125 << "\n"
1126 << "____OPTIONS___________________________________________________________________\n\n";
1127
1128 std::cerr << "Max error rate [%] " << options.maxError << "\n"
1129 << "Oracle mode " << yesNo(options.oracleMode) << '\n'
1130 << "Distance measure " << metricName(options.distanceMetric) << "\n"
1131 << "Match Ns " << yesNo(options.matchN) << '\n'
1132 << "GSI Output File " << options.outGsiPath << '\n'
1133 << "SAM/BAM Input File " << options.inBamPath << '\n'
1134 << "Reference File " << options.referencePath << '\n'
1135 << "Verbosity " << options.verbosity << "\n\n";
1136
1137 std::cerr << "____LOADING FILES_____________________________________________________________\n\n";
1138
1139 // =================================================================
1140 // Prepare File I/O.
1141 // =================================================================
1142
1143 startTime = sysTime();
1144 std::cerr << "Reference Index " << options.referencePath << ".fai ...";
1145 FaiIndex faiIndex;
1146 if (!open(faiIndex, toCString(options.referencePath)))
1147 {
1148 std::cerr << " FAILED (not fatal, we can just build it)\n";
1149 std::cerr << "Building Index " << options.referencePath << ".fai ...";
1150 if (!build(faiIndex, toCString(options.referencePath)))
1151 {
1152 std::cerr << "Could not build FAI index.\n";
1153 return 1;
1154 }
1155 std::cerr << " OK\n";
1156 seqan::CharString faiPath = options.referencePath;
1157 append(faiPath, ".fai");
1158 std::cerr << "Reference Index " << faiPath << " ...";
1159 try
1160 {
1161 save(faiIndex, toCString(faiPath));
1162 }
1163 catch (seqan::IOError const & ioErr)
1164 {
1165 std::cerr << "Could not write FAI index we just built (" << ioErr.what() << ").\n";
1166 return 1;
1167 }
1168 std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
1169 }
1170 std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
1171
1172 // Open SAM file and read in header.
1173 BamHeader bamHeader;
1174 BamFileIn inBam;
1175 if (!open(inBam, toCString(options.inBamPath)))
1176 {
1177 std::cerr << "Could not open SAM file.\n";
1178 return 1;
1179 }
1180 try
1181 {
1182 readHeader(bamHeader, inBam);
1183 }
1184 catch (seqan::ParseError const & ioErr)
1185 {
1186 std::cerr << "Could not read BAM header (" << ioErr.what() << ").\n";
1187 return 1;
1188 }
1189 std::cerr << " OK\n";
1190 std::cerr << "\nTook " << sysTime() - startTime << "s\n";
1191
1192 // =================================================================
1193 // Build point-wise error curve.
1194 // =================================================================
1195 startTime = sysTime();
1196 TErrorCurves errorCurves;
1197 int res = 0;
1198 StringSet<CharString> readNameStore;
1199 // In oracle mode, we store the distance of the alignment from the SAM file for each read. Otherwise, this variable
1200 // remains unused.
1201 String<int> readAlignmentDistances;
1202 if (options.distanceMetric == EDIT_DISTANCE)
1203 res = matchesToErrorFunction(errorCurves, readAlignmentDistances, inBam, readNameStore,
1204 contigNames(context(inBam)), faiIndex, options, MyersUkkonenReads());
1205 else // options.distanceMetric == DISTANCE_HAMMING
1206 res = matchesToErrorFunction(errorCurves, readAlignmentDistances, inBam, readNameStore,
1207 contigNames(context(inBam)), faiIndex, options, HammingSimple());
1208 if (res != 0)
1209 return 1;
1210
1211 // =================================================================
1212 // Convert points in error curves to intervals and write them to
1213 // stdout or a file.
1214 // =================================================================
1215
1216 // The two alternatives are equivalent after opening the file.
1217 startTime = sysTime();
1218 std::cerr << "\n____WRITING OUTPUT____________________________________________________________\n\n";
1219 VirtualStream<char, Output> gsiStream;
1220 if ((options.outGsiPath == "-" && !open(gsiStream, std::cout, Nothing())) ||
1221 !open(gsiStream, toCString(options.outGsiPath)))
1222 {
1223 std::cerr << "Could not open output file " << options.outGsiPath << ".\n";
1224 return 1;
1225 }
1226
1227 if (options.outGsiPath == "-")
1228 std::cerr << "Writing to stdout ...";
1229 else
1230 std::cerr << "Writing to " << options.outGsiPath << " ...";
1231 intervalizeAndDumpErrorCurves(gsiStream, errorCurves, readAlignmentDistances, readNameStore,
1232 contigNames(context(inBam)), options, true);
1233 std::cerr << " DONE\n"
1234 << "\n Took " << sysTime() - startTime << " s\n";
1235
1236 return 0;
1237 }
1238