1 // ==========================================================================
2 // RABEMA Read Alignment Benchmark
3 // ==========================================================================
4 // Copyright (C) 2010-1012 Manuel Holtgrewe, FU Berlin
5 //
6 // This program is free software: you can redistribute it and/or modify it
7 // under the terms of the GNU General Public License as published by the Free
8 // Software Foundation, either version 3 of the License, or (at your option)
9 // any later version.
10 //
11 // This program is distributed in the hope that it will be useful, but WITHOUT
12 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 // more details.
15 //
16 // You should have received a copy of the GNU General Public License along
17 // with this program. If not, see <http://www.gnu.org/licenses/>.
18 //
19 // ==========================================================================
20 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
21 // ==========================================================================
22 // RABEMA is a read mapping benchmark tool. See README for an overview.
23 //
24 // This is the program for comparing the SAM output of a read mapper to a
25 // Gold Standard Intervals (GSI) file.
26 // ==========================================================================
27
28 #include <seqan/arg_parse.h>
29 #include <seqan/bam_io.h>
30 #include <seqan/basic.h>
31 #include <seqan/find.h>
32 #include <seqan/misc/interval_tree.h>
33 #include <seqan/store.h>
34 #include <seqan/seq_io.h>
35
36 #include "find_hamming_simple_ext.h"
37 #include "find_myers_ukkonen_ext.h"
38 #include "find_myers_ukkonen_reads.h"
39 #include "io_gsi.h"
40 #include "rabema_stats.h"
41 #include "ref_id_mapping.h"
42 #include "sorting.h"
43
44 // ============================================================================
45 // Enums, Tags, Classes.
46 // ============================================================================
47
48 // ---------------------------------------------------------------------------
49 // Enum BenchmarkCategory
50 // ---------------------------------------------------------------------------
51
52 enum BenchmarkCategory
53 {
54 CATEGORY_ALL,
55 CATEGORY_ALL_BEST,
56 CATEGORY_ANY_BEST
57 };
58
59 // ---------------------------------------------------------------------------
60 // Enum DistanceMetric
61 // ---------------------------------------------------------------------------
62
63 enum DistanceMetric
64 {
65 HAMMING_DISTANCE,
66 EDIT_DISTANCE
67 };
68
69 // ----------------------------------------------------------------------------
70 // Class RabemaEvaluationOptions
71 // ----------------------------------------------------------------------------
72
73 class RabemaEvaluationOptions
74 {
75 public:
76 // Verbosity level, quiet: 0, normal: 1, verbose: 2, very verbose: 3.
77 int verbosity;
78
79 // ------------------------------------------------------------------------
80 // Benchmark Related
81 // ------------------------------------------------------------------------
82
83 // Maximum number or errors per read length in percent.
84 int maxError;
85
86 // If true, N matches as a wildcard. Otherwise it matches none.
87 bool matchN;
88
89 // If enabled, the distance of alignments is considered to be 0. This is used when comparing against GSI files that
90 // were generated by the read simulator. In this case, the intervals will have a length of 1 and be the single
91 // original position of this read.
92 bool oracleMode;
93
94 // Consider only reads that have a unique match in the mapping result file. Useful for precision computation.
95 bool onlyUniqueReads;
96
97 // The benchmark category, one of {"all", "any-best", "all-best"}.
98 BenchmarkCategory benchmarkCategory;
99
100 // Distance function to use, also see validDistanceFunction.
101 DistanceMetric distanceMetric;
102
103 // If true then we trust the "NM" tag of the SAM alignment if present. Otherwise, we perform a realignment at the
104 // target position.
105 bool trustNM;
106
107 // If the CIGAR string is absent, this tag provides the position of the other end of the alignment.
108 CharString extraPosTag;
109
110 // If enabled then all reads are treated as single-end mapped. This is required for analyzing SOAP output that has
111 // been converted to SAM since it will set the paired-end flags.
112 bool ignorePairedFlags;
113
114 // If enabled, don't panic on additional hits in non-weighted mode.
115 bool dontPanic;
116
117 // ------------------------------------------------------------------------
118 // Input / Output
119 // ------------------------------------------------------------------------
120
121 // Path to reference sequence file.
122 CharString referencePath;
123
124 // Path to input GSI file.
125 CharString inGsiPath;
126
127 // Exactly one of the following has to be set.
128 //
129 // Path to input SAM or BAM file.
130 CharString inBamPath;
131
132 // Path to output TSV stats file.
133 CharString outTsvPath;
134
135 // Whether to check sorting or not.
136 bool checkSorting;
137
138 // ------------------------------------------------------------------------
139 // Logging configuration.
140 // ------------------------------------------------------------------------
141
142 // Print the missed intervals to stderr for debugging purposes.
143 bool showMissedIntervals;
144
145 // Print superfluous intervals (intervals found in BAM file but have too bad score).
146 bool showSuperflousIntervals;
147
148 // Print additional intervals (intervals found in BAM with good score that are not in WIT file).
149 bool showAdditionalIntervals;
150
151 // Print the hit intervals to stderr for debugging purposes.
152 bool showHitIntervals;
153
154 // Print each end position that we try to match agains the interval.
155 bool showTryHitIntervals;
156
RabemaEvaluationOptions()157 RabemaEvaluationOptions() :
158 verbosity(1),
159 maxError(0),
160 matchN(false),
161 oracleMode(false),
162 onlyUniqueReads(false),
163 benchmarkCategory(CATEGORY_ALL),
164 distanceMetric(EDIT_DISTANCE),
165 trustNM(false),
166 ignorePairedFlags(false),
167 dontPanic(false),
168 checkSorting(true),
169 // outPath("-"),
170 showMissedIntervals(false),
171 showSuperflousIntervals(false),
172 showAdditionalIntervals(false),
173 showHitIntervals(false),
174 showTryHitIntervals(false)
175 {}
176 };
177
178 // ----------------------------------------------------------------------------
179 // Helper Class CmpGsiRecordLowering.
180 // ----------------------------------------------------------------------------
181
182 // Comparison functor for lexicographically sorting by (readId, contigId, first pos).
183
184 struct CmpGsiRecordLowering
185 {
operator ()CmpGsiRecordLowering186 bool operator()(GsiRecord const & lhs, GsiRecord const & rhs) const
187 {
188 return (lhs.readId < rhs.readId) || (lhs.readId == rhs.readId && lhs.contigId < rhs.contigId) ||
189 (lhs.readId == rhs.readId && lhs.contigId == rhs.contigId && lhs.firstPos < rhs.firstPos) ||
190 (lhs.readId == rhs.readId && lhs.contigId == rhs.contigId && lhs.firstPos == rhs.firstPos &&
191 lhs.lastPos > rhs.lastPos) ||
192 (lhs.readId == rhs.readId && lhs.contigId == rhs.contigId && lhs.firstPos == rhs.firstPos &&
193 lhs.lastPos == rhs.lastPos && lhs.distance > rhs.distance);
194 }
195
196 };
197
198 // ============================================================================
199 // Metafunctions
200 // ============================================================================
201
202 // ============================================================================
203 // Functions
204 // ============================================================================
205
206 // ---------------------------------------------------------------------------
207 // Function categoryName()
208 // ---------------------------------------------------------------------------
209
categoryName(BenchmarkCategory cat)210 char const * categoryName(BenchmarkCategory cat)
211 {
212 if (cat == CATEGORY_ALL)
213 return "all";
214 else if (cat == CATEGORY_ALL_BEST)
215 return "all-best";
216 else
217 return "any-best";
218 }
219
220 // ---------------------------------------------------------------------------
221 // Function metricName()
222 // ---------------------------------------------------------------------------
223
metricName(DistanceMetric met)224 char const * metricName(DistanceMetric met)
225 {
226 if (met == EDIT_DISTANCE)
227 return "edit";
228 else // if (met == HAMMING_DISTANCE)
229 return "hamming";
230 }
231
232 // ---------------------------------------------------------------------------
233 // Function yesNo()
234 // ---------------------------------------------------------------------------
235
yesNo(bool b)236 char const * yesNo(bool b)
237 {
238 if (b)
239 return "yes";
240 else
241 return "no";
242 }
243
244 // ----------------------------------------------------------------------------
245 // Function performIntervalLowering()
246 // ----------------------------------------------------------------------------
247
248 // Relabel intervals with the smallest distance of a contained interval. Filter out intervals with distance > maxError.
249
performIntervalLowering(String<GsiRecord> & gsiRecords,int maxError)250 void performIntervalLowering(String<GsiRecord> & gsiRecords, int maxError)
251 {
252 if (empty(gsiRecords))
253 return;
254
255 typedef Iterator<String<GsiRecord> >::Type TIterator;
256 typedef IntervalAndCargo<unsigned, unsigned> TInterval;
257
258 // Step 1: Adjust distances.
259 std::sort(begin(gsiRecords, Standard()), end(gsiRecords, Standard()), CmpGsiRecordLowering());
260
261 // Add sentinel interval.
262 GsiRecord sentinel(back(gsiRecords));
263 sentinel.firstPos = std::numeric_limits<size_t>::max();
264 sentinel.lastPos = std::numeric_limits<size_t>::max();
265 // sentinel.id = std::numeric_limits<size_t>::max();
266 appendValue(gsiRecords, sentinel);
267
268 String<TInterval> openIntervals;
269 unsigned i = 0;
270 for (TIterator it = begin(gsiRecords, Standard()), itEnd = end(gsiRecords, Standard()); it != itEnd; ++it, ++i)
271 {
272 unsigned count = 0;
273 for (unsigned j = 0; j < length(openIntervals); ++j)
274 {
275 unsigned idx = length(openIntervals) - 1 - j;
276 GsiRecord const thisIntervalRecord = gsiRecords[cargo(openIntervals[idx])];
277 SEQAN_ASSERT_EQ(thisIntervalRecord.readName, it->readName);
278 if (thisIntervalRecord.contigId != it->contigId || thisIntervalRecord.lastPos < it->firstPos)
279 count += 1;
280 }
281 resize(openIntervals, length(openIntervals) - count);
282
283 // Perform distance lowering for containing intervals.
284 // std::cerr << "OPEN INTERVALS\n";
285 // for (unsigned j = 0; j < length(openIntervals); ++j)
286 // std::cerr << " " << gsiRecords[cargo(openIntervals[j])] << "\t" << gsiRecords[cargo(openIntervals[j])].originalDistance << "\n";
287 // std::cerr << "\n";
288 for (unsigned j = 0; j < length(openIntervals); ++j)
289 {
290 unsigned idx = j; //length(openIntervals) - 1 - j;
291 unsigned id = cargo(openIntervals[idx]);
292 if (gsiRecords[id].distance <= maxError)
293 gsiRecords[id].distance = _min(gsiRecords[id].distance, it->distance);
294 else
295 break; // All containing intervals must have a greater distance.
296 }
297
298 appendValue(openIntervals, TInterval(it->firstPos, it->lastPos + 1, i));
299 }
300
301 // Step 2: Filter out intervals that are contained in intervals of lesser/equal distance.
302 String<GsiRecord> filteredGsiRecords;
303 clear(openIntervals);
304 i = 0;
305 for (TIterator it = begin(gsiRecords, Standard()), itend = end(gsiRecords, Standard()); it != itend; ++it, ++i)
306 {
307 // Remove non-overlapping intervals on top of openInterval stack, appending to filtered intervals
308 unsigned count = 0;
309 for (unsigned j = 0; j < length(openIntervals); ++j)
310 {
311 unsigned idx = length(openIntervals) - 1 - j;
312 GsiRecord const & thisIntervalRecord = gsiRecords[cargo(openIntervals[idx])];
313 SEQAN_ASSERT_EQ(thisIntervalRecord.readName, it->readName);
314 if (thisIntervalRecord.contigId != it->contigId || thisIntervalRecord.lastPos < it->firstPos)
315 {
316 count += 1;
317 unsigned startDistance = gsiRecords[cargo(openIntervals[idx])].distance;
318 if (!empty(filteredGsiRecords))
319 {
320 if (back(filteredGsiRecords).lastPos >= leftBoundary(openIntervals[idx]))
321 {
322 if (back(filteredGsiRecords).contigId == gsiRecords[cargo(openIntervals[idx])].contigId)
323 {
324 // Assert current containing already written out.
325 SEQAN_ASSERT_GEQ(back(filteredGsiRecords).firstPos, leftBoundary(openIntervals[idx]));
326 SEQAN_ASSERT_LEQ(back(filteredGsiRecords).lastPos + 1, rightBoundary(openIntervals[idx]));
327 // Get start distance.
328 startDistance = back(filteredGsiRecords).distance + 1;
329 }
330 }
331 }
332 unsigned upperLimit = maxError;
333 if ((unsigned)maxError < startDistance)
334 upperLimit = startDistance;
335 for (unsigned i = startDistance; i <= upperLimit; ++i)
336 {
337 appendValue(filteredGsiRecords, gsiRecords[cargo(openIntervals[idx])]);
338 back(filteredGsiRecords).originalDistance = i;
339 }
340 }
341 }
342 resize(openIntervals, length(openIntervals) - count);
343
344 // Add interval to the stack of intervals.
345 if (empty(openIntervals) || gsiRecords[cargo(back(openIntervals))].distance > it->distance)
346 appendValue(openIntervals, TInterval(it->firstPos, it->lastPos + 1, i));
347 }
348 move(gsiRecords, filteredGsiRecords);
349 }
350
351 // ----------------------------------------------------------------------------
352 // Function benchmarkReadResult()
353 // ----------------------------------------------------------------------------
354
355 template <typename TPatternSpec>
benchmarkReadResult(RabemaStats & result,String<BamAlignmentRecord> const & samRecords,BamFileIn const & bamFileIn,String<GsiRecord> const & gsiRecords,FaiIndex const & faiIndex,StringSet<Dna5String> const & refSeqs,RefIdMapping const & refIdMapping,RabemaEvaluationOptions const & options,TPatternSpec const &,bool pairedEnd=false,bool second=false)356 int benchmarkReadResult(RabemaStats & result,
357 String<BamAlignmentRecord> const & samRecords,
358 BamFileIn const & bamFileIn,
359 String<GsiRecord> const & gsiRecords,
360 FaiIndex const & faiIndex,
361 StringSet<Dna5String> const & refSeqs,
362 RefIdMapping const & refIdMapping,
363 RabemaEvaluationOptions const & options,
364 TPatternSpec const & /*tagPattern*/,
365 bool pairedEnd = false,
366 bool second = false)
367 {
368 typedef IntervalAndCargo<unsigned, unsigned> TInterval;
369 #define DEBUG_RABEMA 0
370 #if DEBUG_RABEMA
371 std::cerr << ",--\n"
372 << "| num SAM\t" << length(samRecords) << "\n"
373 << "| num GSI\t" << length(gsiRecords) << "\n"
374 << "`--\n";
375 if (!empty(samRecords))
376 std::cerr << "SAM ID\t" << samRecords[0].qName << "\n";
377 if (!empty(gsiRecords))
378 std::cerr << "GSI ID\t" << gsiRecords[0].readName << "\n";
379 #endif // #if DEBUG_RABEMA
380
381 if (options.oracleMode && empty(gsiRecords))
382 {
383 // There are no GSI intervals in oracle mode. This can be the case if we constructed the gold standard
384 // with a maximal error rate. If this is the case then we ignore these reads.
385 return 0;
386 }
387
388 if (options.onlyUniqueReads && length(samRecords) != 1)
389 {
390 // The read was mapped non-uniquely by the mapper. Either it was not found or found at multiple locations
391 // However, to compute the precision of mapper we only evaluate whether reads mapped to a single location
392 // were correctly mapped.
393 return 0;
394 }
395
396 // Select gold standard intervals (GSI) records.
397 //
398 // We only select intervals that match the specification of pairedEnd, second. Also, the intervals must be for an
399 // error rate less than or equal to the maximal configured one. In case of any-best or all-best mode, we only
400 // select those of the eligible intervals with the lowest error rate.
401 //
402 // In case of oracle mode, we ignore the distance of the intervals in the GSI file here but use it later on.
403 //
404 // Start with picking the smallest distance if *-best mode.
405 int smallestDistance = options.oracleMode ? std::numeric_limits<int>::max() : options.maxError;
406 // Note that smallestDistance (as bestDistance defined below) is expressed as percent of read length ceiled
407 // and cat to an int value.
408 if (options.oracleMode || options.benchmarkCategory == CATEGORY_ANY_BEST ||
409 options.benchmarkCategory == CATEGORY_ALL_BEST)
410 for (unsigned i = 0; i < length(gsiRecords); ++i)
411 smallestDistance = std::min(smallestDistance, gsiRecords[i].distance);
412 int largestDistance = options.maxError;
413 if (options.oracleMode && smallestDistance != std::numeric_limits<int>::max())
414 for (unsigned i = 0; i < length(gsiRecords); ++i)
415 largestDistance = std::max(largestDistance, gsiRecords[i].distance);
416 String<GsiRecord> pickedGsiRecords;
417 #if DEBUG_RABEMA
418 std::cerr << "--\n";
419 #endif // #if DEBUG_RABEMA
420 for (unsigned i = 0; i < length(gsiRecords); ++i)
421 {
422 #if DEBUG_RABEMA
423 std::cerr << "ORIGINAL\t" << gsiRecords[i] << "\t" << gsiRecords[i].originalDistance << "\n";
424 #endif // DEBUG_RABEMA
425
426 // Note: In case of oracle mode, we ignore the distance.
427 // TODO(holtgrew): Remove the following two lines.
428 //if (!options.oracleMode && gsiRecords[i].distance > smallestDistance)
429 // continue; // Skip with wrong distance.
430 if (!options.oracleMode && gsiRecords[i].distance > options.maxError)
431 continue; // Ignore intervals with too high error rate.
432 if (!pairedEnd && (gsiRecords[i].flags & GsiRecord::FLAG_PAIRED))
433 continue; // Skip paired if non-paired selected.
434 if (pairedEnd && !(gsiRecords[i].flags & GsiRecord::FLAG_PAIRED))
435 continue; // Skip non-paired if paired selected.
436 if (pairedEnd && second && !(gsiRecords[i].flags & GsiRecord::FLAG_SECOND_MATE))
437 continue; // Skip if second selected but this interval is not for second.
438
439 appendValue(pickedGsiRecords, gsiRecords[i]);
440 #if DEBUG_RABEMA
441 std::cerr << "PICKED\t" << gsiRecords[i] << "\t" << gsiRecords[i].originalDistance << "\n";
442 #endif // DEBUG_RABEMA
443
444 // Get index of the sequence from GSI record contig name.
445 if (!getIdByName(back(pickedGsiRecords).contigId, faiIndex, back(pickedGsiRecords).contigName))
446 {
447 std::cerr << "ERROR: Could not find reference sequence for name "
448 << back(pickedGsiRecords).contigName << '\n';
449 return 1;
450 }
451 }
452 #if DEBUG_RABEMA
453 std::cerr << "--\n";
454 #endif // DEBUG_RABEMA
455
456 // On these selected GSI records, we now perform interval lowering if not in oracle mode.. This means that an
457 // interval I with distance k_i containing an interval J with distance k_j < k_i is re-labeled with a distance of
458 // k_j for the smallest k_j of all contained intervals J. The original distance is stored in the originalDistance
459 // member.
460 if (!options.oracleMode)
461 performIntervalLowering(pickedGsiRecords, options.maxError);
462
463 #if DEBUG_RABEMA
464 for (unsigned i = 0; i < length(pickedGsiRecords); ++i)
465 std::cerr << "LOWERED\t" << pickedGsiRecords[i] << "\t" << pickedGsiRecords[i].originalDistance << "\n";
466 std::cerr << "--\n";
467 #endif // DEBUG_RABEMA
468
469 // Next we filter the lowered intervals to those with an original distance of options.maxError. In the case of
470 // all-best and any-best we also only keep those with the smallest distance of all intervals for this read.
471 String<GsiRecord> filteredGsiRecords;
472 for (unsigned i = 0; i < length(pickedGsiRecords); ++i)
473 {
474 GsiRecord const & record = pickedGsiRecords[i];
475 bool keep = (record.originalDistance == options.maxError);
476 if (!options.oracleMode && options.benchmarkCategory != CATEGORY_ALL)
477 keep = keep && (record.distance == smallestDistance);
478 if (options.oracleMode && record.originalDistance <= largestDistance)
479 keep = true;
480 if (keep)
481 appendValue(filteredGsiRecords, record);
482 }
483
484 // Build string of intervals for each reference sequence from these lowered and filtered records.
485 String<String<TInterval> > intervals;
486 resize(intervals, length(refSeqs));
487 String<unsigned> numIntervalsForErrorRate;
488 resize(numIntervalsForErrorRate, options.maxError + 1, 0);
489 String<int> intervalDistances; // Distance of interval i.
490 unsigned numIntervals = 0;
491 for (unsigned i = 0; i < length(filteredGsiRecords); ++i)
492 {
493 #if DEBUG_RABEMA
494 std::cerr << "USED\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
495 #endif // #if DEBUG_RABEMA
496 int distance = filteredGsiRecords[i].distance;
497
498 appendValue(intervals[filteredGsiRecords[i].contigId],
499 TInterval(filteredGsiRecords[i].firstPos, filteredGsiRecords[i].lastPos + 1,
500 length(intervalDistances)));
501 appendValue(intervalDistances, distance);
502 numIntervals += 1;
503 if (distance >= (int)length(numIntervalsForErrorRate))
504 resize(numIntervalsForErrorRate, distance + 1, 0);
505 if (!options.oracleMode && options.benchmarkCategory != CATEGORY_ANY_BEST)
506 numIntervalsForErrorRate[distance] += 1;
507 }
508 if (options.benchmarkCategory == CATEGORY_ANY_BEST && !empty(filteredGsiRecords))
509 numIntervalsForErrorRate[smallestDistance] += 1;
510 // Marker array that states whether an interval was hit.
511 String<bool> intervalHit;
512 resize(intervalHit, length(intervalDistances), false);
513
514 // Build interval trees.
515 String<IntervalTree<unsigned> > intervalTrees;
516 resize(intervalTrees, length(refSeqs));
517 for (unsigned i = 0; i < length(intervalTrees); ++i)
518 createIntervalTree(intervalTrees[i], intervals[i]);
519
520 // One of the SAM records must have a non-empty SEQ field, extract the read seq.
521 Dna5String readSeqL, readSeqR;
522 bool seenL = false, seenR = false;
523 for (unsigned i = 0; i < length(samRecords); ++i)
524 {
525 seenL |= hasFlagFirst(samRecords[i]) || (!hasFlagFirst(samRecords[i]) && !hasFlagLast(samRecords[i]));
526 seenR |= hasFlagLast(samRecords[i]);
527 if ((hasFlagFirst(samRecords[i]) || (!hasFlagFirst(samRecords[i]) && !hasFlagLast(samRecords[i]))) &&
528 !empty(samRecords[i].seq))
529 {
530 readSeqL = samRecords[i].seq;
531 if (hasFlagRC(samRecords[i]))
532 reverseComplement(readSeqL);
533 }
534 else if (hasFlagLast(samRecords[i]) && !empty(samRecords[i].seq))
535 {
536 readSeqR = samRecords[i].seq;
537 if (hasFlagRC(samRecords[i]))
538 reverseComplement(readSeqR);
539 }
540 if (!empty(readSeqL) && !empty(readSeqR))
541 break; // Short-circuit and break.
542 }
543 if (seenL && empty(readSeqL))
544 {
545 std::cerr << "ERROR: No alignment for query " << front(samRecords).qName << " (left-end)\n";
546 return 1;
547 }
548 if (seenR && empty(readSeqR))
549 {
550 std::cerr << "ERROR: No alignment for query " << front(samRecords).qName << " (right-end)\n";
551 return 1;
552 }
553
554 Dna5String readSeq;
555 Dna5String contigSeq;
556 String<unsigned> queryResult;
557
558 // Try to hit intervals.
559 bool mappedAny = false; // Whether any valid alignment was found.
560 for (unsigned i = 0; i < length(samRecords); ++i)
561 {
562 BamAlignmentRecord const & samRecord = samRecords[i];
563 int seqId = refIdMapping.map[samRecord.rID];
564
565 // Compute actual alignment score to rule out invalidly reported alignments.
566 //
567 // If we run in oracle mode then we ignore the actual alignment score and use the best alignment. We only care
568 // about the alignment's end position in this case.
569 if (hasFlagLast(samRecord))
570 readSeq = readSeqR;
571 else
572 readSeq = readSeqL;
573
574 // TODO(holtgrew): Remove const cast once we have const holders!
575 BamTagsDict bamTags(const_cast<CharString &>(samRecord.tags));
576 unsigned idx = 0;
577
578 // If the CIGAR is not present, e.g. a * is present instead,
579 // we can still get the extra position from an user-defined tag.
580 unsigned endPos = samRecord.beginPos + getAlignmentLengthInRef(samRecord) - countPaddings(samRecord.cigar);
581 if (empty(samRecord.cigar))
582 {
583 if (empty(options.extraPosTag) ||
584 !(findTagKey(idx, bamTags, options.extraPosTag) && extractTagValue(endPos, bamTags, idx)))
585 {
586 // Simply try to guess the end position.
587 endPos = samRecord.beginPos + length(readSeq) + 1;
588 std::cerr << "WARNING: Unknown alignment end position for read " << samRecord.qName << ".\n";
589 }
590 // The extra position tag must be 1-based.
591 SEQAN_ASSERT_GT(endPos, 0u);
592 endPos--;
593 }
594
595 int bestDistance = std::numeric_limits<int>::min(); // Marker for "not set yet".
596 // Note that bestDistance expresses the distance in percent error, relative to the read length, ceiled up
597 // and converted to an int value.
598 if (!options.oracleMode)
599 {
600 // Get best distance from NM tag if set and we are to trust it.
601 if (options.trustNM && findTagKey(idx, bamTags, "NM") && extractTagValue(bestDistance, bamTags, idx))
602 {
603 // Convert from count to rate.
604 bestDistance = static_cast<int>(ceil(100.0 * bestDistance / length(readSeq)));
605 }
606 // Otherwise, perform a realignment.
607 else
608 {
609 unsigned bandwidth = static_cast<int>(ceil(0.01 * options.maxError * length(readSeq)));
610
611 unsigned intervalBegin = samRecord.beginPos;
612 if (intervalBegin > bandwidth)
613 intervalBegin -= bandwidth;
614
615 unsigned intervalEnd = endPos;
616 if (intervalEnd < length(refSeqs[seqId]) - 2 * bandwidth)
617 intervalEnd += 2 * bandwidth;
618
619 contigSeq = infix(refSeqs[seqId], intervalBegin, intervalEnd);
620 if (hasFlagRC(samRecord))
621 reverseComplement(contigSeq);
622 Finder<Dna5String> finder(contigSeq);
623 Pattern<Dna5String, TPatternSpec> pattern(readSeq, -static_cast<int>(length(readSeq)) * 1000);
624 _patternMatchNOfPattern(pattern, options.matchN);
625 _patternMatchNOfFinder(pattern, options.matchN);
626 bool ret = setEndPosition(finder, pattern, length(contigSeq) - bandwidth);
627 ignoreUnusedVariableWarning(ret);
628 // write2(std::cerr, samRecord, bamIOContext, Sam());
629 SEQAN_CHECK(ret, "setEndPosition() must not fail!");
630 bestDistance = static_cast<int>(ceil(-100.0 * getScore(pattern) / length(readSeq)));
631 }
632 }
633
634 // Get sequence id and last position of alignment. We try to hit the interval with the last position (not
635 // C-style end) of the read.
636 unsigned lastPos = hasFlagRC(samRecord) ? length(refSeqs[seqId]) - samRecord.beginPos - 1 : endPos - 1;
637
638 if (options.showTryHitIntervals)
639 std::cerr << "TRY HIT\tchr=" << sequenceName(faiIndex, seqId) << "\tlastPos=" << lastPos << "\tqName="
640 << samRecord.qName << "\n";
641
642 // Try to hit any interval.
643 clear(queryResult);
644 findIntervals(queryResult, intervalTrees[seqId], lastPos);
645 mappedAny = mappedAny || !empty(queryResult);
646 #if DEBUG_RABEMA
647 if (mappedAny)
648 std::cout << "MAPPED\t" << front(samRecords).qName << "\n";
649 #endif // #if DEBUG_RABEMA
650 if (!empty(queryResult))
651 {
652 for (unsigned i = 0; i < length(queryResult); ++i)
653 intervalHit[queryResult[i]] = true;
654 }
655 else if (bestDistance != std::numeric_limits<int>::min())
656 {
657 // && bestDistance <= options.maxError)
658
659 // We now have a hit with a distance below the maximal configured error. This is a candidate for an
660 // additional hit. In case of all-best and any-best, this is only one if its distance is better than the
661 // best distance for this read. In the case of all, it is one if its distance is less than the allowed
662 // maximal distance. If it is not an additional hit then it is an invalid one.
663 //
664 // Note that all distances including allowedDistance are percent of read length, ceiled up.
665 int allowedDistance = options.maxError;
666 if ((options.benchmarkCategory == CATEGORY_ALL_BEST || options.benchmarkCategory == CATEGORY_ANY_BEST) &&
667 (smallestDistance != std::numeric_limits<int>::max()))
668 allowedDistance = smallestDistance;
669 if (bestDistance > allowedDistance)
670 {
671 if (options.showSuperflousIntervals)
672 {
673 std::cerr << "SUPERFLOUS/INVALID\t";
674 DirectionIterator<std::ostream, Output>::Type cerrIt = directionIterator(std::cerr, Output());
675 write(cerrIt, samRecord, context(bamFileIn), Sam());
676 std::cerr << " DISTANCE: \t" << bestDistance << '\n'
677 << " ALLOWED DISTANCE:\t" << options.maxError << '\n';
678 }
679 result.invalidAlignments += 1;
680 continue;
681 }
682
683 // We found an additional hit.
684 if (options.showAdditionalIntervals || !options.dontPanic)
685 {
686 std::cerr << "ADDITIONAL HIT\t";
687 DirectionIterator<std::ostream, Output>::Type cerrIt = directionIterator(std::cerr, Output());
688 write(cerrIt, samRecord, context(bamFileIn), Sam());
689 std::cerr << '\n';
690
691 for (unsigned i = 0; i < length(filteredGsiRecords); ++i)
692 std::cerr << "FILTERED GSI RECORD\t" << filteredGsiRecords[i] << "\n";
693 }
694
695 if (!options.dontPanic)
696 {
697 std::cerr << "ERROR: Found an additional hit for read " << samRecord.qName << "!\n";
698 return 1;
699 }
700 std::cerr << "WARNING: Found an additional hit for read " << samRecord.qName << ".\n";
701 }
702 }
703
704 // Compute number of found intervals.
705 unsigned numFound = 0;
706 String<unsigned> foundIntervalsForErrorRate;
707 if ((int)length(foundIntervalsForErrorRate) <= largestDistance + 1)
708 resize(foundIntervalsForErrorRate, largestDistance + 1, 0);
709 if (options.oracleMode || options.benchmarkCategory == CATEGORY_ANY_BEST)
710 {
711 int bestDistance = std::numeric_limits<int>::max();
712 int bestIdx = 0;
713 for (unsigned i = 0; i < length(intervalDistances); ++i)
714 if (intervalHit[i])
715 {
716 if (options.showHitIntervals)
717 std::cerr << "HIT\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
718 if (bestDistance > intervalDistances[i])
719 bestIdx = i;
720 bestDistance = std::min(bestDistance, intervalDistances[i]);
721 }
722 if (bestDistance != std::numeric_limits<int>::max())
723 {
724 if (options.showHitIntervals)
725 std::cerr << "HIT_BEST\t" << filteredGsiRecords[bestIdx] << "\t" << filteredGsiRecords[bestIdx].originalDistance << "\n";
726 numFound += 1;
727 foundIntervalsForErrorRate[bestDistance] += 1;
728 }
729 if (!mappedAny && options.showMissedIntervals)
730 {
731 for (unsigned i = 0; i < length(filteredGsiRecords); ++i)
732 std::cerr << "MISSED\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
733 }
734 }
735 else // !options.oracleMode && options.benchmarkCategory in ["all-best", "all"]
736 {
737 for (unsigned i = 0; i < length(intervalDistances); ++i)
738 {
739 // if (options.benchmarkCategory == CATEGORY_ALL && intervalDistances[i] != options.maxError)
740 // continue; // Only count intervals on our maximal error rate in "all" mode.
741 if (intervalHit[i])
742 {
743 if (options.showHitIntervals)
744 std::cerr << "HIT\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
745 numFound += 1;
746 foundIntervalsForErrorRate[intervalDistances[i]] += 1;
747 }
748 else
749 {
750 if (options.showMissedIntervals) // inside braces for consistency with above
751 std::cerr << "MISSED\t" << filteredGsiRecords[i] << "\n";
752 }
753 }
754 SEQAN_ASSERT_LEQ(numFound, length(intervalDistances));
755 }
756
757 // Update the resulting RabemaStats.
758 updateMaximalErrorRate(result, largestDistance);
759 result.totalReads += 1;
760 result.readsInGsi += (numIntervals > 0u);
761 result.mappedReads += mappedAny;
762 if (options.oracleMode)
763 {
764 bool found = (numFound > 0u);
765 result.intervalsToFind += 1;
766 result.intervalsFound += found;
767 result.normalizedIntervals += found;
768 int d = (smallestDistance == std::numeric_limits<int>::max()) ? 0 : smallestDistance;
769 result.intervalsToFindForErrorRate[d] += 1;
770 result.intervalsFoundForErrorRate[d] += found;
771 result.normalizedIntervalsToFindForErrorRate[d] += 1;
772 result.normalizedIntervalsFoundForErrorRate[d] += found;
773 }
774 else if (options.benchmarkCategory == CATEGORY_ANY_BEST)
775 {
776 int d = (smallestDistance == std::numeric_limits<int>::max()) ? 0 : smallestDistance;
777 bool toFind = (numIntervalsForErrorRate[d] > 0u);
778 bool found = (foundIntervalsForErrorRate[d] > 0u);
779 SEQAN_ASSERT_LEQ(found, toFind);
780 result.intervalsToFind += toFind;
781 result.intervalsFound += found;
782 result.normalizedIntervals += found;
783 result.intervalsToFindForErrorRate[d] += toFind;
784 result.intervalsFoundForErrorRate[d] += found;
785 result.normalizedIntervalsToFindForErrorRate[d] += toFind;
786 result.normalizedIntervalsFoundForErrorRate[d] += found;
787 }
788 else // all-best or all was selected
789 {
790 unsigned intervalsToFind = 0;
791 unsigned intervalsFound = 0;
792 for (unsigned d = 0; d < length(numIntervalsForErrorRate); ++d)
793 {
794 intervalsToFind += numIntervalsForErrorRate[d];
795 intervalsFound += foundIntervalsForErrorRate[d];
796 result.intervalsToFindForErrorRate[d] += numIntervalsForErrorRate[d];
797 result.intervalsFoundForErrorRate[d] += foundIntervalsForErrorRate[d];
798 }
799 result.intervalsToFind += intervalsToFind;
800 result.intervalsFound += intervalsFound;
801 if (intervalsToFind > 0u)
802 result.normalizedIntervals += 1.0 * intervalsFound / intervalsToFind;
803 for (unsigned d = 0; d < length(numIntervalsForErrorRate); ++d)
804 {
805 if (intervalsToFind == 0u)
806 continue;
807 // In case of all-best we only count those with the best error rate for this read.
808 if (options.benchmarkCategory == CATEGORY_ALL)
809 {
810 result.normalizedIntervalsToFindForErrorRate[d] += 1.0 * numIntervalsForErrorRate[d] / intervalsToFind;
811 result.normalizedIntervalsFoundForErrorRate[d] += 1.0 * foundIntervalsForErrorRate[d] / intervalsToFind;
812 }
813 else if (options.benchmarkCategory == CATEGORY_ALL_BEST && (int)d == smallestDistance)
814 {
815 result.normalizedIntervalsToFindForErrorRate[d] += 1;
816 result.normalizedIntervalsFoundForErrorRate[d] += 1.0 * foundIntervalsForErrorRate[d] / intervalsToFind;
817 }
818 }
819 }
820
821 return 0;
822 }
823
824 // ----------------------------------------------------------------------------
825 // Function clearPairedFlags()
826 // ----------------------------------------------------------------------------
827
clearPairedFlags(seqan::BamAlignmentRecord & record)828 void clearPairedFlags(seqan::BamAlignmentRecord & record)
829 {
830 if (hasFlagMultiple(record))
831 record.flag = record.flag ^ seqan::BAM_FLAG_MULTIPLE;
832 if (hasFlagFirst(record))
833 record.flag = record.flag ^ seqan::BAM_FLAG_FIRST;
834 if (hasFlagLast(record))
835 record.flag = record.flag ^ seqan::BAM_FLAG_LAST;
836 if (hasFlagNextRC(record))
837 record.flag = record.flag ^ seqan::BAM_FLAG_NEXT_RC;
838 if (hasFlagNextUnmapped(record))
839 record.flag = record.flag ^ seqan::BAM_FLAG_NEXT_UNMAPPED;
840 }
841
842 // ----------------------------------------------------------------------------
843 // Function compareAlignedReadsToReference()
844 // ----------------------------------------------------------------------------
845
846 // Stream over both the SAM/BAM and GSI file and compare the hits in the SAM/BAM file against the intervals in the GSI
847 // file.
848 //
849 // Both the SAM/BAM file and the GSI file have to be sorted by queryname for this to work.
850
851 template <typename TForwardIter, typename TPatternSpec>
852 int
compareAlignedReadsToReference(RabemaStats & result,BamFileIn & bamFileIn,FaiIndex const & faiIndex,StringSet<Dna5String> const & refSeqs,TForwardIter & gsiIter,RabemaEvaluationOptions const & options,TPatternSpec const & tagPattern)853 compareAlignedReadsToReference(RabemaStats & result,
854 BamFileIn & bamFileIn,
855 FaiIndex const & faiIndex,
856 StringSet<Dna5String> const & refSeqs,
857 TForwardIter & gsiIter,
858 RabemaEvaluationOptions const & options,
859 TPatternSpec const & tagPattern)
860 {
861 // Mapping between ref IDs from SAM/BAM file and reference sequence (from SAM/BAM file to reference sequences).
862 RefIdMapping refIdMapping;
863 rebuildMapping(refIdMapping, faiIndex.seqNameStore, faiIndex.seqNameStoreCache,
864 contigNames(context(bamFileIn)));
865
866 // Read in initial SAM/GSI records.
867 BamAlignmentRecord samRecord;
868 if (!atEnd(bamFileIn))
869 try
870 {
871 readRecord(samRecord, bamFileIn);
872 }
873 catch (seqan::ParseError const & ioErr)
874 {
875 std::cerr << "ERROR: Could not read first SAM/BAM record.\n";
876 return 1;
877 }
878 if (options.ignorePairedFlags)
879 clearPairedFlags(samRecord);
880 GsiRecord gsiRecord;
881 if (!atEnd(gsiIter))
882 try
883 {
884 readRecord(gsiRecord, gsiIter, Gsi());
885 }
886 catch (seqan::ParseError const & ioErr)
887 {
888 std::cerr << "ERROR: Could not read first GSI record.\n";
889 return 1;
890 }
891
892 // Current SAM/BAM and GSI records are stored in these arrays.
893 String<BamAlignmentRecord> currentSamRecords;
894 String<GsiRecord> currentGsiRecords;
895
896 // These flags store whether we processed the last SAM/BAM and GSI record.
897 bool samDone = false, gsiDone = false;
898
899 // The main loop: We walk over both the SAM/BAM and GSI records.
900 // unsigned chunkI = 0;
901 std::cerr << "Each dot corresponds to 10k processed reads.\n"
902 << "\n"
903 << "Progress: ";
904 unsigned i = 0;
905 while (!samDone || !gsiDone)
906 {
907 if (i > 0u && i % (100 * 1000) == 0u)
908 std::cerr << i / 100 / 1000 << "00k";
909 else if (i > 0 && i % (10 * 1000) == 0u)
910 std::cerr << '.';
911 ++i;
912
913 // We process the record for the next query/read. Since records for this next query/read might be missing in
914 // both files, we need to determine which is the next one.
915 CharString currentReadName;
916 if (gsiDone)
917 currentReadName = samRecord.qName;
918 else if (samDone)
919 currentReadName = gsiRecord.readName;
920 else
921 currentReadName = lessThanSamtoolsQueryName(gsiRecord.readName, samRecord.qName) ?
922 gsiRecord.readName : samRecord.qName;
923
924 // These flags determine whether evaluation is run for single-end and/or paired-end reads.
925 bool seenSingleEnd = false, seenPairedEnd = false;
926
927 // Read all SAM/BAM records with the same query name.
928 clear(currentSamRecords);
929 while (!samDone && samRecord.qName == currentReadName)
930 {
931 if (!hasFlagUnmapped(samRecord)) // Ignore records with non-aligned reads.
932 {
933 seenSingleEnd |= !hasFlagMultiple(samRecord);
934 seenPairedEnd |= hasFlagMultiple(samRecord);
935 appendValue(currentSamRecords, samRecord);
936 }
937 if (atEnd(bamFileIn))
938 {
939 // At end of SAM/BAM File, do not read next one.
940 samDone = true;
941 continue;
942 }
943 try
944 {
945 readRecord(samRecord, bamFileIn);
946 }
947 catch (seqan::ParseError const & ioErr)
948 {
949 std::cerr << "ERROR: Could not read SAM/BAM record.\n";
950 return 1;
951 }
952 if (options.ignorePairedFlags)
953 clearPairedFlags(samRecord);
954 if (options.checkSorting && lessThanSamtoolsQueryName(samRecord.qName, currentReadName))
955 {
956 std::cerr << "ERROR: Wrong order in SAM/BAM file: " << samRecord.qName << " succeeds "
957 << currentReadName << " in file.\n"
958 << "File must be sorted by read name/queryname.\n";
959 return 1;
960 }
961 // Rebuild ref ID mapping if we discovered a new reference sequence.
962 if (length(contigNames(context(bamFileIn))) != length(refIdMapping))
963 rebuildMapping(refIdMapping, faiIndex.seqNameStore, faiIndex.seqNameStoreCache,
964 contigNames(context(bamFileIn)));
965 }
966
967 // Read in the next block of GSI records.
968 clear(currentGsiRecords);
969 while (!gsiDone && gsiRecord.readName == currentReadName)
970 {
971 seenSingleEnd |= !(gsiRecord.flags & GsiRecord::FLAG_PAIRED);
972 seenPairedEnd |= (gsiRecord.flags & GsiRecord::FLAG_PAIRED);
973 appendValue(currentGsiRecords, gsiRecord);
974 if (atEnd(gsiIter))
975 {
976 // At end of GSI File, do not read next one.
977 gsiDone = true;
978 continue;
979 }
980 try
981 {
982 readRecord(gsiRecord, gsiIter, Gsi());
983 }
984 catch (seqan::ParseError const & ioErr)
985 {
986 std::cerr << "ERROR: Could not read GSI record.\n";
987 return 1;
988 }
989 if (options.checkSorting && lessThanSamtoolsQueryName(gsiRecord.readName, currentReadName))
990 {
991 std::cerr << "ERROR: Wrong order in GSI file: " << gsiRecord.readName << " succeeds "
992 << currentReadName << " in file.\n"
993 << "File must be sorted by read name/queryname.\n";
994 return 1;
995 }
996 }
997
998 // Now, compare the SAM/BAM records against the intervals stored in the GSI records.
999 //
1000 // We collected the records for all queries. Here, we differentiate between the different cases.
1001 if (seenSingleEnd)
1002 {
1003 int res = benchmarkReadResult(result, currentSamRecords, bamFileIn, currentGsiRecords,
1004 faiIndex, refSeqs, refIdMapping, options, tagPattern,
1005 /*pairedEnd=*/ false);
1006 if (res != 0)
1007 return 1;
1008 }
1009 if (seenPairedEnd)
1010 {
1011 int res = benchmarkReadResult(result, currentSamRecords, bamFileIn, currentGsiRecords,
1012 faiIndex, refSeqs, refIdMapping, options, tagPattern,
1013 /*pairedEnd=*/ true, /*second=*/ false);
1014 if (res != 0)
1015 return 1;
1016
1017 res = benchmarkReadResult(result, currentSamRecords, bamFileIn, currentGsiRecords,
1018 faiIndex, refSeqs, refIdMapping, options, tagPattern,
1019 /*pairedEnd=*/ true, /*second=*/ true);
1020 if (res != 0)
1021 return 1;
1022 }
1023 }
1024 std::cerr << " DONE\n";
1025
1026 return 0;
1027 }
1028
1029 // ---------------------------------------------------------------------------
1030 // Function parseCommandLine()
1031 // ---------------------------------------------------------------------------
1032
1033 seqan::ArgumentParser::ParseResult
parseCommandLine(RabemaEvaluationOptions & options,int argc,char const ** argv)1034 parseCommandLine(RabemaEvaluationOptions & options, int argc, char const ** argv)
1035 {
1036 // -----------------------------------------------------------------------
1037 // Parse Command Line Using ArgumentParser
1038 // -----------------------------------------------------------------------
1039
1040 seqan::ArgumentParser parser("rabema_evaluate");
1041 setShortDescription(parser, "RABEMA Evaluation");
1042 setVersion(parser, SEQAN_APP_VERSION " [" SEQAN_REVISION "]");
1043 setDate(parser, SEQAN_DATE);
1044 setCategory(parser, "Benchmarking");
1045
1046 addUsageLine(parser,
1047 "[\\fIOPTIONS\\fP] \\fB--reference\\fP \\fIREF.fa\\fP \\fB--in-gsi\\fP \\fIIN.gsi\\fP "
1048 "\\fB--in-bam\\fP \\fIMAPPING.{sam,bam}\\fP");
1049 addDescription(parser,
1050 "Compare the SAM/bam output \\fIMAPPING.sam\\fP/\\fIMAPPING.bam\\fP of any read mapper against "
1051 "the RABEMA gold standard previously built with \\fBrabema_build_gold_standard\\fP. The input "
1052 "is a reference FASTA file, a gold standard interval (GSI) file and the SAM/BAM input to "
1053 "evaluate.");
1054 addDescription(parser,
1055 "The input SAM/BAM file must be \\fIsorted by queryname\\fP. The program will create a "
1056 "FASTA index file \\fIREF.fa.fai\\fP for fast random access to the reference.");
1057
1058 addOption(parser, seqan::ArgParseOption("v", "verbose", "Enable verbose output."));
1059 addOption(parser, seqan::ArgParseOption("vv", "very-verbose", "Enable even more verbose output."));
1060
1061 addSection(parser, "Input / Output");
1062 // addOption(parser, seqan::ArgParseOption("o", "out-gsi", "Path to write the resulting GSI file to.",
1063 // seqan::ArgParseArgument::STRING, false, "GSI"));
1064 // setRequired(parser, "out-gsi", true);
1065 addOption(parser, seqan::ArgParseOption("r", "reference", "Path to load reference FASTA from.",
1066 seqan::ArgParseArgument::INPUT_FILE, "FASTA"));
1067 setValidValues(parser, "reference", seqan::SeqFileIn::getFileExtensions());
1068 setRequired(parser, "reference", true);
1069 addOption(parser, seqan::ArgParseOption("g", "in-gsi",
1070 "Path to load gold standard intervals from. If compressed using gzip, "
1071 "the file will be decompressed on the fly.",
1072 seqan::ArgParseArgument::INPUT_FILE, "GSI"));
1073 setRequired(parser, "in-gsi", true);
1074 setValidValues(parser, "in-gsi", "gsi gsi.gz"); // GSI (Gold Standard Intervals) Format only.
1075
1076 addOption(parser, seqan::ArgParseOption("b", "in-bam", "Path to load the read mapper SAM or BAM output from.",
1077 seqan::ArgParseArgument::INPUT_FILE, "BAM"));
1078 setValidValues(parser, "in-bam", BamFileIn::getFileExtensions());
1079 setRequired(parser, "in-bam");
1080 addOption(parser, seqan::ArgParseOption("", "out-tsv", "Path to write the statistics to as TSV.",
1081 seqan::ArgParseArgument::OUTPUT_FILE, "TSV"));
1082 setValidValues(parser, "out-tsv", "rabema_report_tsv");
1083
1084 addOption(parser, seqan::ArgParseOption("", "dont-check-sorting",
1085 "Do not check sortedness (by name) of input SAM/BAM files. This is "
1086 "required if the reads are not sorted by name in the original FASTQ "
1087 "files. Files from the SRA and ENA generally are sorted."));
1088
1089 addSection(parser, "Benchmark Parameters");
1090 addOption(parser, seqan::ArgParseOption("", "oracle-mode",
1091 "Enable oracle mode. This is used for simulated data when the input "
1092 "GSI file gives exactly one position that is considered as the true "
1093 "sample position. For simulated data."));
1094 addOption(parser, seqan::ArgParseOption("", "only-unique-reads",
1095 "Consider only reads that a single alignment in the mapping result file. "
1096 "Useful for precision computation."));
1097 addOption(parser, seqan::ArgParseOption("", "match-N", "When set, N matches all characters without penalty."));
1098 addOption(parser, seqan::ArgParseOption("", "distance-metric",
1099 "Set distance metric. Valid values: hamming, edit. Default: edit.",
1100 seqan::ArgParseOption::STRING, "METRIC"));
1101 setValidValues(parser, "distance-metric", "hamming edit");
1102 setDefaultValue(parser, "distance-metric", "edit");
1103
1104 addOption(parser, seqan::ArgParseOption("e", "max-error",
1105 "Maximal error rate to build gold standard for in percent. This "
1106 "parameter is an integer and relative to the read length. "
1107 "The error rate is ignored in oracle mode, here the distance "
1108 "of the read at the sample position is taken, individually "
1109 "for each read. Default: 0",
1110 seqan::ArgParseArgument::INTEGER, "RATE"));
1111 setDefaultValue(parser, "max-error", 0);
1112
1113 addOption(parser, seqan::ArgParseOption("c", "benchmark-category",
1114 "Set benchmark category. One of {all, all-best, any-best. Default: all",
1115 seqan::ArgParseOption::STRING, "CAT"));
1116 setValidValues(parser, "benchmark-category", "all all-best any-best");
1117 setDefaultValue(parser, "benchmark-category", "all");
1118
1119 addOption(parser, seqan::ArgParseOption("", "trust-NM",
1120 "When set, we trust the alignment and distance from SAM/BAM file and no "
1121 "realignment is performed. Off by default."));
1122 addOption(parser, seqan::ArgParseOption("", "extra-pos-tag",
1123 "If the CIGAR string is absent, the missing alignment end position can be "
1124 "provided by this BAM tag.",
1125 seqan::ArgParseOption::STRING));
1126
1127 addOption(parser, seqan::ArgParseOption("", "ignore-paired-flags",
1128 "When set, we ignore all SAM/BAM flags related to pairing. This is "
1129 "necessary when analyzing SAM from SOAP's soap2sam.pl script."));
1130 addOption(parser, seqan::ArgParseOption("", "DONT-PANIC",
1131 "Do not stop program execution if an additional hit was found that "
1132 "indicates that the gold standard is incorrect."));
1133
1134 addSection(parser, "Logging");
1135 addText(parser, "");
1136 addText(parser,
1137 "The occurrence of \"invalid\" hits in the read mapper's output is not an error. If there are "
1138 "additional hits, however, this shows an error in the gold standard.");
1139 addOption(parser, seqan::ArgParseOption("", "show-missed-intervals",
1140 "Show details for each missed interval from the GSI."));
1141 addOption(parser, seqan::ArgParseOption("", "show-invalid-hits",
1142 "Show details for invalid hits (with too high error rate)."));
1143 addOption(parser, seqan::ArgParseOption("", "show-additional-hits",
1144 "Show details for additional hits (low enough error rate but not in "
1145 "gold standard."));
1146 addOption(parser, seqan::ArgParseOption("", "show-hits", "Show details for hit intervals."));
1147 addOption(parser, seqan::ArgParseOption("", "show-try-hit", "Show details for each alignment in SAM/BAM input."));
1148
1149 addTextSection(parser, "Return Values");
1150 addText(parser, "A return value of 0 indicates success, any other value indicates an error.");
1151
1152 // addTextSection(parser, "Examples");
1153
1154 // addListItem(parser,
1155 // "\\fBrabema_build_gold_standard\\fP \\fB-e\\fP \\fI4\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-i\\fP "
1156 // "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1157 // "Build gold standard from a SAM file \\fIIN.sam\\fP with all mapping locations and a FASTA "
1158 // "reference \\fIREF.fa\\fP to GSI file \\fIOUT.gsi\\fP with a maximal error rate of \\fI4\\fP.");
1159 // addListItem(parser,
1160 // "\\fBrabema_build_gold_standard\\fP \\fB--distance-metric\\fP \\fIedit\\fP \\fB-e\\fP \\fI4\\fP "
1161 // "\\fB-o\\fP \\fIOUT.gsi\\fP \\fB-i\\fP \\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1162 // "Same as above, but using Hamming instead of edit distance.");
1163 // addListItem(parser,
1164 // "\\fBrabema_build_gold_standard\\fP \\fB--oracle-mode\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-i\\fP "
1165 // "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1166 // "Build gold standard from a SAM file \\fIIN.sam\\fP with the original sample position, e.g. "
1167 // "as exported by read simulator Mason.");
1168
1169 addTextSection(parser, "Memory Requirements");
1170 addText(parser,
1171 "From version 1.1, great care has been taken to keep the memory requirements as low as possible.");
1172 addText(parser,
1173 "The evaluation step needs to store the whole reference sequence in memory but little more "
1174 "memory. So, for the human genome, the memory requirements are below 4 GB, regardless of "
1175 "the size of the GSI or SAM/BAM file.");
1176
1177 addTextSection(parser, "References");
1178 addText(parser,
1179 "M. Holtgrewe, A.-K. Emde, D. Weese and K. Reinert. A Novel And Well-Defined Benchmarking Method "
1180 "For Second Generation Read Mapping, BMC Bioinformatics 2011, 12:210.");
1181 addListItem(parser, "\\fIhttp://www.seqan.de/rabema\\fP", "RABEMA Homepage");
1182 addListItem(parser, "\\fIhttp://www.seqan.de/mason\\fP", "Mason Homepage");
1183
1184 // Actually do the parsing and exit on error, help display etc.
1185 seqan::ArgumentParser::ParseResult res = parse(parser, argc, argv);
1186 if (res != seqan::ArgumentParser::PARSE_OK)
1187 return res;
1188
1189 // -----------------------------------------------------------------------
1190 // Fill BuildGoldStandardOptions Object
1191 // -----------------------------------------------------------------------
1192
1193 if (isSet(parser, "verbose"))
1194 options.verbosity = 2;
1195 if (isSet(parser, "very-verbose"))
1196 options.verbosity = 3;
1197
1198 if (isSet(parser, "reference"))
1199 getOptionValue(options.referencePath, parser, "reference");
1200 if (isSet(parser, "in-bam"))
1201 getOptionValue(options.inBamPath, parser, "in-bam");
1202 if (isSet(parser, "in-gsi"))
1203 getOptionValue(options.inGsiPath, parser, "in-gsi");
1204 if (isSet(parser, "out-tsv"))
1205 getOptionValue(options.outTsvPath, parser, "out-tsv");
1206
1207 getOptionValue(options.maxError, parser, "max-error");
1208 options.matchN = isSet(parser, "match-N");
1209 options.oracleMode = isSet(parser, "oracle-mode");
1210 options.onlyUniqueReads = isSet(parser, "only-unique-reads");
1211 CharString benchmarkCategory;
1212 getOptionValue(benchmarkCategory, parser, "benchmark-category");
1213 if (benchmarkCategory == "all")
1214 options.benchmarkCategory = CATEGORY_ALL;
1215 else if (benchmarkCategory == "all-best")
1216 options.benchmarkCategory = CATEGORY_ALL_BEST;
1217 else // if (benchmarkCategory == "any-best")
1218 options.benchmarkCategory = CATEGORY_ANY_BEST;
1219 CharString distanceMetric;
1220 getOptionValue(distanceMetric, parser, "distance-metric");
1221 if (distanceMetric == "edit")
1222 options.distanceMetric = EDIT_DISTANCE;
1223 else
1224 options.distanceMetric = HAMMING_DISTANCE;
1225 options.trustNM = isSet(parser, "trust-NM");
1226 getOptionValue(options.extraPosTag, parser, "extra-pos-tag");
1227 options.ignorePairedFlags = isSet(parser, "ignore-paired-flags");
1228 options.dontPanic = isSet(parser, "DONT-PANIC");
1229
1230 getOptionValue(options.checkSorting, parser, "dont-check-sorting");
1231 options.checkSorting = !options.checkSorting;
1232
1233 options.showMissedIntervals = isSet(parser, "show-missed-intervals");
1234 options.showSuperflousIntervals = isSet(parser, "show-invalid-hits");
1235 options.showAdditionalIntervals = isSet(parser, "show-additional-hits");
1236 options.showHitIntervals = isSet(parser, "show-hits");
1237 options.showTryHitIntervals = isSet(parser, "show-try-hit");
1238
1239 return res;
1240 }
1241
1242 // ----------------------------------------------------------------------------
1243 // Function main()
1244 // ----------------------------------------------------------------------------
1245
main(int argc,char const ** argv)1246 int main(int argc, char const ** argv)
1247 {
1248 // Parse command line and store results in options.
1249 RabemaEvaluationOptions options;
1250 seqan::ArgumentParser::ParseResult parseRes = parseCommandLine(options, argc, argv);
1251 if (parseRes != seqan::ArgumentParser::PARSE_OK)
1252 return parseRes == seqan::ArgumentParser::PARSE_ERROR;
1253
1254 double startTime = 0; // For measuring time below.
1255
1256 std::cerr << "==============================================================================\n"
1257 << " RABEMA - Read Alignment BEnchMArk\n"
1258 << "==============================================================================\n"
1259 << " Result Comparison\n"
1260 << "==============================================================================\n"
1261 << "\n";
1262 std::cerr << "____OPTIONS___________________________________________________________________\n\n";
1263
1264 std::cerr << "Max error rate [%] " << options.maxError << "\n"
1265 << "Oracle mode " << yesNo(options.oracleMode) << '\n'
1266 << "Only unique reads " << yesNo(options.onlyUniqueReads) << '\n'
1267 << "Benchmark category " << categoryName(options.benchmarkCategory) << "\n"
1268 << "Distance measure " << metricName(options.distanceMetric) << "\n"
1269 << "Match Ns " << yesNo(options.matchN) << '\n'
1270 << "Trust NM tag " << yesNo(options.trustNM) << '\n'
1271 << "Ignore paired flags " << yesNo(options.ignorePairedFlags) << '\n'
1272 << "GSI File " << options.inGsiPath << '\n'
1273 << "BAM File " << options.inBamPath << '\n'
1274 << "Reference File " << options.referencePath << '\n'
1275 << "TSV Output File " << options.outTsvPath << '\n'
1276 << "Check Sorting " << yesNo(options.checkSorting) << '\n'
1277 << "Show\n"
1278 << " additional " << yesNo(options.showAdditionalIntervals) << '\n'
1279 << " hit " << yesNo(options.showHitIntervals) << '\n'
1280 << " missed " << yesNo(options.showMissedIntervals) << '\n'
1281 << " superfluous " << yesNo(options.showSuperflousIntervals) << '\n'
1282 << " try hit " << yesNo(options.showTryHitIntervals) << '\n'
1283 << "\n";
1284
1285 std::cerr << "____LOADING FILES_____________________________________________________________\n\n";
1286
1287 // =================================================================
1288 // Prepare File I/O.
1289 // =================================================================
1290
1291 startTime = sysTime();
1292 // Open reference FAI index.
1293 std::cerr << "Reference Index " << options.referencePath << ".fai ...";
1294 FaiIndex faiIndex;
1295 if (!open(faiIndex, toCString(options.referencePath)))
1296 {
1297 std::cerr << " FAILED (not fatal, we can just build it)\n";
1298 std::cerr << "Building Index " << options.referencePath << ".fai ...";
1299 if (!build(faiIndex, toCString(options.referencePath)))
1300 {
1301 std::cerr << "Could not build FAI index.\n";
1302 return 1;
1303 }
1304 std::cerr << " OK\n";
1305 seqan::CharString faiPath = options.referencePath;
1306 append(faiPath, ".fai");
1307 std::cerr << "Reference Index " << faiPath << " ...";
1308 try
1309 {
1310 save(faiIndex, toCString(faiPath));
1311 std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
1312 }
1313 catch (IOError const & ioErr)
1314 {
1315 std::cerr << "Could not write FAI index we just built.\n";
1316 return 1;
1317 }
1318 }
1319 std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
1320
1321 std::cerr << "Reference Sequences " << options.referencePath << " ...";
1322 StringSet<Dna5String> refSeqs;
1323 resize(refSeqs, length(faiIndex.seqNameStore));
1324 for (unsigned i = 0; i < length(faiIndex.seqNameStore); ++i)
1325 {
1326 reserve(refSeqs[i], sequenceLength(faiIndex, i), Exact());
1327 try
1328 {
1329 readSequence(refSeqs[i], faiIndex, i);
1330 }
1331 catch (seqan::ParseError const & ioErr)
1332 {
1333 std::cerr << "ERROR: Could not read sequence " << faiIndex.seqNameStore[i] << ".\n";
1334 return 0;
1335 }
1336 }
1337 std::cerr << " OK\n";
1338
1339 // Open gold standard intervals (GSI) file and read in header.
1340 std::cerr << "Gold Standard Intervals " << options.inGsiPath << " (header) ...";
1341 VirtualStream<char, Input> inGsi;
1342 if (!open(inGsi, toCString(options.inGsiPath)))
1343 {
1344 std::cerr << "Could not open GSI file.\n";
1345 return 1;
1346 }
1347 if (!isEqual(format(inGsi), Nothing()))
1348 std::cerr << " (is compressed)";
1349 DirectionIterator<VirtualStream<char, Input>, Input>::Type inGsiIter = directionIterator(inGsi, Input());
1350 GsiHeader gsiHeader;
1351 try
1352 {
1353 readHeader(gsiHeader, inGsiIter, Gsi());
1354 std::cerr << " OK\n";
1355 }
1356 catch (seqan::ParseError const & ioErr)
1357 {
1358 std::cerr << "Could not read GSI header(" << ioErr.what() << ").\n";
1359 return 1;
1360 }
1361
1362 // Open SAM/BAM file and read in header.
1363 BamFileIn bamFileIn;
1364 if (!open(bamFileIn, toCString(options.inBamPath)))
1365 {
1366 std::cerr << "Could not open SAM file." << std::endl;
1367 return 1;
1368 }
1369 BamHeader bamHeader;
1370 try
1371 {
1372 std::cerr << "Alignments " << options.inBamPath << " (header) ...";
1373 readHeader(bamHeader, bamFileIn);
1374 std::cerr << " OK\n";
1375 }
1376 catch (seqan::ParseError const & ioErr)
1377 {
1378 std::cerr << "Could not read SAM header (" << ioErr.what() << ").\n";
1379 return 1;
1380 }
1381
1382 // The SAM/BAM file has to be sorted by query name.
1383 //
1384 // We do not look at the SAM header for read ordering since samtools sort does not update the SAM header. This
1385 // means users would have to update it manually after sorting which is painful. We will check SAM record order on
1386 // the fly.
1387
1388 std::cerr << "\nTook " << sysTime() - startTime << "s\n";
1389
1390 // =================================================================
1391 // Stream the alignments against gold standard intervals and check.
1392 // =================================================================
1393
1394 std::cerr << "\n____COMPARING ALIGNMENTS WITH INTERVALS_______________________________________\n\n";
1395
1396 startTime = sysTime();
1397 // The result will be a list of ids to entries in witStore.
1398 int res = 0;
1399 RabemaStats result(options.maxError);
1400 if (options.distanceMetric == EDIT_DISTANCE)
1401 res = compareAlignedReadsToReference(result, bamFileIn, faiIndex, refSeqs,
1402 inGsiIter, options, MyersUkkonenReads());
1403 else // options.distanceMetric == HAMMING_DISTANCE
1404 res = compareAlignedReadsToReference(result, bamFileIn, faiIndex, refSeqs,
1405 inGsiIter, options, HammingSimple());
1406 if (res != 0)
1407 return 1;
1408
1409 std::cerr << "\nTook " << sysTime() - startTime << " s\n";
1410
1411 std::cerr << "\n____RESULTING STATISTICS______________________________________________________\n\n";
1412
1413 std::cerr << "Note that in all-best and any-best mode, we differentiate the intervals we\n"
1414 << "found and those we have to find by their distance. This is not possible in\n"
1415 << "all mode since a multiple lower-error intervals might be contained in an\n"
1416 << "higher-error interval.\n"
1417 << '\n'
1418 << "Alignments will be marked as \"invalid\" if they have a higher error rate than\n"
1419 << "allowed, i.e. they might become valid when increasing the allowed error rate.\n"
1420 << '\n'
1421 << "In \"all\" mode, only intervals from the maximal level are required. In \"all-best\"\n"
1422 << "and \"any-best\" mode, the intervals are relabeled with the smallest distance that\n"
1423 << "a containing interval has. Contained intervals are then removed.\n\n\n";
1424
1425 // int maxError = -1;
1426 // if (!options.oracleMode && options.benchmarkCategory == CATEGORY_ALL)
1427 // maxError = options.maxError;
1428 write(std::cout, result, /*options.maxError,*/ Raw());
1429
1430 if (!empty(options.outTsvPath))
1431 {
1432 std::cerr << '\n'
1433 << "Writing output TSV " << options.outTsvPath << " ...";
1434 std::ofstream tsvOut(toCString(options.outTsvPath), std::ios::out | std::ios::binary);
1435 bool failed = false;
1436 if (!tsvOut.good())
1437 {
1438 failed = true;
1439 std::cerr << " FAILED - could not open output file!\n";
1440 }
1441 else if (write(tsvOut, result, options.maxError, categoryName(options.benchmarkCategory),
1442 options.oracleMode, metricName(options.distanceMetric), Tsv()) != 0)
1443 {
1444 failed = true;
1445 std::cerr << " FAILED - error writing to output file!\n";
1446 }
1447 if (!failed)
1448 std::cerr << " OK\n";
1449 }
1450
1451 return 0;
1452 }
1453