1 // ==========================================================================
2 //                      RABEMA Read Alignment Benchmark
3 // ==========================================================================
4 // Copyright (C) 2010-1012 Manuel Holtgrewe, FU Berlin
5 //
6 // This program is free software: you can redistribute it and/or modify it
7 // under the terms of the GNU General Public License as published by the Free
8 // Software Foundation, either version 3 of the License, or (at your option)
9 // any later version.
10 //
11 // This program is distributed in the hope that it will be useful, but WITHOUT
12 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14 // more details.
15 //
16 // You should have received a copy of the GNU General Public License along
17 // with this program.  If not, see <http://www.gnu.org/licenses/>.
18 //
19 // ==========================================================================
20 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
21 // ==========================================================================
22 // RABEMA is a read mapping benchmark tool.  See README for an overview.
23 //
24 // This is the program for comparing the SAM output of a read mapper to a
25 // Gold Standard Intervals (GSI) file.
26 // ==========================================================================
27 
28 #include <seqan/arg_parse.h>
29 #include <seqan/bam_io.h>
30 #include <seqan/basic.h>
31 #include <seqan/find.h>
32 #include <seqan/misc/interval_tree.h>
33 #include <seqan/store.h>
34 #include <seqan/seq_io.h>
35 
36 #include "find_hamming_simple_ext.h"
37 #include "find_myers_ukkonen_ext.h"
38 #include "find_myers_ukkonen_reads.h"
39 #include "io_gsi.h"
40 #include "rabema_stats.h"
41 #include "ref_id_mapping.h"
42 #include "sorting.h"
43 
44 // ============================================================================
45 // Enums, Tags, Classes.
46 // ============================================================================
47 
48 // ---------------------------------------------------------------------------
49 // Enum BenchmarkCategory
50 // ---------------------------------------------------------------------------
51 
52 enum BenchmarkCategory
53 {
54     CATEGORY_ALL,
55     CATEGORY_ALL_BEST,
56     CATEGORY_ANY_BEST
57 };
58 
59 // ---------------------------------------------------------------------------
60 // Enum DistanceMetric
61 // ---------------------------------------------------------------------------
62 
63 enum DistanceMetric
64 {
65     HAMMING_DISTANCE,
66     EDIT_DISTANCE
67 };
68 
69 // ----------------------------------------------------------------------------
70 // Class RabemaEvaluationOptions
71 // ----------------------------------------------------------------------------
72 
73 class RabemaEvaluationOptions
74 {
75 public:
76     // Verbosity level, quiet: 0, normal: 1, verbose: 2, very verbose: 3.
77     int verbosity;
78 
79     // ------------------------------------------------------------------------
80     // Benchmark Related
81     // ------------------------------------------------------------------------
82 
83     // Maximum number or errors per read length in percent.
84     int maxError;
85 
86     // If true, N matches as a wildcard.  Otherwise it matches none.
87     bool matchN;
88 
89     // If enabled, the distance of alignments is considered to be 0.  This is used when comparing against GSI files that
90     // were generated by the read simulator.  In this case, the intervals will have a length of 1 and be the single
91     // original position of this read.
92     bool oracleMode;
93 
94     // Consider only reads that have a unique match in the mapping result file. Useful for precision computation.
95     bool onlyUniqueReads;
96 
97     // The benchmark category, one of {"all", "any-best", "all-best"}.
98     BenchmarkCategory benchmarkCategory;
99 
100     // Distance function to use, also see validDistanceFunction.
101     DistanceMetric distanceMetric;
102 
103     // If true then we trust the "NM" tag of the SAM alignment if present.  Otherwise, we perform a realignment at the
104     // target position.
105     bool trustNM;
106 
107     // If the CIGAR string is absent, this tag provides the position of the other end of the alignment.
108     CharString extraPosTag;
109 
110     // If enabled then all reads are treated as single-end mapped.  This is required for analyzing SOAP output that has
111     // been converted to SAM since it will set the paired-end flags.
112     bool ignorePairedFlags;
113 
114     // If enabled, don't panic on additional hits in non-weighted mode.
115     bool dontPanic;
116 
117     // ------------------------------------------------------------------------
118     // Input / Output
119     // ------------------------------------------------------------------------
120 
121     // Path to reference sequence file.
122     CharString referencePath;
123 
124     // Path to input GSI file.
125     CharString inGsiPath;
126 
127     // Exactly one of the following has to be set.
128     //
129     // Path to input SAM or BAM file.
130     CharString inBamPath;
131 
132     // Path to output TSV stats file.
133     CharString outTsvPath;
134 
135     // Whether to check sorting or not.
136     bool checkSorting;
137 
138     // ------------------------------------------------------------------------
139     // Logging configuration.
140     // ------------------------------------------------------------------------
141 
142     // Print the missed intervals to stderr for debugging purposes.
143     bool showMissedIntervals;
144 
145     // Print superfluous intervals (intervals found in BAM file but have too bad score).
146     bool showSuperflousIntervals;
147 
148     // Print additional intervals (intervals found in BAM with good score that are not in WIT file).
149     bool showAdditionalIntervals;
150 
151     // Print the hit intervals to stderr for debugging purposes.
152     bool showHitIntervals;
153 
154     // Print each end position that we try to match agains the interval.
155     bool showTryHitIntervals;
156 
RabemaEvaluationOptions()157     RabemaEvaluationOptions() :
158         verbosity(1),
159         maxError(0),
160         matchN(false),
161         oracleMode(false),
162         onlyUniqueReads(false),
163         benchmarkCategory(CATEGORY_ALL),
164         distanceMetric(EDIT_DISTANCE),
165         trustNM(false),
166         ignorePairedFlags(false),
167         dontPanic(false),
168         checkSorting(true),
169         // outPath("-"),
170         showMissedIntervals(false),
171         showSuperflousIntervals(false),
172         showAdditionalIntervals(false),
173         showHitIntervals(false),
174         showTryHitIntervals(false)
175     {}
176 };
177 
178 // ----------------------------------------------------------------------------
179 // Helper Class CmpGsiRecordLowering.
180 // ----------------------------------------------------------------------------
181 
182 // Comparison functor for lexicographically sorting by (readId, contigId, first pos).
183 
184 struct CmpGsiRecordLowering
185 {
operator ()CmpGsiRecordLowering186     bool operator()(GsiRecord const & lhs, GsiRecord const & rhs) const
187     {
188         return (lhs.readId < rhs.readId) || (lhs.readId == rhs.readId && lhs.contigId < rhs.contigId) ||
189                (lhs.readId == rhs.readId && lhs.contigId == rhs.contigId && lhs.firstPos < rhs.firstPos) ||
190                (lhs.readId == rhs.readId && lhs.contigId == rhs.contigId && lhs.firstPos == rhs.firstPos &&
191                 lhs.lastPos > rhs.lastPos) ||
192                (lhs.readId == rhs.readId && lhs.contigId == rhs.contigId && lhs.firstPos == rhs.firstPos &&
193                 lhs.lastPos == rhs.lastPos && lhs.distance > rhs.distance);
194     }
195 
196 };
197 
198 // ============================================================================
199 // Metafunctions
200 // ============================================================================
201 
202 // ============================================================================
203 // Functions
204 // ============================================================================
205 
206 // ---------------------------------------------------------------------------
207 // Function categoryName()
208 // ---------------------------------------------------------------------------
209 
categoryName(BenchmarkCategory cat)210 char const * categoryName(BenchmarkCategory cat)
211 {
212     if (cat == CATEGORY_ALL)
213         return "all";
214     else if (cat == CATEGORY_ALL_BEST)
215         return "all-best";
216     else
217         return "any-best";
218 }
219 
220 // ---------------------------------------------------------------------------
221 // Function metricName()
222 // ---------------------------------------------------------------------------
223 
metricName(DistanceMetric met)224 char const * metricName(DistanceMetric met)
225 {
226     if (met == EDIT_DISTANCE)
227         return "edit";
228     else  // if (met == HAMMING_DISTANCE)
229         return "hamming";
230 }
231 
232 // ---------------------------------------------------------------------------
233 // Function yesNo()
234 // ---------------------------------------------------------------------------
235 
yesNo(bool b)236 char const * yesNo(bool b)
237 {
238     if (b)
239         return "yes";
240     else
241         return "no";
242 }
243 
244 // ----------------------------------------------------------------------------
245 // Function performIntervalLowering()
246 // ----------------------------------------------------------------------------
247 
248 // Relabel intervals with the smallest distance of a contained interval.  Filter out intervals with distance > maxError.
249 
performIntervalLowering(String<GsiRecord> & gsiRecords,int maxError)250 void performIntervalLowering(String<GsiRecord> & gsiRecords, int maxError)
251 {
252     if (empty(gsiRecords))
253         return;
254 
255     typedef Iterator<String<GsiRecord> >::Type TIterator;
256     typedef IntervalAndCargo<unsigned, unsigned> TInterval;
257 
258     // Step 1: Adjust distances.
259     std::sort(begin(gsiRecords, Standard()), end(gsiRecords, Standard()), CmpGsiRecordLowering());
260 
261     // Add sentinel interval.
262     GsiRecord sentinel(back(gsiRecords));
263     sentinel.firstPos = std::numeric_limits<size_t>::max();
264     sentinel.lastPos = std::numeric_limits<size_t>::max();
265     // sentinel.id = std::numeric_limits<size_t>::max();
266     appendValue(gsiRecords, sentinel);
267 
268     String<TInterval> openIntervals;
269     unsigned i = 0;
270     for (TIterator it = begin(gsiRecords, Standard()), itEnd = end(gsiRecords, Standard()); it != itEnd; ++it, ++i)
271     {
272         unsigned count = 0;
273         for (unsigned j = 0; j < length(openIntervals); ++j)
274         {
275             unsigned idx = length(openIntervals) - 1 - j;
276             GsiRecord const thisIntervalRecord = gsiRecords[cargo(openIntervals[idx])];
277             SEQAN_ASSERT_EQ(thisIntervalRecord.readName, it->readName);
278             if (thisIntervalRecord.contigId != it->contigId || thisIntervalRecord.lastPos < it->firstPos)
279                 count += 1;
280         }
281         resize(openIntervals, length(openIntervals) - count);
282 
283         // Perform distance lowering for containing intervals.
284         // std::cerr << "OPEN INTERVALS\n";
285         // for (unsigned j = 0; j < length(openIntervals); ++j)
286         //     std::cerr << "  " << gsiRecords[cargo(openIntervals[j])] << "\t" << gsiRecords[cargo(openIntervals[j])].originalDistance << "\n";
287         // std::cerr << "\n";
288         for (unsigned j = 0; j < length(openIntervals); ++j)
289         {
290             unsigned idx = j; //length(openIntervals) - 1 - j;
291             unsigned id = cargo(openIntervals[idx]);
292             if (gsiRecords[id].distance <= maxError)
293                 gsiRecords[id].distance = _min(gsiRecords[id].distance, it->distance);
294             else
295                 break;  // All containing intervals must have a greater distance.
296         }
297 
298         appendValue(openIntervals, TInterval(it->firstPos, it->lastPos + 1, i));
299     }
300 
301     // Step 2: Filter out intervals that are contained in intervals of lesser/equal distance.
302     String<GsiRecord> filteredGsiRecords;
303     clear(openIntervals);
304     i = 0;
305     for (TIterator it = begin(gsiRecords, Standard()), itend = end(gsiRecords, Standard()); it != itend; ++it, ++i)
306     {
307         // Remove non-overlapping intervals on top of openInterval stack, appending to filtered intervals
308         unsigned count = 0;
309         for (unsigned j = 0; j < length(openIntervals); ++j)
310         {
311             unsigned idx = length(openIntervals) - 1 - j;
312             GsiRecord const & thisIntervalRecord = gsiRecords[cargo(openIntervals[idx])];
313             SEQAN_ASSERT_EQ(thisIntervalRecord.readName, it->readName);
314             if (thisIntervalRecord.contigId != it->contigId || thisIntervalRecord.lastPos < it->firstPos)
315             {
316                 count += 1;
317                 unsigned startDistance = gsiRecords[cargo(openIntervals[idx])].distance;
318                 if (!empty(filteredGsiRecords))
319                 {
320                     if (back(filteredGsiRecords).lastPos >= leftBoundary(openIntervals[idx]))
321                     {
322                         if (back(filteredGsiRecords).contigId == gsiRecords[cargo(openIntervals[idx])].contigId)
323                         {
324                             // Assert current containing already written out.
325                             SEQAN_ASSERT_GEQ(back(filteredGsiRecords).firstPos, leftBoundary(openIntervals[idx]));
326                             SEQAN_ASSERT_LEQ(back(filteredGsiRecords).lastPos + 1, rightBoundary(openIntervals[idx]));
327                             // Get start distance.
328                             startDistance = back(filteredGsiRecords).distance + 1;
329                         }
330                     }
331                 }
332                 unsigned upperLimit = maxError;
333                 if ((unsigned)maxError < startDistance)
334                     upperLimit = startDistance;
335                 for (unsigned i = startDistance; i <= upperLimit; ++i)
336                 {
337                     appendValue(filteredGsiRecords, gsiRecords[cargo(openIntervals[idx])]);
338                     back(filteredGsiRecords).originalDistance = i;
339                 }
340             }
341         }
342         resize(openIntervals, length(openIntervals) - count);
343 
344         // Add interval to the stack of intervals.
345         if (empty(openIntervals) || gsiRecords[cargo(back(openIntervals))].distance > it->distance)
346             appendValue(openIntervals, TInterval(it->firstPos, it->lastPos + 1, i));
347     }
348     move(gsiRecords, filteredGsiRecords);
349 }
350 
351 // ----------------------------------------------------------------------------
352 // Function benchmarkReadResult()
353 // ----------------------------------------------------------------------------
354 
355 template <typename TPatternSpec>
benchmarkReadResult(RabemaStats & result,String<BamAlignmentRecord> const & samRecords,BamFileIn const & bamFileIn,String<GsiRecord> const & gsiRecords,FaiIndex const & faiIndex,StringSet<Dna5String> const & refSeqs,RefIdMapping const & refIdMapping,RabemaEvaluationOptions const & options,TPatternSpec const &,bool pairedEnd=false,bool second=false)356 int benchmarkReadResult(RabemaStats & result,
357                         String<BamAlignmentRecord> const & samRecords,
358                         BamFileIn const & bamFileIn,
359                         String<GsiRecord> const & gsiRecords,
360                         FaiIndex const & faiIndex,
361                         StringSet<Dna5String> const & refSeqs,
362                         RefIdMapping const & refIdMapping,
363                         RabemaEvaluationOptions const & options,
364                         TPatternSpec const & /*tagPattern*/,
365                         bool pairedEnd = false,
366                         bool second = false)
367 {
368     typedef IntervalAndCargo<unsigned, unsigned> TInterval;
369 #define DEBUG_RABEMA 0
370 #if DEBUG_RABEMA
371     std::cerr << ",--\n"
372               << "| num SAM\t" << length(samRecords) << "\n"
373               << "| num GSI\t" << length(gsiRecords) << "\n"
374               << "`--\n";
375     if (!empty(samRecords))
376         std::cerr << "SAM ID\t" << samRecords[0].qName << "\n";
377     if (!empty(gsiRecords))
378         std::cerr << "GSI ID\t" << gsiRecords[0].readName << "\n";
379 #endif  // #if DEBUG_RABEMA
380 
381     if (options.oracleMode && empty(gsiRecords))
382     {
383         // There are no GSI intervals in oracle mode.  This can be the case if we constructed the gold standard
384         // with a maximal error rate.  If this is the case then we ignore these reads.
385         return 0;
386     }
387 
388     if (options.onlyUniqueReads && length(samRecords) != 1)
389     {
390         // The read was mapped non-uniquely by the mapper. Either it was not found or found at multiple locations
391         // However, to compute the precision of mapper we only evaluate whether reads mapped to a single location
392         // were correctly mapped.
393         return 0;
394     }
395 
396     // Select gold standard intervals (GSI) records.
397     //
398     // We only select intervals that match the specification of pairedEnd, second.  Also, the intervals must be for an
399     // error rate less than or equal to the maximal configured one.  In case of any-best or all-best mode, we only
400     // select those of the eligible intervals with the lowest error rate.
401     //
402     // In case of oracle mode, we ignore the distance of the intervals in the GSI file here but use it later on.
403     //
404     // Start with picking the smallest distance if *-best mode.
405     int smallestDistance = options.oracleMode ? std::numeric_limits<int>::max() : options.maxError;
406     // Note that smallestDistance (as bestDistance defined below) is expressed as percent of read length ceiled
407     // and cat to an int value.
408     if (options.oracleMode || options.benchmarkCategory == CATEGORY_ANY_BEST ||
409         options.benchmarkCategory == CATEGORY_ALL_BEST)
410         for (unsigned i = 0; i < length(gsiRecords); ++i)
411             smallestDistance = std::min(smallestDistance, gsiRecords[i].distance);
412     int largestDistance = options.maxError;
413     if (options.oracleMode && smallestDistance != std::numeric_limits<int>::max())
414         for (unsigned i = 0; i < length(gsiRecords); ++i)
415             largestDistance = std::max(largestDistance, gsiRecords[i].distance);
416     String<GsiRecord> pickedGsiRecords;
417 #if DEBUG_RABEMA
418     std::cerr << "--\n";
419 #endif  // #if DEBUG_RABEMA
420     for (unsigned i = 0; i < length(gsiRecords); ++i)
421     {
422 #if DEBUG_RABEMA
423         std::cerr << "ORIGINAL\t" << gsiRecords[i] << "\t" << gsiRecords[i].originalDistance << "\n";
424 #endif  // DEBUG_RABEMA
425 
426         // Note: In case of oracle mode, we ignore the distance.
427         // TODO(holtgrew): Remove the following two lines.
428         //if (!options.oracleMode && gsiRecords[i].distance > smallestDistance)
429         //    continue;  // Skip with wrong distance.
430         if (!options.oracleMode && gsiRecords[i].distance > options.maxError)
431             continue;  // Ignore intervals with too high error rate.
432         if (!pairedEnd && (gsiRecords[i].flags & GsiRecord::FLAG_PAIRED))
433             continue;  // Skip paired if non-paired selected.
434         if (pairedEnd && !(gsiRecords[i].flags & GsiRecord::FLAG_PAIRED))
435             continue;  // Skip non-paired if paired selected.
436         if (pairedEnd && second && !(gsiRecords[i].flags & GsiRecord::FLAG_SECOND_MATE))
437             continue;  // Skip if second selected but this interval is not for second.
438 
439         appendValue(pickedGsiRecords, gsiRecords[i]);
440 #if DEBUG_RABEMA
441         std::cerr << "PICKED\t" << gsiRecords[i] << "\t" << gsiRecords[i].originalDistance << "\n";
442 #endif  // DEBUG_RABEMA
443 
444         // Get index of the sequence from GSI record contig name.
445         if (!getIdByName(back(pickedGsiRecords).contigId, faiIndex, back(pickedGsiRecords).contigName))
446         {
447             std::cerr << "ERROR: Could not find reference sequence for name "
448                       << back(pickedGsiRecords).contigName << '\n';
449             return 1;
450         }
451     }
452 #if DEBUG_RABEMA
453     std::cerr << "--\n";
454 #endif  // DEBUG_RABEMA
455 
456     // On these selected GSI records, we now perform interval lowering if not in oracle mode..  This means that an
457     // interval I with distance k_i containing an interval J with distance k_j < k_i is re-labeled with a distance of
458     // k_j for the smallest k_j of all contained intervals J.  The original distance is stored in the originalDistance
459     // member.
460     if (!options.oracleMode)
461         performIntervalLowering(pickedGsiRecords, options.maxError);
462 
463 #if DEBUG_RABEMA
464     for (unsigned i = 0; i < length(pickedGsiRecords); ++i)
465         std::cerr << "LOWERED\t" << pickedGsiRecords[i] << "\t" << pickedGsiRecords[i].originalDistance << "\n";
466     std::cerr << "--\n";
467 #endif  // DEBUG_RABEMA
468 
469     // Next we filter the lowered intervals to those with an original distance of options.maxError.  In the case of
470     // all-best and any-best we also only keep those with the smallest distance of all intervals for this read.
471     String<GsiRecord> filteredGsiRecords;
472     for (unsigned i = 0; i < length(pickedGsiRecords); ++i)
473     {
474         GsiRecord const & record = pickedGsiRecords[i];
475         bool keep = (record.originalDistance == options.maxError);
476         if (!options.oracleMode && options.benchmarkCategory != CATEGORY_ALL)
477             keep = keep && (record.distance == smallestDistance);
478         if (options.oracleMode && record.originalDistance <= largestDistance)
479             keep = true;
480         if (keep)
481             appendValue(filteredGsiRecords, record);
482     }
483 
484     // Build string of intervals for each reference sequence from these lowered and filtered records.
485     String<String<TInterval> > intervals;
486     resize(intervals, length(refSeqs));
487     String<unsigned> numIntervalsForErrorRate;
488     resize(numIntervalsForErrorRate, options.maxError + 1, 0);
489     String<int> intervalDistances;  // Distance of interval i.
490     unsigned numIntervals = 0;
491     for (unsigned i = 0; i < length(filteredGsiRecords); ++i)
492     {
493 #if DEBUG_RABEMA
494         std::cerr << "USED\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
495 #endif  // #if DEBUG_RABEMA
496         int distance = filteredGsiRecords[i].distance;
497 
498         appendValue(intervals[filteredGsiRecords[i].contigId],
499                     TInterval(filteredGsiRecords[i].firstPos, filteredGsiRecords[i].lastPos + 1,
500                               length(intervalDistances)));
501         appendValue(intervalDistances, distance);
502         numIntervals += 1;
503         if (distance >= (int)length(numIntervalsForErrorRate))
504             resize(numIntervalsForErrorRate, distance + 1, 0);
505         if (!options.oracleMode && options.benchmarkCategory != CATEGORY_ANY_BEST)
506             numIntervalsForErrorRate[distance] += 1;
507     }
508     if (options.benchmarkCategory == CATEGORY_ANY_BEST && !empty(filteredGsiRecords))
509         numIntervalsForErrorRate[smallestDistance] += 1;
510     // Marker array that states whether an interval was hit.
511     String<bool> intervalHit;
512     resize(intervalHit, length(intervalDistances), false);
513 
514     // Build interval trees.
515     String<IntervalTree<unsigned> > intervalTrees;
516     resize(intervalTrees, length(refSeqs));
517     for (unsigned i = 0; i < length(intervalTrees); ++i)
518         createIntervalTree(intervalTrees[i], intervals[i]);
519 
520     // One of the SAM records must have a non-empty SEQ field, extract the read seq.
521     Dna5String readSeqL, readSeqR;
522     bool seenL = false, seenR = false;
523     for (unsigned i = 0; i < length(samRecords); ++i)
524     {
525         seenL |= hasFlagFirst(samRecords[i]) || (!hasFlagFirst(samRecords[i]) && !hasFlagLast(samRecords[i]));
526         seenR |= hasFlagLast(samRecords[i]);
527         if ((hasFlagFirst(samRecords[i]) || (!hasFlagFirst(samRecords[i]) && !hasFlagLast(samRecords[i]))) &&
528             !empty(samRecords[i].seq))
529         {
530             readSeqL = samRecords[i].seq;
531             if (hasFlagRC(samRecords[i]))
532                 reverseComplement(readSeqL);
533         }
534         else if (hasFlagLast(samRecords[i]) && !empty(samRecords[i].seq))
535         {
536             readSeqR = samRecords[i].seq;
537             if (hasFlagRC(samRecords[i]))
538                 reverseComplement(readSeqR);
539         }
540         if (!empty(readSeqL) && !empty(readSeqR))
541             break;  // Short-circuit and break.
542     }
543     if (seenL && empty(readSeqL))
544     {
545         std::cerr << "ERROR: No alignment for query " << front(samRecords).qName << " (left-end)\n";
546         return 1;
547     }
548     if (seenR && empty(readSeqR))
549     {
550         std::cerr << "ERROR: No alignment for query " << front(samRecords).qName << " (right-end)\n";
551         return 1;
552     }
553 
554     Dna5String readSeq;
555     Dna5String contigSeq;
556     String<unsigned> queryResult;
557 
558     // Try to hit intervals.
559     bool mappedAny = false;  // Whether any valid alignment was found.
560     for (unsigned i = 0; i < length(samRecords); ++i)
561     {
562         BamAlignmentRecord const & samRecord = samRecords[i];
563         int seqId = refIdMapping.map[samRecord.rID];
564 
565         // Compute actual alignment score to rule out invalidly reported alignments.
566         //
567         // If we run in oracle mode then we ignore the actual alignment score and use the best alignment.  We only care
568         // about the alignment's end position in this case.
569         if (hasFlagLast(samRecord))
570             readSeq = readSeqR;
571         else
572             readSeq = readSeqL;
573 
574         // TODO(holtgrew): Remove const cast once we have const holders!
575         BamTagsDict bamTags(const_cast<CharString &>(samRecord.tags));
576         unsigned idx = 0;
577 
578         // If the CIGAR is not present, e.g. a * is present instead,
579         // we can still get the extra position from an user-defined tag.
580         unsigned endPos = samRecord.beginPos + getAlignmentLengthInRef(samRecord) - countPaddings(samRecord.cigar);
581         if (empty(samRecord.cigar))
582         {
583             if (empty(options.extraPosTag) ||
584                 !(findTagKey(idx, bamTags, options.extraPosTag) && extractTagValue(endPos, bamTags, idx)))
585             {
586                 // Simply try to guess the end position.
587                 endPos = samRecord.beginPos + length(readSeq) + 1;
588                 std::cerr << "WARNING: Unknown alignment end position for read " << samRecord.qName << ".\n";
589             }
590             // The extra position tag must be 1-based.
591             SEQAN_ASSERT_GT(endPos, 0u);
592             endPos--;
593         }
594 
595         int bestDistance = std::numeric_limits<int>::min();  // Marker for "not set yet".
596         // Note that bestDistance expresses the distance in percent error, relative to the read length, ceiled up
597         // and converted to an int value.
598         if (!options.oracleMode)
599         {
600             // Get best distance from NM tag if set and we are to trust it.
601             if (options.trustNM && findTagKey(idx, bamTags, "NM") && extractTagValue(bestDistance, bamTags, idx))
602             {
603                 // Convert from count to rate.
604                 bestDistance = static_cast<int>(ceil(100.0 * bestDistance / length(readSeq)));
605             }
606             // Otherwise, perform a realignment.
607             else
608             {
609                 unsigned bandwidth = static_cast<int>(ceil(0.01 * options.maxError * length(readSeq)));
610 
611                 unsigned intervalBegin = samRecord.beginPos;
612                 if (intervalBegin > bandwidth)
613                     intervalBegin -= bandwidth;
614 
615                 unsigned intervalEnd = endPos;
616                 if (intervalEnd < length(refSeqs[seqId]) - 2 * bandwidth)
617                     intervalEnd += 2 * bandwidth;
618 
619                 contigSeq = infix(refSeqs[seqId], intervalBegin, intervalEnd);
620                 if (hasFlagRC(samRecord))
621                     reverseComplement(contigSeq);
622                 Finder<Dna5String> finder(contigSeq);
623                 Pattern<Dna5String, TPatternSpec> pattern(readSeq, -static_cast<int>(length(readSeq)) * 1000);
624                 _patternMatchNOfPattern(pattern, options.matchN);
625                 _patternMatchNOfFinder(pattern, options.matchN);
626                 bool ret = setEndPosition(finder, pattern, length(contigSeq) - bandwidth);
627                 ignoreUnusedVariableWarning(ret);
628 //                write2(std::cerr, samRecord, bamIOContext, Sam());
629                 SEQAN_CHECK(ret, "setEndPosition() must not fail!");
630                 bestDistance = static_cast<int>(ceil(-100.0 * getScore(pattern) / length(readSeq)));
631             }
632         }
633 
634         // Get sequence id and last position of alignment.  We try to hit the interval with the last position (not
635         // C-style end) of the read.
636         unsigned lastPos = hasFlagRC(samRecord) ? length(refSeqs[seqId]) - samRecord.beginPos - 1 : endPos - 1;
637 
638         if (options.showTryHitIntervals)
639             std::cerr << "TRY HIT\tchr=" << sequenceName(faiIndex, seqId) << "\tlastPos=" << lastPos << "\tqName="
640                       << samRecord.qName << "\n";
641 
642         // Try to hit any interval.
643         clear(queryResult);
644         findIntervals(queryResult, intervalTrees[seqId], lastPos);
645         mappedAny = mappedAny || !empty(queryResult);
646 #if DEBUG_RABEMA
647         if (mappedAny)
648             std::cout << "MAPPED\t" << front(samRecords).qName << "\n";
649 #endif  // #if DEBUG_RABEMA
650         if (!empty(queryResult))
651         {
652             for (unsigned i = 0; i < length(queryResult); ++i)
653                 intervalHit[queryResult[i]] = true;
654         }
655         else if (bestDistance != std::numeric_limits<int>::min())
656         {
657             // && bestDistance <= options.maxError)
658 
659             // We now have a hit with a distance below the maximal configured error.  This is a candidate for an
660             // additional hit.  In case of all-best and any-best, this is only one if its distance is better than the
661             // best distance for this read.  In the case of all, it is one if its distance is less than the allowed
662             // maximal distance.  If it is not an additional hit then it is an invalid one.
663             //
664             // Note that all distances including allowedDistance are percent of read length, ceiled up.
665             int allowedDistance = options.maxError;
666             if ((options.benchmarkCategory == CATEGORY_ALL_BEST || options.benchmarkCategory == CATEGORY_ANY_BEST) &&
667                 (smallestDistance != std::numeric_limits<int>::max()))
668                 allowedDistance = smallestDistance;
669             if (bestDistance > allowedDistance)
670             {
671                 if (options.showSuperflousIntervals)
672                 {
673                     std::cerr << "SUPERFLOUS/INVALID\t";
674                     DirectionIterator<std::ostream, Output>::Type cerrIt = directionIterator(std::cerr, Output());
675                     write(cerrIt, samRecord, context(bamFileIn), Sam());
676                     std::cerr << "  DISTANCE:        \t" << bestDistance << '\n'
677                               << "  ALLOWED DISTANCE:\t" << options.maxError << '\n';
678                 }
679                 result.invalidAlignments += 1;
680                 continue;
681             }
682 
683             // We found an additional hit.
684             if (options.showAdditionalIntervals || !options.dontPanic)
685             {
686                 std::cerr << "ADDITIONAL HIT\t";
687                 DirectionIterator<std::ostream, Output>::Type cerrIt = directionIterator(std::cerr, Output());
688                 write(cerrIt, samRecord, context(bamFileIn), Sam());
689                 std::cerr << '\n';
690 
691                 for (unsigned i = 0; i < length(filteredGsiRecords); ++i)
692                     std::cerr << "FILTERED GSI RECORD\t" << filteredGsiRecords[i] << "\n";
693             }
694 
695             if (!options.dontPanic)
696             {
697                 std::cerr << "ERROR: Found an additional hit for read " << samRecord.qName << "!\n";
698                 return 1;
699             }
700             std::cerr << "WARNING: Found an additional hit for read " << samRecord.qName << ".\n";
701         }
702     }
703 
704     // Compute number of found intervals.
705     unsigned numFound = 0;
706     String<unsigned> foundIntervalsForErrorRate;
707     if ((int)length(foundIntervalsForErrorRate) <= largestDistance + 1)
708         resize(foundIntervalsForErrorRate, largestDistance + 1, 0);
709     if (options.oracleMode || options.benchmarkCategory == CATEGORY_ANY_BEST)
710     {
711         int bestDistance = std::numeric_limits<int>::max();
712         int bestIdx = 0;
713         for (unsigned i = 0; i < length(intervalDistances); ++i)
714             if (intervalHit[i])
715             {
716                 if (options.showHitIntervals)
717                     std::cerr << "HIT\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
718                 if (bestDistance > intervalDistances[i])
719                     bestIdx = i;
720                 bestDistance = std::min(bestDistance, intervalDistances[i]);
721             }
722         if (bestDistance != std::numeric_limits<int>::max())
723         {
724             if (options.showHitIntervals)
725                 std::cerr << "HIT_BEST\t" << filteredGsiRecords[bestIdx] << "\t" << filteredGsiRecords[bestIdx].originalDistance << "\n";
726             numFound += 1;
727             foundIntervalsForErrorRate[bestDistance] += 1;
728         }
729         if (!mappedAny && options.showMissedIntervals)
730         {
731             for (unsigned i = 0; i < length(filteredGsiRecords); ++i)
732                 std::cerr << "MISSED\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
733         }
734     }
735     else  // !options.oracleMode && options.benchmarkCategory in ["all-best", "all"]
736     {
737         for (unsigned i = 0; i < length(intervalDistances); ++i)
738         {
739             // if (options.benchmarkCategory == CATEGORY_ALL && intervalDistances[i] != options.maxError)
740             //     continue;  // Only count intervals on our maximal error rate in "all" mode.
741             if (intervalHit[i])
742             {
743                 if (options.showHitIntervals)
744                     std::cerr << "HIT\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
745                 numFound += 1;
746                 foundIntervalsForErrorRate[intervalDistances[i]] += 1;
747             }
748             else
749             {
750                 if (options.showMissedIntervals)  // inside braces for consistency with above
751                     std::cerr << "MISSED\t" << filteredGsiRecords[i] << "\n";
752             }
753         }
754         SEQAN_ASSERT_LEQ(numFound, length(intervalDistances));
755     }
756 
757     // Update the resulting RabemaStats.
758     updateMaximalErrorRate(result, largestDistance);
759     result.totalReads += 1;
760     result.readsInGsi += (numIntervals > 0u);
761     result.mappedReads += mappedAny;
762     if (options.oracleMode)
763     {
764         bool found = (numFound > 0u);
765         result.intervalsToFind += 1;
766         result.intervalsFound += found;
767         result.normalizedIntervals += found;
768         int d = (smallestDistance == std::numeric_limits<int>::max()) ? 0 : smallestDistance;
769         result.intervalsToFindForErrorRate[d] += 1;
770         result.intervalsFoundForErrorRate[d] += found;
771         result.normalizedIntervalsToFindForErrorRate[d] += 1;
772         result.normalizedIntervalsFoundForErrorRate[d] += found;
773     }
774     else if (options.benchmarkCategory == CATEGORY_ANY_BEST)
775     {
776         int d = (smallestDistance == std::numeric_limits<int>::max()) ? 0 : smallestDistance;
777         bool toFind = (numIntervalsForErrorRate[d] > 0u);
778         bool found = (foundIntervalsForErrorRate[d] > 0u);
779         SEQAN_ASSERT_LEQ(found, toFind);
780         result.intervalsToFind += toFind;
781         result.intervalsFound += found;
782         result.normalizedIntervals += found;
783         result.intervalsToFindForErrorRate[d] += toFind;
784         result.intervalsFoundForErrorRate[d] += found;
785         result.normalizedIntervalsToFindForErrorRate[d] += toFind;
786         result.normalizedIntervalsFoundForErrorRate[d] += found;
787     }
788     else  // all-best or all was selected
789     {
790         unsigned intervalsToFind = 0;
791         unsigned intervalsFound = 0;
792         for (unsigned d = 0; d < length(numIntervalsForErrorRate); ++d)
793         {
794             intervalsToFind += numIntervalsForErrorRate[d];
795             intervalsFound += foundIntervalsForErrorRate[d];
796             result.intervalsToFindForErrorRate[d] += numIntervalsForErrorRate[d];
797             result.intervalsFoundForErrorRate[d] += foundIntervalsForErrorRate[d];
798         }
799         result.intervalsToFind += intervalsToFind;
800         result.intervalsFound += intervalsFound;
801         if (intervalsToFind > 0u)
802             result.normalizedIntervals += 1.0 * intervalsFound / intervalsToFind;
803         for (unsigned d = 0; d < length(numIntervalsForErrorRate); ++d)
804         {
805             if (intervalsToFind == 0u)
806                 continue;
807             // In case of all-best we only count those with the best error rate for this read.
808             if (options.benchmarkCategory == CATEGORY_ALL)
809             {
810                 result.normalizedIntervalsToFindForErrorRate[d] += 1.0 * numIntervalsForErrorRate[d] / intervalsToFind;
811                 result.normalizedIntervalsFoundForErrorRate[d] += 1.0 * foundIntervalsForErrorRate[d] / intervalsToFind;
812             }
813             else if (options.benchmarkCategory == CATEGORY_ALL_BEST && (int)d == smallestDistance)
814             {
815                 result.normalizedIntervalsToFindForErrorRate[d] += 1;
816                 result.normalizedIntervalsFoundForErrorRate[d] += 1.0 * foundIntervalsForErrorRate[d] / intervalsToFind;
817             }
818         }
819     }
820 
821     return 0;
822 }
823 
824 // ----------------------------------------------------------------------------
825 // Function clearPairedFlags()
826 // ----------------------------------------------------------------------------
827 
clearPairedFlags(seqan::BamAlignmentRecord & record)828 void clearPairedFlags(seqan::BamAlignmentRecord & record)
829 {
830     if (hasFlagMultiple(record))
831         record.flag = record.flag ^ seqan::BAM_FLAG_MULTIPLE;
832     if (hasFlagFirst(record))
833         record.flag = record.flag ^ seqan::BAM_FLAG_FIRST;
834     if (hasFlagLast(record))
835         record.flag = record.flag ^ seqan::BAM_FLAG_LAST;
836     if (hasFlagNextRC(record))
837         record.flag = record.flag ^ seqan::BAM_FLAG_NEXT_RC;
838     if (hasFlagNextUnmapped(record))
839         record.flag = record.flag ^ seqan::BAM_FLAG_NEXT_UNMAPPED;
840 }
841 
842 // ----------------------------------------------------------------------------
843 // Function compareAlignedReadsToReference()
844 // ----------------------------------------------------------------------------
845 
846 // Stream over both the SAM/BAM and GSI file and compare the hits in the SAM/BAM file against the intervals in the GSI
847 // file.
848 //
849 // Both the SAM/BAM file and the GSI file have to be sorted by queryname for this to work.
850 
851 template <typename TForwardIter, typename TPatternSpec>
852 int
compareAlignedReadsToReference(RabemaStats & result,BamFileIn & bamFileIn,FaiIndex const & faiIndex,StringSet<Dna5String> const & refSeqs,TForwardIter & gsiIter,RabemaEvaluationOptions const & options,TPatternSpec const & tagPattern)853 compareAlignedReadsToReference(RabemaStats & result,
854                                BamFileIn & bamFileIn,
855                                FaiIndex const & faiIndex,
856                                StringSet<Dna5String> const & refSeqs,
857                                TForwardIter & gsiIter,
858                                RabemaEvaluationOptions const & options,
859                                TPatternSpec const & tagPattern)
860 {
861     // Mapping between ref IDs from SAM/BAM file and reference sequence (from SAM/BAM file to reference sequences).
862     RefIdMapping refIdMapping;
863     rebuildMapping(refIdMapping, faiIndex.seqNameStore, faiIndex.seqNameStoreCache,
864                    contigNames(context(bamFileIn)));
865 
866     // Read in initial SAM/GSI records.
867     BamAlignmentRecord samRecord;
868     if (!atEnd(bamFileIn))
869         try
870         {
871             readRecord(samRecord, bamFileIn);
872         }
873         catch (seqan::ParseError const & ioErr)
874         {
875             std::cerr << "ERROR: Could not read first SAM/BAM record.\n";
876             return 1;
877         }
878     if (options.ignorePairedFlags)
879         clearPairedFlags(samRecord);
880     GsiRecord gsiRecord;
881     if (!atEnd(gsiIter))
882         try
883         {
884             readRecord(gsiRecord, gsiIter, Gsi());
885         }
886         catch (seqan::ParseError const & ioErr)
887         {
888             std::cerr << "ERROR: Could not read first GSI record.\n";
889             return 1;
890         }
891 
892     // Current SAM/BAM and GSI records are stored in these arrays.
893     String<BamAlignmentRecord> currentSamRecords;
894     String<GsiRecord> currentGsiRecords;
895 
896     // These flags store whether we processed the last SAM/BAM and GSI record.
897     bool samDone = false, gsiDone = false;
898 
899     // The main loop: We walk over both the SAM/BAM and GSI records.
900     // unsigned chunkI = 0;
901     std::cerr << "Each dot corresponds to 10k processed reads.\n"
902               << "\n"
903               << "Progress: ";
904     unsigned i = 0;
905     while (!samDone || !gsiDone)
906     {
907         if (i > 0u && i % (100 * 1000) == 0u)
908             std::cerr << i / 100 / 1000 << "00k";
909         else if (i > 0 && i % (10 * 1000) == 0u)
910             std::cerr << '.';
911         ++i;
912 
913         // We process the record for the next query/read.  Since records for this next query/read might be missing in
914         // both files, we need to determine which is the next one.
915         CharString currentReadName;
916         if (gsiDone)
917             currentReadName = samRecord.qName;
918         else if (samDone)
919             currentReadName = gsiRecord.readName;
920         else
921             currentReadName = lessThanSamtoolsQueryName(gsiRecord.readName, samRecord.qName) ?
922                               gsiRecord.readName : samRecord.qName;
923 
924         // These flags determine whether evaluation is run for single-end and/or paired-end reads.
925         bool seenSingleEnd = false, seenPairedEnd = false;
926 
927         // Read all SAM/BAM records with the same query name.
928         clear(currentSamRecords);
929         while (!samDone && samRecord.qName == currentReadName)
930         {
931             if (!hasFlagUnmapped(samRecord))  // Ignore records with non-aligned reads.
932             {
933                 seenSingleEnd |= !hasFlagMultiple(samRecord);
934                 seenPairedEnd |= hasFlagMultiple(samRecord);
935                 appendValue(currentSamRecords, samRecord);
936             }
937             if (atEnd(bamFileIn))
938             {
939                 // At end of SAM/BAM File, do not read next one.
940                 samDone = true;
941                 continue;
942             }
943             try
944             {
945                 readRecord(samRecord, bamFileIn);
946             }
947             catch (seqan::ParseError const & ioErr)
948             {
949                 std::cerr << "ERROR: Could not read SAM/BAM record.\n";
950                 return 1;
951             }
952             if (options.ignorePairedFlags)
953                 clearPairedFlags(samRecord);
954             if (options.checkSorting && lessThanSamtoolsQueryName(samRecord.qName, currentReadName))
955             {
956                 std::cerr << "ERROR: Wrong order in SAM/BAM file: " << samRecord.qName << " succeeds "
957                           << currentReadName << " in file.\n"
958                           << "File must be sorted by read name/queryname.\n";
959                 return 1;
960             }
961             // Rebuild ref ID mapping if we discovered a new reference sequence.
962             if (length(contigNames(context(bamFileIn))) != length(refIdMapping))
963                 rebuildMapping(refIdMapping, faiIndex.seqNameStore, faiIndex.seqNameStoreCache,
964                                contigNames(context(bamFileIn)));
965         }
966 
967         // Read in the next block of GSI records.
968         clear(currentGsiRecords);
969         while (!gsiDone && gsiRecord.readName == currentReadName)
970         {
971             seenSingleEnd |= !(gsiRecord.flags & GsiRecord::FLAG_PAIRED);
972             seenPairedEnd |= (gsiRecord.flags & GsiRecord::FLAG_PAIRED);
973             appendValue(currentGsiRecords, gsiRecord);
974             if (atEnd(gsiIter))
975             {
976                 // At end of GSI File, do not read next one.
977                 gsiDone = true;
978                 continue;
979             }
980             try
981             {
982                 readRecord(gsiRecord, gsiIter, Gsi());
983             }
984             catch (seqan::ParseError const & ioErr)
985             {
986                 std::cerr << "ERROR: Could not read GSI record.\n";
987                 return 1;
988             }
989             if (options.checkSorting && lessThanSamtoolsQueryName(gsiRecord.readName, currentReadName))
990             {
991                 std::cerr << "ERROR: Wrong order in GSI file: " << gsiRecord.readName << " succeeds "
992                           << currentReadName << " in file.\n"
993                           << "File must be sorted by read name/queryname.\n";
994                 return 1;
995             }
996         }
997 
998         // Now, compare the SAM/BAM records against the intervals stored in the GSI records.
999         //
1000         // We collected the records for all queries.  Here, we differentiate between the different cases.
1001         if (seenSingleEnd)
1002         {
1003             int res = benchmarkReadResult(result, currentSamRecords, bamFileIn, currentGsiRecords,
1004                                           faiIndex, refSeqs, refIdMapping, options, tagPattern,
1005                                           /*pairedEnd=*/ false);
1006             if (res != 0)
1007                 return 1;
1008         }
1009         if (seenPairedEnd)
1010         {
1011             int res = benchmarkReadResult(result, currentSamRecords, bamFileIn, currentGsiRecords,
1012                                           faiIndex, refSeqs, refIdMapping, options, tagPattern,
1013                                           /*pairedEnd=*/ true, /*second=*/ false);
1014             if (res != 0)
1015                 return 1;
1016 
1017             res = benchmarkReadResult(result, currentSamRecords, bamFileIn, currentGsiRecords,
1018                                       faiIndex, refSeqs, refIdMapping, options, tagPattern,
1019                                       /*pairedEnd=*/ true, /*second=*/ true);
1020             if (res != 0)
1021                 return 1;
1022         }
1023     }
1024     std::cerr << " DONE\n";
1025 
1026     return 0;
1027 }
1028 
1029 // ---------------------------------------------------------------------------
1030 // Function parseCommandLine()
1031 // ---------------------------------------------------------------------------
1032 
1033 seqan::ArgumentParser::ParseResult
parseCommandLine(RabemaEvaluationOptions & options,int argc,char const ** argv)1034 parseCommandLine(RabemaEvaluationOptions & options, int argc, char const ** argv)
1035 {
1036     // -----------------------------------------------------------------------
1037     // Parse Command Line Using ArgumentParser
1038     // -----------------------------------------------------------------------
1039 
1040     seqan::ArgumentParser parser("rabema_evaluate");
1041     setShortDescription(parser, "RABEMA Evaluation");
1042     setVersion(parser, SEQAN_APP_VERSION " [" SEQAN_REVISION "]");
1043     setDate(parser, SEQAN_DATE);
1044     setCategory(parser, "Benchmarking");
1045 
1046     addUsageLine(parser,
1047                  "[\\fIOPTIONS\\fP] \\fB--reference\\fP \\fIREF.fa\\fP \\fB--in-gsi\\fP \\fIIN.gsi\\fP "
1048                  "\\fB--in-bam\\fP \\fIMAPPING.{sam,bam}\\fP");
1049     addDescription(parser,
1050                    "Compare the SAM/bam output \\fIMAPPING.sam\\fP/\\fIMAPPING.bam\\fP of any read mapper against "
1051                    "the RABEMA gold standard previously built with \\fBrabema_build_gold_standard\\fP.  The input "
1052                    "is a reference FASTA file, a gold standard interval (GSI) file and the SAM/BAM input to "
1053                    "evaluate.");
1054     addDescription(parser,
1055                    "The input SAM/BAM file must be \\fIsorted by queryname\\fP.  The program will create a "
1056                    "FASTA index file \\fIREF.fa.fai\\fP for fast random access to the reference.");
1057 
1058     addOption(parser, seqan::ArgParseOption("v", "verbose", "Enable verbose output."));
1059     addOption(parser, seqan::ArgParseOption("vv", "very-verbose", "Enable even more verbose output."));
1060 
1061     addSection(parser, "Input / Output");
1062     // addOption(parser, seqan::ArgParseOption("o", "out-gsi", "Path to write the resulting GSI file to.",
1063     //                                         seqan::ArgParseArgument::STRING, false, "GSI"));
1064     // setRequired(parser, "out-gsi", true);
1065     addOption(parser, seqan::ArgParseOption("r", "reference", "Path to load reference FASTA from.",
1066                                             seqan::ArgParseArgument::INPUT_FILE, "FASTA"));
1067     setValidValues(parser, "reference", seqan::SeqFileIn::getFileExtensions());
1068     setRequired(parser, "reference", true);
1069     addOption(parser, seqan::ArgParseOption("g", "in-gsi",
1070                                             "Path to load gold standard intervals from. If compressed using gzip, "
1071                                             "the file will be decompressed on the fly.",
1072                                             seqan::ArgParseArgument::INPUT_FILE, "GSI"));
1073     setRequired(parser, "in-gsi", true);
1074     setValidValues(parser, "in-gsi", "gsi gsi.gz");  // GSI (Gold Standard Intervals) Format only.
1075 
1076     addOption(parser, seqan::ArgParseOption("b", "in-bam", "Path to load the read mapper SAM or BAM output from.",
1077                                             seqan::ArgParseArgument::INPUT_FILE, "BAM"));
1078     setValidValues(parser, "in-bam", BamFileIn::getFileExtensions());
1079     setRequired(parser, "in-bam");
1080     addOption(parser, seqan::ArgParseOption("", "out-tsv", "Path to write the statistics to as TSV.",
1081                                             seqan::ArgParseArgument::OUTPUT_FILE, "TSV"));
1082     setValidValues(parser, "out-tsv", "rabema_report_tsv");
1083 
1084     addOption(parser, seqan::ArgParseOption("", "dont-check-sorting",
1085                                             "Do not check sortedness (by name) of input SAM/BAM files.  This is "
1086                                             "required if the reads are not sorted by name in the original FASTQ "
1087                                             "files.  Files from the SRA and ENA generally are sorted."));
1088 
1089     addSection(parser, "Benchmark Parameters");
1090     addOption(parser, seqan::ArgParseOption("", "oracle-mode",
1091                                             "Enable oracle mode.  This is used for simulated data when the input "
1092                                             "GSI file gives exactly one position that is considered as the true "
1093                                             "sample position.  For simulated data."));
1094     addOption(parser, seqan::ArgParseOption("", "only-unique-reads",
1095                                             "Consider only reads that a single alignment in the mapping result file. "
1096                                             "Useful for precision computation."));
1097     addOption(parser, seqan::ArgParseOption("", "match-N", "When set, N matches all characters without penalty."));
1098     addOption(parser, seqan::ArgParseOption("", "distance-metric",
1099                                             "Set distance metric.  Valid values: hamming, edit.  Default: edit.",
1100                                             seqan::ArgParseOption::STRING, "METRIC"));
1101     setValidValues(parser, "distance-metric", "hamming edit");
1102     setDefaultValue(parser, "distance-metric", "edit");
1103 
1104     addOption(parser, seqan::ArgParseOption("e", "max-error",
1105                                             "Maximal error rate to build gold standard for in percent.  This "
1106                                             "parameter is an integer and relative to the read length.  "
1107                                             "The error rate is ignored in oracle mode, here the distance "
1108                                             "of the read at the sample position is taken, individually "
1109                                             "for each read.  Default: 0",
1110                                             seqan::ArgParseArgument::INTEGER, "RATE"));
1111     setDefaultValue(parser, "max-error", 0);
1112 
1113     addOption(parser, seqan::ArgParseOption("c", "benchmark-category",
1114                                             "Set benchmark category.  One of {all, all-best, any-best.  Default: all",
1115                                             seqan::ArgParseOption::STRING, "CAT"));
1116     setValidValues(parser, "benchmark-category", "all all-best any-best");
1117     setDefaultValue(parser, "benchmark-category", "all");
1118 
1119     addOption(parser, seqan::ArgParseOption("", "trust-NM",
1120                                             "When set, we trust the alignment and distance from SAM/BAM file and no "
1121                                             "realignment is performed.  Off by default."));
1122     addOption(parser, seqan::ArgParseOption("", "extra-pos-tag",
1123                                             "If the CIGAR string is absent, the missing alignment end position can be "
1124                                             "provided by this BAM tag.",
1125                                             seqan::ArgParseOption::STRING));
1126 
1127     addOption(parser, seqan::ArgParseOption("", "ignore-paired-flags",
1128                                             "When set, we ignore all SAM/BAM flags related to pairing.  This is "
1129                                             "necessary when analyzing SAM from SOAP's soap2sam.pl script."));
1130     addOption(parser, seqan::ArgParseOption("", "DONT-PANIC",
1131                                             "Do not stop program execution if an additional hit was found that "
1132                                             "indicates that the gold standard is incorrect."));
1133 
1134     addSection(parser, "Logging");
1135     addText(parser, "");
1136     addText(parser,
1137             "The occurrence of \"invalid\" hits in the read mapper's output is not an error.  If there are "
1138             "additional hits, however, this shows an error in the gold standard.");
1139     addOption(parser, seqan::ArgParseOption("", "show-missed-intervals",
1140                                             "Show details for each missed interval from the GSI."));
1141     addOption(parser, seqan::ArgParseOption("", "show-invalid-hits",
1142                                             "Show details for invalid hits (with too high error rate)."));
1143     addOption(parser, seqan::ArgParseOption("", "show-additional-hits",
1144                                             "Show details for additional hits (low enough error rate but not in "
1145                                             "gold standard."));
1146     addOption(parser, seqan::ArgParseOption("", "show-hits", "Show details for hit intervals."));
1147     addOption(parser, seqan::ArgParseOption("", "show-try-hit", "Show details for each alignment in SAM/BAM input."));
1148 
1149     addTextSection(parser, "Return Values");
1150     addText(parser, "A return value of 0 indicates success, any other value indicates an error.");
1151 
1152     // addTextSection(parser, "Examples");
1153 
1154     // addListItem(parser,
1155     //             "\\fBrabema_build_gold_standard\\fP \\fB-e\\fP \\fI4\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-i\\fP "
1156     //                 "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1157     //             "Build gold standard from a SAM file \\fIIN.sam\\fP with all mapping locations and a FASTA "
1158     //             "reference \\fIREF.fa\\fP to GSI file \\fIOUT.gsi\\fP with a maximal error rate of \\fI4\\fP.");
1159     // addListItem(parser,
1160     //             "\\fBrabema_build_gold_standard\\fP \\fB--distance-metric\\fP \\fIedit\\fP \\fB-e\\fP \\fI4\\fP "
1161     //                 "\\fB-o\\fP \\fIOUT.gsi\\fP \\fB-i\\fP \\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1162     //             "Same as above, but using Hamming instead of edit distance.");
1163     // addListItem(parser,
1164     //             "\\fBrabema_build_gold_standard\\fP \\fB--oracle-mode\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-i\\fP "
1165     //                 "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1166     //             "Build gold standard from a SAM file \\fIIN.sam\\fP with the original sample position, e.g.  "
1167     //             "as exported by read simulator Mason.");
1168 
1169     addTextSection(parser, "Memory Requirements");
1170     addText(parser,
1171             "From version 1.1, great care has been taken to keep the memory requirements as low as possible.");
1172     addText(parser,
1173             "The evaluation step needs to store the whole reference sequence in memory but little more "
1174             "memory.  So, for the human genome, the memory requirements are below 4 GB, regardless of "
1175             "the size of the GSI or SAM/BAM file.");
1176 
1177     addTextSection(parser, "References");
1178     addText(parser,
1179             "M. Holtgrewe, A.-K. Emde, D. Weese and K. Reinert.  A Novel And Well-Defined Benchmarking Method "
1180             "For Second Generation Read Mapping, BMC Bioinformatics 2011, 12:210.");
1181     addListItem(parser, "\\fIhttp://www.seqan.de/rabema\\fP", "RABEMA Homepage");
1182     addListItem(parser, "\\fIhttp://www.seqan.de/mason\\fP", "Mason Homepage");
1183 
1184     // Actually do the parsing and exit on error, help display etc.
1185     seqan::ArgumentParser::ParseResult res = parse(parser, argc, argv);
1186     if (res != seqan::ArgumentParser::PARSE_OK)
1187         return res;
1188 
1189     // -----------------------------------------------------------------------
1190     // Fill BuildGoldStandardOptions Object
1191     // -----------------------------------------------------------------------
1192 
1193     if (isSet(parser, "verbose"))
1194         options.verbosity = 2;
1195     if (isSet(parser, "very-verbose"))
1196         options.verbosity = 3;
1197 
1198     if (isSet(parser, "reference"))
1199         getOptionValue(options.referencePath, parser, "reference");
1200     if (isSet(parser, "in-bam"))
1201         getOptionValue(options.inBamPath, parser, "in-bam");
1202     if (isSet(parser, "in-gsi"))
1203         getOptionValue(options.inGsiPath, parser, "in-gsi");
1204     if (isSet(parser, "out-tsv"))
1205         getOptionValue(options.outTsvPath, parser, "out-tsv");
1206 
1207     getOptionValue(options.maxError, parser, "max-error");
1208     options.matchN = isSet(parser, "match-N");
1209     options.oracleMode = isSet(parser, "oracle-mode");
1210     options.onlyUniqueReads = isSet(parser, "only-unique-reads");
1211     CharString benchmarkCategory;
1212     getOptionValue(benchmarkCategory, parser, "benchmark-category");
1213     if (benchmarkCategory == "all")
1214         options.benchmarkCategory = CATEGORY_ALL;
1215     else if (benchmarkCategory == "all-best")
1216         options.benchmarkCategory = CATEGORY_ALL_BEST;
1217     else  // if (benchmarkCategory == "any-best")
1218         options.benchmarkCategory = CATEGORY_ANY_BEST;
1219     CharString distanceMetric;
1220     getOptionValue(distanceMetric, parser, "distance-metric");
1221     if (distanceMetric == "edit")
1222         options.distanceMetric = EDIT_DISTANCE;
1223     else
1224         options.distanceMetric = HAMMING_DISTANCE;
1225     options.trustNM = isSet(parser, "trust-NM");
1226     getOptionValue(options.extraPosTag, parser, "extra-pos-tag");
1227     options.ignorePairedFlags = isSet(parser, "ignore-paired-flags");
1228     options.dontPanic = isSet(parser, "DONT-PANIC");
1229 
1230     getOptionValue(options.checkSorting, parser, "dont-check-sorting");
1231     options.checkSorting = !options.checkSorting;
1232 
1233     options.showMissedIntervals = isSet(parser, "show-missed-intervals");
1234     options.showSuperflousIntervals = isSet(parser, "show-invalid-hits");
1235     options.showAdditionalIntervals = isSet(parser, "show-additional-hits");
1236     options.showHitIntervals = isSet(parser, "show-hits");
1237     options.showTryHitIntervals = isSet(parser, "show-try-hit");
1238 
1239     return res;
1240 }
1241 
1242 // ----------------------------------------------------------------------------
1243 // Function main()
1244 // ----------------------------------------------------------------------------
1245 
main(int argc,char const ** argv)1246 int main(int argc, char const ** argv)
1247 {
1248     // Parse command line and store results in options.
1249     RabemaEvaluationOptions options;
1250     seqan::ArgumentParser::ParseResult parseRes = parseCommandLine(options, argc, argv);
1251     if (parseRes != seqan::ArgumentParser::PARSE_OK)
1252         return parseRes == seqan::ArgumentParser::PARSE_ERROR;
1253 
1254     double startTime = 0;  // For measuring time below.
1255 
1256     std::cerr << "==============================================================================\n"
1257               << "                RABEMA - Read Alignment BEnchMArk\n"
1258               << "==============================================================================\n"
1259               << "                        Result Comparison\n"
1260               << "==============================================================================\n"
1261               << "\n";
1262     std::cerr << "____OPTIONS___________________________________________________________________\n\n";
1263 
1264     std::cerr << "Max error rate [%]    " << options.maxError << "\n"
1265               << "Oracle mode           " << yesNo(options.oracleMode) << '\n'
1266               << "Only unique reads     " << yesNo(options.onlyUniqueReads) << '\n'
1267               << "Benchmark category    " << categoryName(options.benchmarkCategory) << "\n"
1268               << "Distance measure      " << metricName(options.distanceMetric) << "\n"
1269               << "Match Ns              " << yesNo(options.matchN) << '\n'
1270               << "Trust NM tag          " << yesNo(options.trustNM) << '\n'
1271               << "Ignore paired flags   " << yesNo(options.ignorePairedFlags) << '\n'
1272               << "GSI File              " << options.inGsiPath << '\n'
1273               << "BAM File              " << options.inBamPath << '\n'
1274               << "Reference File        " << options.referencePath << '\n'
1275               << "TSV Output File       " << options.outTsvPath << '\n'
1276               << "Check Sorting         " << yesNo(options.checkSorting) << '\n'
1277               << "Show\n"
1278               << "    additional        " << yesNo(options.showAdditionalIntervals) << '\n'
1279               << "    hit               " << yesNo(options.showHitIntervals) << '\n'
1280               << "    missed            " << yesNo(options.showMissedIntervals) << '\n'
1281               << "    superfluous       " << yesNo(options.showSuperflousIntervals) << '\n'
1282               << "    try hit           " << yesNo(options.showTryHitIntervals) << '\n'
1283               << "\n";
1284 
1285     std::cerr << "____LOADING FILES_____________________________________________________________\n\n";
1286 
1287     // =================================================================
1288     // Prepare File I/O.
1289     // =================================================================
1290 
1291     startTime = sysTime();
1292     // Open reference FAI index.
1293     std::cerr << "Reference Index           " << options.referencePath << ".fai ...";
1294     FaiIndex faiIndex;
1295     if (!open(faiIndex, toCString(options.referencePath)))
1296     {
1297         std::cerr << " FAILED (not fatal, we can just build it)\n";
1298         std::cerr << "Building Index        " << options.referencePath << ".fai ...";
1299         if (!build(faiIndex, toCString(options.referencePath)))
1300         {
1301             std::cerr << "Could not build FAI index.\n";
1302             return 1;
1303         }
1304         std::cerr << " OK\n";
1305         seqan::CharString faiPath = options.referencePath;
1306         append(faiPath, ".fai");
1307         std::cerr << "Reference Index       " << faiPath << " ...";
1308         try
1309         {
1310             save(faiIndex, toCString(faiPath));
1311             std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
1312         }
1313         catch (IOError const & ioErr)
1314         {
1315             std::cerr << "Could not write FAI index we just built.\n";
1316             return 1;
1317         }
1318     }
1319     std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
1320 
1321     std::cerr << "Reference Sequences       " << options.referencePath << " ...";
1322     StringSet<Dna5String> refSeqs;
1323     resize(refSeqs, length(faiIndex.seqNameStore));
1324     for (unsigned i = 0; i < length(faiIndex.seqNameStore); ++i)
1325     {
1326         reserve(refSeqs[i], sequenceLength(faiIndex, i), Exact());
1327         try
1328         {
1329             readSequence(refSeqs[i], faiIndex, i);
1330         }
1331         catch (seqan::ParseError const & ioErr)
1332         {
1333             std::cerr << "ERROR: Could not read sequence " << faiIndex.seqNameStore[i] << ".\n";
1334             return 0;
1335         }
1336     }
1337     std::cerr << " OK\n";
1338 
1339     // Open gold standard intervals (GSI) file and read in header.
1340     std::cerr << "Gold Standard Intervals   " << options.inGsiPath << " (header) ...";
1341     VirtualStream<char, Input> inGsi;
1342     if (!open(inGsi, toCString(options.inGsiPath)))
1343     {
1344         std::cerr << "Could not open GSI file.\n";
1345         return 1;
1346     }
1347     if (!isEqual(format(inGsi), Nothing()))
1348         std::cerr << " (is compressed)";
1349     DirectionIterator<VirtualStream<char, Input>, Input>::Type inGsiIter = directionIterator(inGsi, Input());
1350     GsiHeader gsiHeader;
1351     try
1352     {
1353         readHeader(gsiHeader, inGsiIter, Gsi());
1354         std::cerr << " OK\n";
1355     }
1356     catch (seqan::ParseError const & ioErr)
1357     {
1358         std::cerr << "Could not read GSI header(" << ioErr.what() << ").\n";
1359         return 1;
1360     }
1361 
1362     // Open SAM/BAM file and read in header.
1363     BamFileIn bamFileIn;
1364     if (!open(bamFileIn, toCString(options.inBamPath)))
1365     {
1366         std::cerr << "Could not open SAM file." << std::endl;
1367         return 1;
1368     }
1369     BamHeader bamHeader;
1370     try
1371     {
1372         std::cerr << "Alignments                " << options.inBamPath << " (header) ...";
1373         readHeader(bamHeader, bamFileIn);
1374         std::cerr << " OK\n";
1375     }
1376     catch (seqan::ParseError const & ioErr)
1377     {
1378         std::cerr << "Could not read SAM header (" << ioErr.what() << ").\n";
1379         return 1;
1380     }
1381 
1382     // The SAM/BAM file has to be sorted by query name.
1383     //
1384     // We do not look at the SAM header for read ordering since samtools sort does not update the SAM header.  This
1385     // means users would have to update it manually after sorting which is painful.  We will check SAM record order on
1386     // the fly.
1387 
1388     std::cerr << "\nTook " << sysTime() - startTime << "s\n";
1389 
1390     // =================================================================
1391     // Stream the alignments against gold standard intervals and check.
1392     // =================================================================
1393 
1394     std::cerr << "\n____COMPARING ALIGNMENTS WITH INTERVALS_______________________________________\n\n";
1395 
1396     startTime = sysTime();
1397     // The result will be a list of ids to entries in witStore.
1398     int res = 0;
1399     RabemaStats result(options.maxError);
1400     if (options.distanceMetric == EDIT_DISTANCE)
1401         res = compareAlignedReadsToReference(result, bamFileIn, faiIndex, refSeqs,
1402                                              inGsiIter, options, MyersUkkonenReads());
1403     else  // options.distanceMetric == HAMMING_DISTANCE
1404         res = compareAlignedReadsToReference(result, bamFileIn, faiIndex, refSeqs,
1405                                              inGsiIter, options, HammingSimple());
1406     if (res != 0)
1407         return 1;
1408 
1409     std::cerr << "\nTook " << sysTime() - startTime << " s\n";
1410 
1411     std::cerr << "\n____RESULTING STATISTICS______________________________________________________\n\n";
1412 
1413     std::cerr << "Note that in all-best and any-best mode, we differentiate the intervals we\n"
1414               << "found and those we have to find by their distance.  This is not possible in\n"
1415               << "all mode since a multiple lower-error intervals might be contained in an\n"
1416               << "higher-error interval.\n"
1417               << '\n'
1418               << "Alignments will be marked as \"invalid\" if they have a higher error rate than\n"
1419               << "allowed, i.e. they might become valid when increasing the allowed error rate.\n"
1420               << '\n'
1421               << "In \"all\" mode, only intervals from the maximal level are required.  In \"all-best\"\n"
1422               << "and \"any-best\" mode, the intervals are relabeled with the smallest distance that\n"
1423               << "a containing interval has.  Contained intervals are then removed.\n\n\n";
1424 
1425     // int maxError = -1;
1426     // if (!options.oracleMode && options.benchmarkCategory == CATEGORY_ALL)
1427     //     maxError = options.maxError;
1428     write(std::cout, result, /*options.maxError,*/ Raw());
1429 
1430     if (!empty(options.outTsvPath))
1431     {
1432         std::cerr << '\n'
1433                   << "Writing output TSV        " << options.outTsvPath << " ...";
1434         std::ofstream tsvOut(toCString(options.outTsvPath), std::ios::out | std::ios::binary);
1435         bool failed = false;
1436         if (!tsvOut.good())
1437         {
1438             failed = true;
1439             std::cerr << " FAILED - could not open output file!\n";
1440         }
1441         else if (write(tsvOut, result, options.maxError, categoryName(options.benchmarkCategory),
1442                        options.oracleMode, metricName(options.distanceMetric), Tsv()) != 0)
1443         {
1444             failed = true;
1445             std::cerr << " FAILED - error writing to output file!\n";
1446         }
1447         if (!failed)
1448             std::cerr << " OK\n";
1449     }
1450 
1451     return 0;
1452 }
1453