apps/rabema/rabema_evaluate.cpp

// ==========================================================================
//                      RABEMA Read Alignment Benchmark
// ==========================================================================
// Copyright (C) 2010-2012 Manuel Holtgrewe, FU Berlin
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along
// with this program.  If not, see <http://www.gnu.org/licenses/>.
//
// ==========================================================================
// Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
// ==========================================================================
// RABEMA is a read mapping benchmark tool.  See README for an overview.
//
// This is the program for comparing the SAM output of a read mapper to a
// Gold Standard Intervals (GSI) file.
// ==========================================================================

#include <seqan/arg_parse.h>
#include <seqan/bam_io.h>
#include <seqan/basic.h>
#include <seqan/find.h>
#include <seqan/misc/interval_tree.h>
#include <seqan/store.h>
#include <seqan/seq_io.h>

#include "find_hamming_simple_ext.h"
#include "find_myers_ukkonen_ext.h"
#include "find_myers_ukkonen_reads.h"
#include "io_gsi.h"
#include "rabema_stats.h"
#include "ref_id_mapping.h"
#include "sorting.h"

// ============================================================================
// Enums, Tags, Classes.
// ============================================================================

// ---------------------------------------------------------------------------
// Enum BenchmarkCategory
// ---------------------------------------------------------------------------

// Benchmark category selected for the evaluation; categoryName() yields the
// textual name of each value.
enum BenchmarkCategory
{
    CATEGORY_ALL      = 0,  // "all"
    CATEGORY_ALL_BEST = 1,  // "all-best"
    CATEGORY_ANY_BEST = 2   // "any-best"
};

// ---------------------------------------------------------------------------
// Enum DistanceMetric
// ---------------------------------------------------------------------------

// Distance metric selected for the evaluation; metricName() yields the
// textual name of each value.
enum DistanceMetric
{
    HAMMING_DISTANCE = 0,  // "hamming"
    EDIT_DISTANCE    = 1   // "edit"
};

// ----------------------------------------------------------------------------
// Class RabemaEvaluationOptions
// ----------------------------------------------------------------------------

// Configuration of the evaluation: benchmark parameters, input/output paths and logging switches.
class RabemaEvaluationOptions
{
public:
    // Verbosity level, quiet: 0, normal: 1, verbose: 2, very verbose: 3.
    int verbosity;

    // ------------------------------------------------------------------------
    // Benchmark Related
    // ------------------------------------------------------------------------

    // Maximum number of errors, given as a percentage of the read length.
    int maxError;

    // If true, N matches as a wildcard.  Otherwise it matches none.
    bool matchN;

    // If enabled, the distance of alignments is considered to be 0.  This is used when comparing against GSI files that
    // were generated by the read simulator.  In this case, the intervals will have a length of 1 and be the single
    // original position of this read.
    bool oracleMode;

    // Consider only reads that have a unique match in the mapping result file.  Useful for precision computation.
    bool onlyUniqueReads;

    // The benchmark category, one of {"all", "any-best", "all-best"}.
    BenchmarkCategory benchmarkCategory;

    // Distance function to use, also see validDistanceFunction.
    DistanceMetric distanceMetric;

    // If true then we trust the "NM" tag of the SAM alignment if present.  Otherwise, we perform a realignment at the
    // target position.
    bool trustNM;

    // If the CIGAR string is absent, this tag provides the position of the other end of the alignment.
    CharString extraPosTag;

    // If enabled then all reads are treated as single-end mapped.  This is required for analyzing SOAP output that has
    // been converted to SAM since it will set the paired-end flags.
    bool ignorePairedFlags;

    // If enabled, don't panic on additional hits in non-weighted mode.
    bool dontPanic;

    // ------------------------------------------------------------------------
    // Input / Output
    // ------------------------------------------------------------------------

    // Path to reference sequence file.
    CharString referencePath;

    // Path to input GSI file.
    CharString inGsiPath;

    // Exactly one of the following has to be set.
    // NOTE(review): only a single input path (inBamPath) follows, so the sentence above may be stale -- confirm.
    //
    // Path to input SAM or BAM file.
    CharString inBamPath;

    // Path to output TSV stats file.
    CharString outTsvPath;

    // Whether to check sorting or not.
    bool checkSorting;

    // ------------------------------------------------------------------------
    // Logging configuration.
    // ------------------------------------------------------------------------

    // Print the missed intervals to stderr for debugging purposes.
    bool showMissedIntervals;

    // Print superfluous intervals (intervals found in the BAM file whose score is too bad).
    bool showSuperflousIntervals;

    // Print additional intervals (intervals found in the BAM file with a good score that are not in the WIT file).
    // NOTE(review): "WIT file" presumably refers to the GSI input file -- confirm.
    bool showAdditionalIntervals;

    // Print the hit intervals to stderr for debugging purposes.
    bool showHitIntervals;

    // Print each end position that we try to match against the interval.
    bool showTryHitIntervals;

    // Sets the defaults: normal verbosity, 0% maximal error, edit distance, category "all", sorting check enabled and
    // all other switches off.  The string members (tag and paths) are not listed and are default-constructed.
    RabemaEvaluationOptions() :
        verbosity(1),
        maxError(0),
        matchN(false),
        oracleMode(false),
        onlyUniqueReads(false),
        benchmarkCategory(CATEGORY_ALL),
        distanceMetric(EDIT_DISTANCE),
        trustNM(false),
        ignorePairedFlags(false),
        dontPanic(false),
        checkSorting(true),
        // outPath("-"),
        showMissedIntervals(false),
        showSuperflousIntervals(false),
        showAdditionalIntervals(false),
        showHitIntervals(false),
        showTryHitIntervals(false)
    {}
};

// ----------------------------------------------------------------------------
// Helper Class CmpGsiRecordLowering.
// ----------------------------------------------------------------------------

// Comparison functor for lexicographically sorting by (readId, contigId, firstPos) ascending, with ties broken by
// lastPos descending and then distance descending.

struct CmpGsiRecordLowering
{
    bool operator()(GsiRecord const & lhs, GsiRecord const & rhs) const
    {
        return (lhs.readId < rhs.readId) || (lhs.readId == rhs.readId && lhs.contigId < rhs.contigId) ||
               (lhs.readId == rhs.readId && lhs.contigId == rhs.contigId && lhs.firstPos < rhs.firstPos) ||
               (lhs.readId == rhs.readId && lhs.contigId == rhs.contigId && lhs.firstPos == rhs.firstPos &&
                lhs.lastPos > rhs.lastPos) ||
               (lhs.readId == rhs.readId && lhs.contigId == rhs.contigId && lhs.firstPos == rhs.firstPos &&
                lhs.lastPos == rhs.lastPos && lhs.distance > rhs.distance);
    }

};

// ============================================================================
// Metafunctions
// ============================================================================

// ============================================================================
// Functions
// ============================================================================

// ---------------------------------------------------------------------------
// Function categoryName()
// ---------------------------------------------------------------------------

// Returns the textual name of a benchmark category: "all" for CATEGORY_ALL, "all-best" for CATEGORY_ALL_BEST and
// "any-best" for every other value.
char const * categoryName(BenchmarkCategory cat)
{
    switch (cat)
    {
    case CATEGORY_ALL:
        return "all";

    case CATEGORY_ALL_BEST:
        return "all-best";

    default:  // CATEGORY_ANY_BEST
        return "any-best";
    }
}

// ---------------------------------------------------------------------------
// Function metricName()
// ---------------------------------------------------------------------------

// Returns the textual name of a distance metric: "edit" for EDIT_DISTANCE, "hamming" for every other value.
char const * metricName(DistanceMetric met)
{
    return (met == EDIT_DISTANCE) ? "edit" : "hamming";
}

// ---------------------------------------------------------------------------
// Function yesNo()
// ---------------------------------------------------------------------------

// Maps a flag to the string "yes" (true) or "no" (false).
char const * yesNo(bool b)
{
    return b ? "yes" : "no";
}

// ----------------------------------------------------------------------------
// Function performIntervalLowering()
// ----------------------------------------------------------------------------

// Relabel intervals with the smallest distance of a contained interval.  Filter out intervals with distance > maxError.

// Lowers the distances of the given GSI records and replaces gsiRecords by the filtered result (in place).
//
// gsiRecords  Records to process; the SEQAN_ASSERT_EQ checks below expect all of them to carry the same readName.
//             Does nothing if empty.  On return it holds the filtered records written in step 2.
// maxError    Upper bound used when lowering (step 1) and when enumerating error levels (step 2).
//
// The records are sorted with CmpGsiRecordLowering and processed twice with a stack of "open" intervals:
//
//   Step 1  For every record, the distance of each open interval is lowered to the minimum of its own and the
//           current record's distance, going from the bottom of the stack upwards and stopping at the first open
//           interval whose distance exceeds maxError.
//   Step 2  Intervals are only pushed if the stack is empty or the top has a strictly greater distance.  When an
//           interval is closed, one copy per error level is appended to the result and the level is stored in the
//           copy's originalDistance member.
void performIntervalLowering(String<GsiRecord> & gsiRecords, int maxError)
{
    if (empty(gsiRecords))
        return;

    typedef Iterator<String<GsiRecord> >::Type TIterator;
    typedef IntervalAndCargo<unsigned, unsigned> TInterval;

    // Step 1: Adjust distances.
    std::sort(begin(gsiRecords, Standard()), end(gsiRecords, Standard()), CmpGsiRecordLowering());

    // Add sentinel interval: a copy of the last record moved to the largest position.  It is visited last, so the
    // intervals that are still open at that point are treated as ended (and written out in step 2).
    GsiRecord sentinel(back(gsiRecords));
    sentinel.firstPos = std::numeric_limits<size_t>::max();
    sentinel.lastPos = std::numeric_limits<size_t>::max();
    appendValue(gsiRecords, sentinel);

    // Stack of open intervals; the cargo of each entry is the index of its record in gsiRecords.
    String<TInterval> openIntervals;
    unsigned i = 0;
    for (TIterator it = begin(gsiRecords, Standard()), itEnd = end(gsiRecords, Standard()); it != itEnd; ++it, ++i)
    {
        // Count the open intervals that lie on another contig or end before the current record begins, then drop
        // that many entries from the top of the stack.
        unsigned count = 0;
        for (unsigned j = 0; j < length(openIntervals); ++j)
        {
            unsigned idx = length(openIntervals) - 1 - j;
            // Bind by reference: nothing in this loop modifies gsiRecords, so no per-iteration copy is needed.
            GsiRecord const & thisIntervalRecord = gsiRecords[cargo(openIntervals[idx])];
            SEQAN_ASSERT_EQ(thisIntervalRecord.readName, it->readName);
            if (thisIntervalRecord.contigId != it->contigId || thisIntervalRecord.lastPos < it->firstPos)
                count += 1;
        }
        resize(openIntervals, length(openIntervals) - count);

        // Perform distance lowering for containing intervals, from the bottom of the stack upwards.
        for (unsigned j = 0; j < length(openIntervals); ++j)
        {
            unsigned id = cargo(openIntervals[j]);
            if (gsiRecords[id].distance <= maxError)
                gsiRecords[id].distance = _min(gsiRecords[id].distance, it->distance);
            else
                break;  // All containing intervals must have a greater distance.
        }

        // Intervals are stored half-open, hence lastPos + 1.
        appendValue(openIntervals, TInterval(it->firstPos, it->lastPos + 1, i));
    }

    // Step 2: Filter out intervals that are contained in intervals of lesser/equal distance.
    String<GsiRecord> filteredGsiRecords;
    clear(openIntervals);
    i = 0;
    for (TIterator it = begin(gsiRecords, Standard()), itEnd = end(gsiRecords, Standard()); it != itEnd; ++it, ++i)
    {
        // Remove non-overlapping intervals on top of openInterval stack, appending to filtered intervals.
        unsigned count = 0;
        for (unsigned j = 0; j < length(openIntervals); ++j)
        {
            unsigned idx = length(openIntervals) - 1 - j;
            // Refers into gsiRecords, which is not modified in this loop (appends go to filteredGsiRecords).
            GsiRecord const & thisIntervalRecord = gsiRecords[cargo(openIntervals[idx])];
            SEQAN_ASSERT_EQ(thisIntervalRecord.readName, it->readName);
            if (thisIntervalRecord.contigId != it->contigId || thisIntervalRecord.lastPos < it->firstPos)
            {
                count += 1;

                // First error level to write out for this interval.  If the record written last lies on the same
                // contig and reaches into this interval, continue one level above that record's distance.
                unsigned startDistance = thisIntervalRecord.distance;
                if (!empty(filteredGsiRecords) &&
                    back(filteredGsiRecords).lastPos >= leftBoundary(openIntervals[idx]) &&
                    back(filteredGsiRecords).contigId == thisIntervalRecord.contigId)
                {
                    // Assert current containing already written out.
                    SEQAN_ASSERT_GEQ(back(filteredGsiRecords).firstPos, leftBoundary(openIntervals[idx]));
                    SEQAN_ASSERT_LEQ(back(filteredGsiRecords).lastPos + 1, rightBoundary(openIntervals[idx]));
                    // Get start distance.
                    startDistance = back(filteredGsiRecords).distance + 1;
                }

                // Write at least one copy, even if startDistance already exceeds maxError.
                unsigned upperLimit = maxError;
                if (static_cast<unsigned>(maxError) < startDistance)
                    upperLimit = startDistance;
                // Loop variable deliberately not named i, which is the record index of the enclosing loop.
                for (unsigned level = startDistance; level <= upperLimit; ++level)
                {
                    appendValue(filteredGsiRecords, thisIntervalRecord);
                    back(filteredGsiRecords).originalDistance = level;
                }
            }
        }
        resize(openIntervals, length(openIntervals) - count);

        // Add interval to the stack of intervals.
        if (empty(openIntervals) || gsiRecords[cargo(back(openIntervals))].distance > it->distance)
            appendValue(openIntervals, TInterval(it->firstPos, it->lastPos + 1, i));
    }
    move(gsiRecords, filteredGsiRecords);
}

// ----------------------------------------------------------------------------
// Function benchmarkReadResult()
// ----------------------------------------------------------------------------

template <typename TPatternSpec>
int benchmarkReadResult(RabemaStats & result,
                        String<BamAlignmentRecord> const & samRecords,
                        BamFileIn const & bamFileIn,
                        String<GsiRecord> const & gsiRecords,
                        FaiIndex const & faiIndex,
                        StringSet<Dna5String> const & refSeqs,
                        RefIdMapping const & refIdMapping,
                        RabemaEvaluationOptions const & options,
                        TPatternSpec const & /*tagPattern*/,
                        bool pairedEnd = false,
                        bool second = false)
{
    typedef IntervalAndCargo<unsigned, unsigned> TInterval;
#define DEBUG_RABEMA 0
#if DEBUG_RABEMA
    std::cerr << ",--\n"
              << "| num SAM\t" << length(samRecords) << "\n"
              << "| num GSI\t" << length(gsiRecords) << "\n"
              << "`--\n";
    if (!empty(samRecords))
        std::cerr << "SAM ID\t" << samRecords[0].qName << "\n";
    if (!empty(gsiRecords))
        std::cerr << "GSI ID\t" << gsiRecords[0].readName << "\n";
#endif  // #if DEBUG_RABEMA

    if (options.oracleMode && empty(gsiRecords))
    {
        // There are no GSI intervals in oracle mode.  This can be the case if we constructed the gold standard
        // with a maximal error rate.  If this is the case then we ignore these reads.
        return 0;
    }

    if (options.onlyUniqueReads && length(samRecords) != 1)
    {
        // The read was mapped non-uniquely by the mapper. Either it was not found or found at multiple locations
        // However, to compute the precision of mapper we only evaluate whether reads mapped to a single location
        // were correctly mapped.
        return 0;
    }

    // Select gold standard intervals (GSI) records.
    //
    // We only select intervals that match the specification of pairedEnd, second.  Also, the intervals must be for an
    // error rate less than or equal to the maximal configured one.  In case of any-best or all-best mode, we only
    // select those of the eligible intervals with the lowest error rate.
    //
    // In case of oracle mode, we ignore the distance of the intervals in the GSI file here but use it later on.
    //
    // Start with picking the smallest distance if *-best mode.
    int smallestDistance = options.oracleMode ? std::numeric_limits<int>::max() : options.maxError;
    // Note that smallestDistance (as bestDistance defined below) is expressed as percent of read length ceiled
    // and cat to an int value.
    if (options.oracleMode || options.benchmarkCategory == CATEGORY_ANY_BEST ||
        options.benchmarkCategory == CATEGORY_ALL_BEST)
        for (unsigned i = 0; i < length(gsiRecords); ++i)
            smallestDistance = std::min(smallestDistance, gsiRecords[i].distance);
    int largestDistance = options.maxError;
    if (options.oracleMode && smallestDistance != std::numeric_limits<int>::max())
        for (unsigned i = 0; i < length(gsiRecords); ++i)
            largestDistance = std::max(largestDistance, gsiRecords[i].distance);
    String<GsiRecord> pickedGsiRecords;
#if DEBUG_RABEMA
    std::cerr << "--\n";
#endif  // #if DEBUG_RABEMA
    for (unsigned i = 0; i < length(gsiRecords); ++i)
    {
#if DEBUG_RABEMA
        std::cerr << "ORIGINAL\t" << gsiRecords[i] << "\t" << gsiRecords[i].originalDistance << "\n";
#endif  // DEBUG_RABEMA

        // Note: In case of oracle mode, we ignore the distance.
        // TODO(holtgrew): Remove the following two lines.
        //if (!options.oracleMode && gsiRecords[i].distance > smallestDistance)
        //    continue;  // Skip with wrong distance.
        if (!options.oracleMode && gsiRecords[i].distance > options.maxError)
            continue;  // Ignore intervals with too high error rate.
        if (!pairedEnd && (gsiRecords[i].flags & GsiRecord::FLAG_PAIRED))
            continue;  // Skip paired if non-paired selected.
        if (pairedEnd && !(gsiRecords[i].flags & GsiRecord::FLAG_PAIRED))
            continue;  // Skip non-paired if paired selected.
        if (pairedEnd && second && !(gsiRecords[i].flags & GsiRecord::FLAG_SECOND_MATE))
            continue;  // Skip if second selected but this interval is not for second.

        appendValue(pickedGsiRecords, gsiRecords[i]);
#if DEBUG_RABEMA
        std::cerr << "PICKED\t" << gsiRecords[i] << "\t" << gsiRecords[i].originalDistance << "\n";
#endif  // DEBUG_RABEMA

        // Get index of the sequence from GSI record contig name.
        if (!getIdByName(back(pickedGsiRecords).contigId, faiIndex, back(pickedGsiRecords).contigName))
        {
            std::cerr << "ERROR: Could not find reference sequence for name "
                      << back(pickedGsiRecords).contigName << '\n';
            return 1;
        }
    }
#if DEBUG_RABEMA
    std::cerr << "--\n";
#endif  // DEBUG_RABEMA

    // On these selected GSI records, we now perform interval lowering if not in oracle mode..  This means that an
    // interval I with distance k_i containing an interval J with distance k_j < k_i is re-labeled with a distance of
    // k_j for the smallest k_j of all contained intervals J.  The original distance is stored in the originalDistance
    // member.
    if (!options.oracleMode)
        performIntervalLowering(pickedGsiRecords, options.maxError);

#if DEBUG_RABEMA
    for (unsigned i = 0; i < length(pickedGsiRecords); ++i)
        std::cerr << "LOWERED\t" << pickedGsiRecords[i] << "\t" << pickedGsiRecords[i].originalDistance << "\n";
    std::cerr << "--\n";
#endif  // DEBUG_RABEMA

    // Next we filter the lowered intervals to those with an original distance of options.maxError.  In the case of
    // all-best and any-best we also only keep those with the smallest distance of all intervals for this read.
    String<GsiRecord> filteredGsiRecords;
    for (unsigned i = 0; i < length(pickedGsiRecords); ++i)
    {
        GsiRecord const & record = pickedGsiRecords[i];
        bool keep = (record.originalDistance == options.maxError);
        if (!options.oracleMode && options.benchmarkCategory != CATEGORY_ALL)
            keep = keep && (record.distance == smallestDistance);
        if (options.oracleMode && record.originalDistance <= largestDistance)
            keep = true;
        if (keep)
            appendValue(filteredGsiRecords, record);
    }

    // Build string of intervals for each reference sequence from these lowered and filtered records.
    String<String<TInterval> > intervals;
    resize(intervals, length(refSeqs));
    String<unsigned> numIntervalsForErrorRate;
    resize(numIntervalsForErrorRate, options.maxError + 1, 0);
    String<int> intervalDistances;  // Distance of interval i.
    unsigned numIntervals = 0;
    for (unsigned i = 0; i < length(filteredGsiRecords); ++i)
    {
#if DEBUG_RABEMA
        std::cerr << "USED\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
#endif  // #if DEBUG_RABEMA
        int distance = filteredGsiRecords[i].distance;

        appendValue(intervals[filteredGsiRecords[i].contigId],
                    TInterval(filteredGsiRecords[i].firstPos, filteredGsiRecords[i].lastPos + 1,
                              length(intervalDistances)));
        appendValue(intervalDistances, distance);
        numIntervals += 1;
        if (distance >= (int)length(numIntervalsForErrorRate))
            resize(numIntervalsForErrorRate, distance + 1, 0);
        if (!options.oracleMode && options.benchmarkCategory != CATEGORY_ANY_BEST)
            numIntervalsForErrorRate[distance] += 1;
    }
    if (options.benchmarkCategory == CATEGORY_ANY_BEST && !empty(filteredGsiRecords))
        numIntervalsForErrorRate[smallestDistance] += 1;
    // Marker array that states whether an interval was hit.
    String<bool> intervalHit;
    resize(intervalHit, length(intervalDistances), false);

    // Build interval trees.
    String<IntervalTree<unsigned> > intervalTrees;
    resize(intervalTrees, length(refSeqs));
    for (unsigned i = 0; i < length(intervalTrees); ++i)
        createIntervalTree(intervalTrees[i], intervals[i]);

    // One of the SAM records must have a non-empty SEQ field, extract the read seq.
    Dna5String readSeqL, readSeqR;
    bool seenL = false, seenR = false;
    for (unsigned i = 0; i < length(samRecords); ++i)
    {
        seenL |= hasFlagFirst(samRecords[i]) || (!hasFlagFirst(samRecords[i]) && !hasFlagLast(samRecords[i]));
        seenR |= hasFlagLast(samRecords[i]);
        if ((hasFlagFirst(samRecords[i]) || (!hasFlagFirst(samRecords[i]) && !hasFlagLast(samRecords[i]))) &&
            !empty(samRecords[i].seq))
        {
            readSeqL = samRecords[i].seq;
            if (hasFlagRC(samRecords[i]))
                reverseComplement(readSeqL);
        }
        else if (hasFlagLast(samRecords[i]) && !empty(samRecords[i].seq))
        {
            readSeqR = samRecords[i].seq;
            if (hasFlagRC(samRecords[i]))
                reverseComplement(readSeqR);
        }
        if (!empty(readSeqL) && !empty(readSeqR))
            break;  // Short-circuit and break.
    }
    if (seenL && empty(readSeqL))
    {
        std::cerr << "ERROR: No alignment for query " << front(samRecords).qName << " (left-end)\n";
        return 1;
    }
    if (seenR && empty(readSeqR))
    {
        std::cerr << "ERROR: No alignment for query " << front(samRecords).qName << " (right-end)\n";
        return 1;
    }

    Dna5String readSeq;
    Dna5String contigSeq;
    String<unsigned> queryResult;

    // Try to hit intervals.
    bool mappedAny = false;  // Whether any valid alignment was found.
    for (unsigned i = 0; i < length(samRecords); ++i)
    {
        BamAlignmentRecord const & samRecord = samRecords[i];
        int seqId = refIdMapping.map[samRecord.rID];

        // Compute actual alignment score to rule out invalidly reported alignments.
        //
        // If we run in oracle mode then we ignore the actual alignment score and use the best alignment.  We only care
        // about the alignment's end position in this case.
        if (hasFlagLast(samRecord))
            readSeq = readSeqR;
        else
            readSeq = readSeqL;

        // TODO(holtgrew): Remove const cast once we have const holders!
        BamTagsDict bamTags(const_cast<CharString &>(samRecord.tags));
        unsigned idx = 0;

        // If the CIGAR is not present, e.g. a * is present instead,
        // we can still get the extra position from an user-defined tag.
        unsigned endPos = samRecord.beginPos + getAlignmentLengthInRef(samRecord) - countPaddings(samRecord.cigar);
        if (empty(samRecord.cigar))
        {
            if (empty(options.extraPosTag) ||
                !(findTagKey(idx, bamTags, options.extraPosTag) && extractTagValue(endPos, bamTags, idx)))
            {
                // Simply try to guess the end position.
                endPos = samRecord.beginPos + length(readSeq) + 1;
                std::cerr << "WARNING: Unknown alignment end position for read " << samRecord.qName << ".\n";
            }
            // The extra position tag must be 1-based.
            SEQAN_ASSERT_GT(endPos, 0u);
            endPos--;
        }

        int bestDistance = std::numeric_limits<int>::min();  // Marker for "not set yet".
        // Note that bestDistance expresses the distance in percent error, relative to the read length, ceiled up
        // and converted to an int value.
        if (!options.oracleMode)
        {
            // Get best distance from NM tag if set and we are to trust it.
            if (options.trustNM && findTagKey(idx, bamTags, "NM") && extractTagValue(bestDistance, bamTags, idx))
            {
                // Convert from count to rate.
                bestDistance = static_cast<int>(ceil(100.0 * bestDistance / length(readSeq)));
            }
            // Otherwise, perform a realignment.
            else
            {
                unsigned bandwidth = static_cast<int>(ceil(0.01 * options.maxError * length(readSeq)));

                unsigned intervalBegin = samRecord.beginPos;
                if (intervalBegin > bandwidth)
                    intervalBegin -= bandwidth;

                unsigned intervalEnd = endPos;
                if (intervalEnd < length(refSeqs[seqId]) - 2 * bandwidth)
                    intervalEnd += 2 * bandwidth;

                contigSeq = infix(refSeqs[seqId], intervalBegin, intervalEnd);
                if (hasFlagRC(samRecord))
                    reverseComplement(contigSeq);
                Finder<Dna5String> finder(contigSeq);
                Pattern<Dna5String, TPatternSpec> pattern(readSeq, -static_cast<int>(length(readSeq)) * 1000);
                _patternMatchNOfPattern(pattern, options.matchN);
                _patternMatchNOfFinder(pattern, options.matchN);
                bool ret = setEndPosition(finder, pattern, length(contigSeq) - bandwidth);
                ignoreUnusedVariableWarning(ret);
//                write2(std::cerr, samRecord, bamIOContext, Sam());
                SEQAN_CHECK(ret, "setEndPosition() must not fail!");
                bestDistance = static_cast<int>(ceil(-100.0 * getScore(pattern) / length(readSeq)));
            }
        }

        // Get sequence id and last position of alignment.  We try to hit the interval with the last position (not
        // C-style end) of the read.
        unsigned lastPos = hasFlagRC(samRecord) ? length(refSeqs[seqId]) - samRecord.beginPos - 1 : endPos - 1;

        if (options.showTryHitIntervals)
            std::cerr << "TRY HIT\tchr=" << sequenceName(faiIndex, seqId) << "\tlastPos=" << lastPos << "\tqName="
                      << samRecord.qName << "\n";

        // Try to hit any interval.
        clear(queryResult);
        findIntervals(queryResult, intervalTrees[seqId], lastPos);
        mappedAny = mappedAny || !empty(queryResult);
#if DEBUG_RABEMA
        if (mappedAny)
            std::cout << "MAPPED\t" << front(samRecords).qName << "\n";
#endif  // #if DEBUG_RABEMA
        if (!empty(queryResult))
        {
            for (unsigned i = 0; i < length(queryResult); ++i)
                intervalHit[queryResult[i]] = true;
        }
        else if (bestDistance != std::numeric_limits<int>::min())
        {
            // && bestDistance <= options.maxError)

            // We now have a hit with a distance below the maximal configured error.  This is a candidate for an
            // additional hit.  In case of all-best and any-best, this is only one if its distance is better than the
            // best distance for this read.  In the case of all, it is one if its distance is less than the allowed
            // maximal distance.  If it is not an additional hit then it is an invalid one.
            //
            // Note that all distances including allowedDistance are percent of read length, ceiled up.
            int allowedDistance = options.maxError;
            if ((options.benchmarkCategory == CATEGORY_ALL_BEST || options.benchmarkCategory == CATEGORY_ANY_BEST) &&
                (smallestDistance != std::numeric_limits<int>::max()))
                allowedDistance = smallestDistance;
            if (bestDistance > allowedDistance)
            {
                if (options.showSuperflousIntervals)
                {
                    std::cerr << "SUPERFLOUS/INVALID\t";
                    DirectionIterator<std::ostream, Output>::Type cerrIt = directionIterator(std::cerr, Output());
                    write(cerrIt, samRecord, context(bamFileIn), Sam());
                    std::cerr << "  DISTANCE:        \t" << bestDistance << '\n'
                              << "  ALLOWED DISTANCE:\t" << options.maxError << '\n';
                }
                result.invalidAlignments += 1;
                continue;
            }

            // We found an additional hit.
            if (options.showAdditionalIntervals || !options.dontPanic)
            {
                std::cerr << "ADDITIONAL HIT\t";
                DirectionIterator<std::ostream, Output>::Type cerrIt = directionIterator(std::cerr, Output());
                write(cerrIt, samRecord, context(bamFileIn), Sam());
                std::cerr << '\n';

                for (unsigned i = 0; i < length(filteredGsiRecords); ++i)
                    std::cerr << "FILTERED GSI RECORD\t" << filteredGsiRecords[i] << "\n";
            }

            if (!options.dontPanic)
            {
                std::cerr << "ERROR: Found an additional hit for read " << samRecord.qName << "!\n";
                return 1;
            }
            std::cerr << "WARNING: Found an additional hit for read " << samRecord.qName << ".\n";
        }
    }

    // Compute number of found intervals.
    unsigned numFound = 0;
    String<unsigned> foundIntervalsForErrorRate;
    if ((int)length(foundIntervalsForErrorRate) <= largestDistance + 1)
        resize(foundIntervalsForErrorRate, largestDistance + 1, 0);
    if (options.oracleMode || options.benchmarkCategory == CATEGORY_ANY_BEST)
    {
        int bestDistance = std::numeric_limits<int>::max();
        int bestIdx = 0;
        for (unsigned i = 0; i < length(intervalDistances); ++i)
            if (intervalHit[i])
            {
                if (options.showHitIntervals)
                    std::cerr << "HIT\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
                if (bestDistance > intervalDistances[i])
                    bestIdx = i;
                bestDistance = std::min(bestDistance, intervalDistances[i]);
            }
        if (bestDistance != std::numeric_limits<int>::max())
        {
            if (options.showHitIntervals)
                std::cerr << "HIT_BEST\t" << filteredGsiRecords[bestIdx] << "\t" << filteredGsiRecords[bestIdx].originalDistance << "\n";
            numFound += 1;
            foundIntervalsForErrorRate[bestDistance] += 1;
        }
        if (!mappedAny && options.showMissedIntervals)
        {
            for (unsigned i = 0; i < length(filteredGsiRecords); ++i)
                std::cerr << "MISSED\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
        }
    }
    else  // !options.oracleMode && options.benchmarkCategory in ["all-best", "all"]
    {
        for (unsigned i = 0; i < length(intervalDistances); ++i)
        {
            // if (options.benchmarkCategory == CATEGORY_ALL && intervalDistances[i] != options.maxError)
            //     continue;  // Only count intervals on our maximal error rate in "all" mode.
            if (intervalHit[i])
            {
                if (options.showHitIntervals)
                    std::cerr << "HIT\t" << filteredGsiRecords[i] << "\t" << filteredGsiRecords[i].originalDistance << "\n";
                numFound += 1;
                foundIntervalsForErrorRate[intervalDistances[i]] += 1;
            }
            else
            {
                if (options.showMissedIntervals)  // inside braces for consistency with above
                    std::cerr << "MISSED\t" << filteredGsiRecords[i] << "\n";
            }
        }
        SEQAN_ASSERT_LEQ(numFound, length(intervalDistances));
    }

    // Update the resulting RabemaStats.
    updateMaximalErrorRate(result, largestDistance);
    result.totalReads += 1;
    result.readsInGsi += (numIntervals > 0u);
    result.mappedReads += mappedAny;
    if (options.oracleMode)
    {
        bool found = (numFound > 0u);
        result.intervalsToFind += 1;
        result.intervalsFound += found;
        result.normalizedIntervals += found;
        int d = (smallestDistance == std::numeric_limits<int>::max()) ? 0 : smallestDistance;
        result.intervalsToFindForErrorRate[d] += 1;
        result.intervalsFoundForErrorRate[d] += found;
        result.normalizedIntervalsToFindForErrorRate[d] += 1;
        result.normalizedIntervalsFoundForErrorRate[d] += found;
    }
    else if (options.benchmarkCategory == CATEGORY_ANY_BEST)
    {
        int d = (smallestDistance == std::numeric_limits<int>::max()) ? 0 : smallestDistance;
        bool toFind = (numIntervalsForErrorRate[d] > 0u);
        bool found = (foundIntervalsForErrorRate[d] > 0u);
        SEQAN_ASSERT_LEQ(found, toFind);
        result.intervalsToFind += toFind;
        result.intervalsFound += found;
        result.normalizedIntervals += found;
        result.intervalsToFindForErrorRate[d] += toFind;
        result.intervalsFoundForErrorRate[d] += found;
        result.normalizedIntervalsToFindForErrorRate[d] += toFind;
        result.normalizedIntervalsFoundForErrorRate[d] += found;
    }
    else  // all-best or all was selected
    {
        unsigned intervalsToFind = 0;
        unsigned intervalsFound = 0;
        for (unsigned d = 0; d < length(numIntervalsForErrorRate); ++d)
        {
            intervalsToFind += numIntervalsForErrorRate[d];
            intervalsFound += foundIntervalsForErrorRate[d];
            result.intervalsToFindForErrorRate[d] += numIntervalsForErrorRate[d];
            result.intervalsFoundForErrorRate[d] += foundIntervalsForErrorRate[d];
        }
        result.intervalsToFind += intervalsToFind;
        result.intervalsFound += intervalsFound;
        if (intervalsToFind > 0u)
            result.normalizedIntervals += 1.0 * intervalsFound / intervalsToFind;
        for (unsigned d = 0; d < length(numIntervalsForErrorRate); ++d)
        {
            if (intervalsToFind == 0u)
                continue;
            // In case of all-best we only count those with the best error rate for this read.
            if (options.benchmarkCategory == CATEGORY_ALL)
            {
                result.normalizedIntervalsToFindForErrorRate[d] += 1.0 * numIntervalsForErrorRate[d] / intervalsToFind;
                result.normalizedIntervalsFoundForErrorRate[d] += 1.0 * foundIntervalsForErrorRate[d] / intervalsToFind;
            }
            else if (options.benchmarkCategory == CATEGORY_ALL_BEST && (int)d == smallestDistance)
            {
                result.normalizedIntervalsToFindForErrorRate[d] += 1;
                result.normalizedIntervalsFoundForErrorRate[d] += 1.0 * foundIntervalsForErrorRate[d] / intervalsToFind;
            }
        }
    }

    return 0;
}

// ----------------------------------------------------------------------------
// Function clearPairedFlags()
// ----------------------------------------------------------------------------

// Clear the pairing-related bits MULTIPLE, FIRST, LAST, NEXT_RC and NEXT_UNMAPPED in the flag field of record.
// All other flag bits are left as they are.
//
// NOTE(review): BAM_FLAG_ALL_PROPER is not part of the cleared set -- confirm that this is intended.
void clearPairedFlags(seqan::BamAlignmentRecord & record)
{
    // Clearing a bit unconditionally yields the same result as toggling it only when it is set.
    unsigned const pairingBits = seqan::BAM_FLAG_MULTIPLE | seqan::BAM_FLAG_FIRST | seqan::BAM_FLAG_LAST |
                                 seqan::BAM_FLAG_NEXT_RC | seqan::BAM_FLAG_NEXT_UNMAPPED;
    record.flag = record.flag & ~pairingBits;
}

// ----------------------------------------------------------------------------
// Function compareAlignedReadsToReference()
// ----------------------------------------------------------------------------

// Stream over both the SAM/BAM and GSI file and compare the hits in the SAM/BAM file against the intervals in the GSI
// file.
//
// Both the SAM/BAM file and the GSI file have to be sorted by queryname for this to work.

// Parameters:
//   result     Statistics object; passed on to benchmarkReadResult() for each read.
//   bamFileIn  Opened SAM/BAM input, sorted by query name.
//   faiIndex   FASTA index of the reference; its sequence names are used to build the ref ID mapping.
//   refSeqs    Reference sequences; passed on to benchmarkReadResult().
//   gsiIter    Forward iterator over the GSI input, sorted by read name.
//   options    Evaluation options; ignorePairedFlags and checkSorting are evaluated here.
//   tagPattern Tag selecting the pattern specialization; passed on to benchmarkReadResult().
//
// Returns 0 on success.  Returns 1 if a record could not be parsed, if an input violates the expected order
// (only checked if options.checkSorting is set), or if benchmarkReadResult() returns a non-zero value.

template <typename TForwardIter, typename TPatternSpec>
int
compareAlignedReadsToReference(RabemaStats & result,
                               BamFileIn & bamFileIn,
                               FaiIndex const & faiIndex,
                               StringSet<Dna5String> const & refSeqs,
                               TForwardIter & gsiIter,
                               RabemaEvaluationOptions const & options,
                               TPatternSpec const & tagPattern)
{
    // Mapping between ref IDs from SAM/BAM file and reference sequence (from SAM/BAM file to reference sequences).
    RefIdMapping refIdMapping;
    rebuildMapping(refIdMapping, faiIndex.seqNameStore, faiIndex.seqNameStoreCache,
                   contigNames(context(bamFileIn)));

    // These flags store whether we processed the last SAM/BAM and GSI record.  An input that does not contain
    // any record is done right from the start.  Otherwise, the default-constructed record below would be
    // collected and evaluated as if it had been read from the file.
    bool samDone = true, gsiDone = true;

    // Read in initial SAM/GSI records.
    BamAlignmentRecord samRecord;
    if (!atEnd(bamFileIn))
    {
        try
        {
            readRecord(samRecord, bamFileIn);
        }
        catch (seqan::ParseError const &)
        {
            std::cerr << "ERROR: Could not read first SAM/BAM record.\n";
            return 1;
        }
        samDone = false;
        if (options.ignorePairedFlags)
            clearPairedFlags(samRecord);
        // Same check as after each read in the main loop below: rebuild the ref ID mapping if reading the first
        // record changed the number of known contig names.  Without this, a file with a single record would
        // never trigger the rebuild.
        if (length(contigNames(context(bamFileIn))) != length(refIdMapping))
            rebuildMapping(refIdMapping, faiIndex.seqNameStore, faiIndex.seqNameStoreCache,
                           contigNames(context(bamFileIn)));
    }
    GsiRecord gsiRecord;
    if (!atEnd(gsiIter))
    {
        try
        {
            readRecord(gsiRecord, gsiIter, Gsi());
        }
        catch (seqan::ParseError const &)
        {
            std::cerr << "ERROR: Could not read first GSI record.\n";
            return 1;
        }
        gsiDone = false;
    }

    // Current SAM/BAM and GSI records are stored in these arrays.
    String<BamAlignmentRecord> currentSamRecords;
    String<GsiRecord> currentGsiRecords;

    // The main loop: We walk over both the SAM/BAM and GSI records.
    std::cerr << "Each dot corresponds to 10k processed reads.\n"
              << "\n"
              << "Progress: ";
    unsigned i = 0;
    while (!samDone || !gsiDone)
    {
        // Progress output: a label every 100k iterations, a dot for every other multiple of 10k.
        if (i > 0u && i % (100 * 1000) == 0u)
            std::cerr << i / 100 / 1000 << "00k";
        else if (i > 0 && i % (10 * 1000) == 0u)
            std::cerr << '.';
        ++i;

        // We process the record for the next query/read.  Since records for this next query/read might be missing in
        // both files, we need to determine which is the next one.
        CharString currentReadName;
        if (gsiDone)
            currentReadName = samRecord.qName;
        else if (samDone)
            currentReadName = gsiRecord.readName;
        else
            currentReadName = lessThanSamtoolsQueryName(gsiRecord.readName, samRecord.qName) ?
                              gsiRecord.readName : samRecord.qName;

        // These flags determine whether evaluation is run for single-end and/or paired-end reads.
        bool seenSingleEnd = false, seenPairedEnd = false;

        // Read all SAM/BAM records with the same query name.  On leaving the loop, samRecord holds the first
        // record of the next read unless the end of the file was reached.
        clear(currentSamRecords);
        while (!samDone && samRecord.qName == currentReadName)
        {
            if (!hasFlagUnmapped(samRecord))  // Ignore records with non-aligned reads.
            {
                seenSingleEnd |= !hasFlagMultiple(samRecord);
                seenPairedEnd |= hasFlagMultiple(samRecord);
                appendValue(currentSamRecords, samRecord);
            }
            if (atEnd(bamFileIn))
            {
                // At end of SAM/BAM File, do not read next one.
                samDone = true;
                continue;
            }
            try
            {
                readRecord(samRecord, bamFileIn);
            }
            catch (seqan::ParseError const &)
            {
                std::cerr << "ERROR: Could not read SAM/BAM record.\n";
                return 1;
            }
            if (options.ignorePairedFlags)
                clearPairedFlags(samRecord);
            if (options.checkSorting && lessThanSamtoolsQueryName(samRecord.qName, currentReadName))
            {
                std::cerr << "ERROR: Wrong order in SAM/BAM file: " << samRecord.qName << " succeeds "
                          << currentReadName << " in file.\n"
                          << "File must be sorted by read name/queryname.\n";
                return 1;
            }
            // Rebuild ref ID mapping if we discovered a new reference sequence.
            if (length(contigNames(context(bamFileIn))) != length(refIdMapping))
                rebuildMapping(refIdMapping, faiIndex.seqNameStore, faiIndex.seqNameStoreCache,
                               contigNames(context(bamFileIn)));
        }

        // Read in the next block of GSI records, i.e. all GSI records with the current read name.
        clear(currentGsiRecords);
        while (!gsiDone && gsiRecord.readName == currentReadName)
        {
            bool const gsiIsPaired = (gsiRecord.flags & GsiRecord::FLAG_PAIRED) != 0;
            seenSingleEnd |= !gsiIsPaired;
            seenPairedEnd |= gsiIsPaired;
            appendValue(currentGsiRecords, gsiRecord);
            if (atEnd(gsiIter))
            {
                // At end of GSI File, do not read next one.
                gsiDone = true;
                continue;
            }
            try
            {
                readRecord(gsiRecord, gsiIter, Gsi());
            }
            catch (seqan::ParseError const &)
            {
                std::cerr << "ERROR: Could not read GSI record.\n";
                return 1;
            }
            if (options.checkSorting && lessThanSamtoolsQueryName(gsiRecord.readName, currentReadName))
            {
                std::cerr << "ERROR: Wrong order in GSI file: " << gsiRecord.readName << " succeeds "
                          << currentReadName << " in file.\n"
                          << "File must be sorted by read name/queryname.\n";
                return 1;
            }
        }

        // Now, compare the SAM/BAM records against the intervals stored in the GSI records.
        //
        // We collected the records for all queries.  Here, we differentiate between the different cases.
        if (seenSingleEnd)
        {
            int res = benchmarkReadResult(result, currentSamRecords, bamFileIn, currentGsiRecords,
                                          faiIndex, refSeqs, refIdMapping, options, tagPattern,
                                          /*pairedEnd=*/ false);
            if (res != 0)
                return 1;
        }
        if (seenPairedEnd)
        {
            // Paired-end evaluation is run twice, once with second=false and once with second=true.
            int res = benchmarkReadResult(result, currentSamRecords, bamFileIn, currentGsiRecords,
                                          faiIndex, refSeqs, refIdMapping, options, tagPattern,
                                          /*pairedEnd=*/ true, /*second=*/ false);
            if (res != 0)
                return 1;

            res = benchmarkReadResult(result, currentSamRecords, bamFileIn, currentGsiRecords,
                                      faiIndex, refSeqs, refIdMapping, options, tagPattern,
                                      /*pairedEnd=*/ true, /*second=*/ true);
            if (res != 0)
                return 1;
        }
    }
    std::cerr << " DONE\n";

    return 0;
}

// ---------------------------------------------------------------------------
// Function parseCommandLine()
// ---------------------------------------------------------------------------

// Set up the ArgumentParser for rabema_evaluate, parse argc/argv and copy the parsed values into options.
//
// Returns the result of parse().  If this is not PARSE_OK then the function returns before touching options.
// The option "max-error" is restricted to non-negative values so that a negative error rate is rejected
// by the parser.
seqan::ArgumentParser::ParseResult
parseCommandLine(RabemaEvaluationOptions & options, int argc, char const ** argv)
{
    // -----------------------------------------------------------------------
    // Parse Command Line Using ArgumentParser
    // -----------------------------------------------------------------------

    seqan::ArgumentParser parser("rabema_evaluate");
    setShortDescription(parser, "RABEMA Evaluation");
    setVersion(parser, SEQAN_APP_VERSION " [" SEQAN_REVISION "]");
    setDate(parser, SEQAN_DATE);
    setCategory(parser, "Benchmarking");

    addUsageLine(parser,
                 "[\\fIOPTIONS\\fP] \\fB--reference\\fP \\fIREF.fa\\fP \\fB--in-gsi\\fP \\fIIN.gsi\\fP "
                 "\\fB--in-bam\\fP \\fIMAPPING.{sam,bam}\\fP");
    addDescription(parser,
                   "Compare the SAM/bam output \\fIMAPPING.sam\\fP/\\fIMAPPING.bam\\fP of any read mapper against "
                   "the RABEMA gold standard previously built with \\fBrabema_build_gold_standard\\fP.  The input "
                   "is a reference FASTA file, a gold standard interval (GSI) file and the SAM/BAM input to "
                   "evaluate.");
    addDescription(parser,
                   "The input SAM/BAM file must be \\fIsorted by queryname\\fP.  The program will create a "
                   "FASTA index file \\fIREF.fa.fai\\fP for fast random access to the reference.");

    addOption(parser, seqan::ArgParseOption("v", "verbose", "Enable verbose output."));
    addOption(parser, seqan::ArgParseOption("vv", "very-verbose", "Enable even more verbose output."));

    addSection(parser, "Input / Output");
    // addOption(parser, seqan::ArgParseOption("o", "out-gsi", "Path to write the resulting GSI file to.",
    //                                         seqan::ArgParseArgument::STRING, false, "GSI"));
    // setRequired(parser, "out-gsi", true);
    addOption(parser, seqan::ArgParseOption("r", "reference", "Path to load reference FASTA from.",
                                            seqan::ArgParseArgument::INPUT_FILE, "FASTA"));
    setValidValues(parser, "reference", seqan::SeqFileIn::getFileExtensions());
    setRequired(parser, "reference", true);
    addOption(parser, seqan::ArgParseOption("g", "in-gsi",
                                            "Path to load gold standard intervals from. If compressed using gzip, "
                                            "the file will be decompressed on the fly.",
                                            seqan::ArgParseArgument::INPUT_FILE, "GSI"));
    setRequired(parser, "in-gsi", true);
    setValidValues(parser, "in-gsi", "gsi gsi.gz");  // GSI (Gold Standard Intervals) Format only.

    addOption(parser, seqan::ArgParseOption("b", "in-bam", "Path to load the read mapper SAM or BAM output from.",
                                            seqan::ArgParseArgument::INPUT_FILE, "BAM"));
    setValidValues(parser, "in-bam", BamFileIn::getFileExtensions());
    setRequired(parser, "in-bam");
    addOption(parser, seqan::ArgParseOption("", "out-tsv", "Path to write the statistics to as TSV.",
                                            seqan::ArgParseArgument::OUTPUT_FILE, "TSV"));
    setValidValues(parser, "out-tsv", "rabema_report_tsv");

    addOption(parser, seqan::ArgParseOption("", "dont-check-sorting",
                                            "Do not check sortedness (by name) of input SAM/BAM files.  This is "
                                            "required if the reads are not sorted by name in the original FASTQ "
                                            "files.  Files from the SRA and ENA generally are sorted."));

    addSection(parser, "Benchmark Parameters");
    addOption(parser, seqan::ArgParseOption("", "oracle-mode",
                                            "Enable oracle mode.  This is used for simulated data when the input "
                                            "GSI file gives exactly one position that is considered as the true "
                                            "sample position.  For simulated data."));
    addOption(parser, seqan::ArgParseOption("", "only-unique-reads",
                                            "Consider only reads that have a single alignment in the mapping "
                                            "result file. Useful for precision computation."));
    addOption(parser, seqan::ArgParseOption("", "match-N", "When set, N matches all characters without penalty."));
    addOption(parser, seqan::ArgParseOption("", "distance-metric",
                                            "Set distance metric.  Valid values: hamming, edit.  Default: edit.",
                                            seqan::ArgParseOption::STRING, "METRIC"));
    setValidValues(parser, "distance-metric", "hamming edit");
    setDefaultValue(parser, "distance-metric", "edit");

    addOption(parser, seqan::ArgParseOption("e", "max-error",
                                            "Maximal error rate to build gold standard for in percent.  This "
                                            "parameter is an integer and relative to the read length.  "
                                            "The error rate is ignored in oracle mode, here the distance "
                                            "of the read at the sample position is taken, individually "
                                            "for each read.  Default: 0",
                                            seqan::ArgParseArgument::INTEGER, "RATE"));
    // A negative error rate makes no sense, let the parser reject it right away.
    setMinValue(parser, "max-error", "0");
    setDefaultValue(parser, "max-error", 0);

    addOption(parser, seqan::ArgParseOption("c", "benchmark-category",
                                            "Set benchmark category.  One of {all, all-best, any-best}.  "
                                            "Default: all",
                                            seqan::ArgParseOption::STRING, "CAT"));
    setValidValues(parser, "benchmark-category", "all all-best any-best");
    setDefaultValue(parser, "benchmark-category", "all");

    addOption(parser, seqan::ArgParseOption("", "trust-NM",
                                            "When set, we trust the alignment and distance from SAM/BAM file and no "
                                            "realignment is performed.  Off by default."));
    addOption(parser, seqan::ArgParseOption("", "extra-pos-tag",
                                            "If the CIGAR string is absent, the missing alignment end position can be "
                                            "provided by this BAM tag.",
                                            seqan::ArgParseOption::STRING));

    addOption(parser, seqan::ArgParseOption("", "ignore-paired-flags",
                                            "When set, we ignore all SAM/BAM flags related to pairing.  This is "
                                            "necessary when analyzing SAM from SOAP's soap2sam.pl script."));
    addOption(parser, seqan::ArgParseOption("", "DONT-PANIC",
                                            "Do not stop program execution if an additional hit was found that "
                                            "indicates that the gold standard is incorrect."));

    addSection(parser, "Logging");
    addText(parser, "");
    addText(parser,
            "The occurrence of \"invalid\" hits in the read mapper's output is not an error.  If there are "
            "additional hits, however, this shows an error in the gold standard.");
    addOption(parser, seqan::ArgParseOption("", "show-missed-intervals",
                                            "Show details for each missed interval from the GSI."));
    addOption(parser, seqan::ArgParseOption("", "show-invalid-hits",
                                            "Show details for invalid hits (with too high error rate)."));
    addOption(parser, seqan::ArgParseOption("", "show-additional-hits",
                                            "Show details for additional hits (low enough error rate but not in "
                                            "gold standard)."));
    addOption(parser, seqan::ArgParseOption("", "show-hits", "Show details for hit intervals."));
    addOption(parser, seqan::ArgParseOption("", "show-try-hit", "Show details for each alignment in SAM/BAM input."));

    addTextSection(parser, "Return Values");
    addText(parser, "A return value of 0 indicates success, any other value indicates an error.");

    // addTextSection(parser, "Examples");

    // addListItem(parser,
    //             "\\fBrabema_build_gold_standard\\fP \\fB-e\\fP \\fI4\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-i\\fP "
    //                 "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
    //             "Build gold standard from a SAM file \\fIIN.sam\\fP with all mapping locations and a FASTA "
    //             "reference \\fIREF.fa\\fP to GSI file \\fIOUT.gsi\\fP with a maximal error rate of \\fI4\\fP.");
    // addListItem(parser,
    //             "\\fBrabema_build_gold_standard\\fP \\fB--distance-metric\\fP \\fIedit\\fP \\fB-e\\fP \\fI4\\fP "
    //                 "\\fB-o\\fP \\fIOUT.gsi\\fP \\fB-i\\fP \\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
    //             "Same as above, but using Hamming instead of edit distance.");
    // addListItem(parser,
    //             "\\fBrabema_build_gold_standard\\fP \\fB--oracle-mode\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-i\\fP "
    //                 "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
    //             "Build gold standard from a SAM file \\fIIN.sam\\fP with the original sample position, e.g.  "
    //             "as exported by read simulator Mason.");

    addTextSection(parser, "Memory Requirements");
    addText(parser,
            "From version 1.1, great care has been taken to keep the memory requirements as low as possible.");
    addText(parser,
            "The evaluation step needs to store the whole reference sequence in memory but little more "
            "memory.  So, for the human genome, the memory requirements are below 4 GB, regardless of "
            "the size of the GSI or SAM/BAM file.");

    addTextSection(parser, "References");
    addText(parser,
            "M. Holtgrewe, A.-K. Emde, D. Weese and K. Reinert.  A Novel And Well-Defined Benchmarking Method "
            "For Second Generation Read Mapping, BMC Bioinformatics 2011, 12:210.");
    addListItem(parser, "\\fIhttp://www.seqan.de/rabema\\fP", "RABEMA Homepage");
    addListItem(parser, "\\fIhttp://www.seqan.de/mason\\fP", "Mason Homepage");

    // Actually do the parsing and exit on error, help display etc.
    seqan::ArgumentParser::ParseResult res = parse(parser, argc, argv);
    if (res != seqan::ArgumentParser::PARSE_OK)
        return res;

    // -----------------------------------------------------------------------
    // Fill RabemaEvaluationOptions Object
    // -----------------------------------------------------------------------

    // The more verbose flag is evaluated last so that it wins if both are given.
    if (isSet(parser, "verbose"))
        options.verbosity = 2;
    if (isSet(parser, "very-verbose"))
        options.verbosity = 3;

    if (isSet(parser, "reference"))
        getOptionValue(options.referencePath, parser, "reference");
    if (isSet(parser, "in-bam"))
        getOptionValue(options.inBamPath, parser, "in-bam");
    if (isSet(parser, "in-gsi"))
        getOptionValue(options.inGsiPath, parser, "in-gsi");
    if (isSet(parser, "out-tsv"))
        getOptionValue(options.outTsvPath, parser, "out-tsv");

    getOptionValue(options.maxError, parser, "max-error");
    options.matchN = isSet(parser, "match-N");
    options.oracleMode = isSet(parser, "oracle-mode");
    options.onlyUniqueReads = isSet(parser, "only-unique-reads");
    // The valid values are restricted above, thus the final else branch can only be "any-best".
    CharString benchmarkCategory;
    getOptionValue(benchmarkCategory, parser, "benchmark-category");
    if (benchmarkCategory == "all")
        options.benchmarkCategory = CATEGORY_ALL;
    else if (benchmarkCategory == "all-best")
        options.benchmarkCategory = CATEGORY_ALL_BEST;
    else  // if (benchmarkCategory == "any-best")
        options.benchmarkCategory = CATEGORY_ANY_BEST;
    // Likewise, anything but "edit" can only be "hamming" here.
    CharString distanceMetric;
    getOptionValue(distanceMetric, parser, "distance-metric");
    if (distanceMetric == "edit")
        options.distanceMetric = EDIT_DISTANCE;
    else
        options.distanceMetric = HAMMING_DISTANCE;
    options.trustNM = isSet(parser, "trust-NM");
    getOptionValue(options.extraPosTag, parser, "extra-pos-tag");
    options.ignorePairedFlags = isSet(parser, "ignore-paired-flags");
    options.dontPanic = isSet(parser, "DONT-PANIC");

    // The command line flag is the negation of the stored option.
    getOptionValue(options.checkSorting, parser, "dont-check-sorting");
    options.checkSorting = !options.checkSorting;

    options.showMissedIntervals = isSet(parser, "show-missed-intervals");
    options.showSuperflousIntervals = isSet(parser, "show-invalid-hits");
    options.showAdditionalIntervals = isSet(parser, "show-additional-hits");
    options.showHitIntervals = isSet(parser, "show-hits");
    options.showTryHitIntervals = isSet(parser, "show-try-hit");

    return res;
}

// ----------------------------------------------------------------------------
// Function main()
// ----------------------------------------------------------------------------

// Program entry point.
//
// Parses the command line, prints the effective options, loads the reference (via its FAI index, building and
// saving the index if it cannot be opened), opens the GSI and SAM/BAM files, compares the alignments against the
// gold standard intervals and finally writes the statistics to stdout and, if requested, to a TSV file.
//
// Returns 0 on success and 1 on any fatal error (command line parse error, unreadable input, failed comparison).
// If the argument parser reports anything other than PARSE_OK or PARSE_ERROR, the program exits early with 0.
int main(int argc, char const ** argv)
{
    // Parse command line and store results in options.
    RabemaEvaluationOptions options;
    seqan::ArgumentParser::ParseResult parseRes = parseCommandLine(options, argc, argv);
    if (parseRes != seqan::ArgumentParser::PARSE_OK)
        return parseRes == seqan::ArgumentParser::PARSE_ERROR;

    double startTime = 0;  // For measuring time below.

    std::cerr << "==============================================================================\n"
              << "                RABEMA - Read Alignment BEnchMArk\n"
              << "==============================================================================\n"
              << "                        Result Comparison\n"
              << "==============================================================================\n"
              << "\n";
    std::cerr << "____OPTIONS___________________________________________________________________\n\n";

    std::cerr << "Max error rate [%]    " << options.maxError << "\n"
              << "Oracle mode           " << yesNo(options.oracleMode) << '\n'
              << "Only unique reads     " << yesNo(options.onlyUniqueReads) << '\n'
              << "Benchmark category    " << categoryName(options.benchmarkCategory) << "\n"
              << "Distance measure      " << metricName(options.distanceMetric) << "\n"
              << "Match Ns              " << yesNo(options.matchN) << '\n'
              << "Trust NM tag          " << yesNo(options.trustNM) << '\n'
              << "Ignore paired flags   " << yesNo(options.ignorePairedFlags) << '\n'
              << "GSI File              " << options.inGsiPath << '\n'
              << "BAM File              " << options.inBamPath << '\n'
              << "Reference File        " << options.referencePath << '\n'
              << "TSV Output File       " << options.outTsvPath << '\n'
              << "Check Sorting         " << yesNo(options.checkSorting) << '\n'
              << "Show\n"
              << "    additional        " << yesNo(options.showAdditionalIntervals) << '\n'
              << "    hit               " << yesNo(options.showHitIntervals) << '\n'
              << "    missed            " << yesNo(options.showMissedIntervals) << '\n'
              << "    superfluous       " << yesNo(options.showSuperflousIntervals) << '\n'
              << "    try hit           " << yesNo(options.showTryHitIntervals) << '\n'
              << "\n";

    std::cerr << "____LOADING FILES_____________________________________________________________\n\n";

    // =================================================================
    // Prepare File I/O.
    // =================================================================

    startTime = sysTime();
    // Open reference FAI index.
    std::cerr << "Reference Index           " << options.referencePath << ".fai ...";
    FaiIndex faiIndex;
    if (!open(faiIndex, toCString(options.referencePath)))
    {
        // Opening the index failed; fall back to building it from the reference and saving it next to it.
        std::cerr << " FAILED (not fatal, we can just build it)\n";
        std::cerr << "Building Index        " << options.referencePath << ".fai ...";
        if (!build(faiIndex, toCString(options.referencePath)))
        {
            std::cerr << "Could not build FAI index.\n";
            return 1;
        }
        std::cerr << " OK\n";
        seqan::CharString faiPath = options.referencePath;
        append(faiPath, ".fai");
        std::cerr << "Reference Index       " << faiPath << " ...";
        try
        {
            save(faiIndex, toCString(faiPath));
            std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
        }
        catch (IOError const & ioErr)
        {
            std::cerr << "Could not write FAI index we just built.\n";
            return 1;
        }
    }
    else
    {
        // Only report success here when the existing index was opened; the build path above already printed its
        // own status line.
        std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
    }

    // Load all reference sequences listed in the index into memory.
    std::cerr << "Reference Sequences       " << options.referencePath << " ...";
    StringSet<Dna5String> refSeqs;
    resize(refSeqs, length(faiIndex.seqNameStore));
    for (unsigned i = 0; i < length(faiIndex.seqNameStore); ++i)
    {
        reserve(refSeqs[i], sequenceLength(faiIndex, i), Exact());
        try
        {
            readSequence(refSeqs[i], faiIndex, i);
        }
        catch (seqan::ParseError const & ioErr)
        {
            std::cerr << "ERROR: Could not read sequence " << faiIndex.seqNameStore[i] << ".\n";
            return 1;  // Fatal: signal failure to the caller like all other error paths.
        }
    }
    std::cerr << " OK\n";

    // Open gold standard intervals (GSI) file and read in header.
    std::cerr << "Gold Standard Intervals   " << options.inGsiPath << " (header) ...";
    VirtualStream<char, Input> inGsi;
    if (!open(inGsi, toCString(options.inGsiPath)))
    {
        std::cerr << "Could not open GSI file.\n";
        return 1;
    }
    // Any detected stream format other than Nothing is reported as compressed.
    if (!isEqual(format(inGsi), Nothing()))
        std::cerr << " (is compressed)";
    DirectionIterator<VirtualStream<char, Input>, Input>::Type inGsiIter = directionIterator(inGsi, Input());
    GsiHeader gsiHeader;
    try
    {
        readHeader(gsiHeader, inGsiIter, Gsi());
        std::cerr << " OK\n";
    }
    catch (seqan::ParseError const & ioErr)
    {
        std::cerr << "Could not read GSI header(" << ioErr.what() << ").\n";
        return 1;
    }

    // Open SAM/BAM file and read in header.
    BamFileIn bamFileIn;
    if (!open(bamFileIn, toCString(options.inBamPath)))
    {
        std::cerr << "Could not open SAM file." << std::endl;
        return 1;
    }
    BamHeader bamHeader;
    try
    {
        std::cerr << "Alignments                " << options.inBamPath << " (header) ...";
        readHeader(bamHeader, bamFileIn);
        std::cerr << " OK\n";
    }
    catch (seqan::ParseError const & ioErr)
    {
        std::cerr << "Could not read SAM header (" << ioErr.what() << ").\n";
        return 1;
    }

    // The SAM/BAM file has to be sorted by query name.
    //
    // We do not look at the SAM header for read ordering since samtools sort does not update the SAM header.  This
    // means users would have to update it manually after sorting which is painful.  We will check SAM record order on
    // the fly.

    std::cerr << "\nTook " << sysTime() - startTime << "s\n";

    // =================================================================
    // Stream the alignments against gold standard intervals and check.
    // =================================================================

    std::cerr << "\n____COMPARING ALIGNMENTS WITH INTERVALS_______________________________________\n\n";

    startTime = sysTime();
    // The statistics are accumulated in result; res is the status code of the comparison (0 means success).
    int res = 0;
    RabemaStats result(options.maxError);
    // The last argument selects the pattern matching algorithm that corresponds to the distance metric.
    if (options.distanceMetric == EDIT_DISTANCE)
        res = compareAlignedReadsToReference(result, bamFileIn, faiIndex, refSeqs,
                                             inGsiIter, options, MyersUkkonenReads());
    else  // options.distanceMetric == HAMMING_DISTANCE
        res = compareAlignedReadsToReference(result, bamFileIn, faiIndex, refSeqs,
                                             inGsiIter, options, HammingSimple());
    if (res != 0)
        return 1;

    std::cerr << "\nTook " << sysTime() - startTime << " s\n";

    std::cerr << "\n____RESULTING STATISTICS______________________________________________________\n\n";

    std::cerr << "Note that in all-best and any-best mode, we differentiate the intervals we\n"
              << "found and those we have to find by their distance.  This is not possible in\n"
              << "all mode since a multiple lower-error intervals might be contained in an\n"
              << "higher-error interval.\n"
              << '\n'
              << "Alignments will be marked as \"invalid\" if they have a higher error rate than\n"
              << "allowed, i.e. they might become valid when increasing the allowed error rate.\n"
              << '\n'
              << "In \"all\" mode, only intervals from the maximal level are required.  In \"all-best\"\n"
              << "and \"any-best\" mode, the intervals are relabeled with the smallest distance that\n"
              << "a containing interval has.  Contained intervals are then removed.\n\n\n";

    // int maxError = -1;
    // if (!options.oracleMode && options.benchmarkCategory == CATEGORY_ALL)
    //     maxError = options.maxError;
    // The human-readable statistics go to stdout; all progress/diagnostic output above went to stderr.
    write(std::cout, result, /*options.maxError,*/ Raw());

    // Optionally write the statistics as TSV as well.
    if (!empty(options.outTsvPath))
    {
        std::cerr << '\n'
                  << "Writing output TSV        " << options.outTsvPath << " ...";
        std::ofstream tsvOut(toCString(options.outTsvPath), std::ios::out | std::ios::binary);
        bool failed = false;
        if (!tsvOut.good())
        {
            failed = true;
            std::cerr << " FAILED - could not open output file!\n";
        }
        else if (write(tsvOut, result, options.maxError, categoryName(options.benchmarkCategory),
                       options.oracleMode, metricName(options.distanceMetric), Tsv()) != 0)
        {
            failed = true;
            std::cerr << " FAILED - error writing to output file!\n";
        }
        if (!failed)
            std::cerr << " OK\n";
    }

    return 0;
}