1 // ==========================================================================
2 //                      RABEMA Read Alignment Benchmark
3 // ==========================================================================
4 // Copyright (C) 2010-1012 Manuel Holtgrewe, FU Berlin
5 //
6 // This program is free software: you can redistribute it and/or modify it
7 // under the terms of the GNU General Public License as published by the Free
8 // Software Foundation, either version 3 of the License, or (at your option)
9 // any later version.
10 //
11 // This program is distributed in the hope that it will be useful, but WITHOUT
12 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14 // more details.
15 //
16 // You should have received a copy of the GNU General Public License along
17 // with this program.  If not, see <http://www.gnu.org/licenses/>.
18 //
19 // ==========================================================================
20 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
21 // ==========================================================================
22 // RABEMA is a read mapping benchmark tool.  See README for an overview.
23 //
24 // This is the program for building the RABEMA gold standard.  The input to
25 // this program is the reference sequence as a FASTA file and a "perfect
26 // mapping" SAM/BAM file (see README on how to obtain such a gold standard).
27 // ==========================================================================
28 
29 // TODO(holtgrew): Another possible way to save memory is not to store the first, always identical part of the read id.
30 
31 #include <fstream>
32 #include <iostream>
33 #include <map>
34 
35 #include <seqan/arg_parse.h>
36 #include <seqan/bam_io.h>
37 #include <seqan/basic.h>
38 #include <seqan/sequence.h>
39 #include <seqan/store.h>
40 #include <seqan/seq_io.h>
41 
42 #include "curve_smoothing.h"
43 #include "find_hamming_simple_ext.h"
44 #include "find_myers_ukkonen_reads.h"
45 #include "io_gsi.h"
46 #include "sorting.h"
47 
48 // ============================================================================
49 // Enums, Tags, Classes, Typedefs.
50 // ============================================================================
51 
52 typedef std::map<int, TWeightedMatches> TErrorCurves;
53 
54 // ----------------------------------------------------------------------------
55 // Helper Class IntervalizeCmp
56 // ----------------------------------------------------------------------------
57 
58 // Comparison functor used in intervalizeErrorCurves().
59 
60 struct IntervalizeCmp
61 {
62     StringSet<CharString> const & readNameStore;
63 
IntervalizeCmpIntervalizeCmp64     IntervalizeCmp(StringSet<CharString> const & readNameStore) :
65         readNameStore(readNameStore)
66     {}
67 
operator ()IntervalizeCmp68     bool operator()(unsigned lhs, unsigned rhs) const
69     {
70         return lessThanSamtoolsQueryName(readNameStore[lhs], readNameStore[rhs]);
71     }
72 
73 };
74 
75 // ----------------------------------------------------------------------------
76 // Class ContigInterval
77 // ----------------------------------------------------------------------------
78 
79 // Represents an interval in a contig.
80 struct ContigInterval
81 {
82     // Contig the interval is in.
83     size_t contigId;
84 
85     // true iff the interval is on the forward strand.
86     bool isForward;
87 
88     // First value of interval
89     size_t first;
90 
91     // Last value of interval.
92     size_t last;
93 
94     // Default constructor so it can be used in containers.
ContigIntervalContigInterval95     ContigInterval() : contigId(0), isForward(false), first(0), last(0)
96     {}
97 
98     // Constructor for the record.
ContigIntervalContigInterval99     ContigInterval(size_t _contigId, bool _isForward, size_t _first, size_t _last) :
100         contigId(_contigId), isForward(_isForward), first(_first), last(_last)
101     {}
102 };
103 
104 // ---------------------------------------------------------------------------
105 // Enum DistanceMetric
106 // ---------------------------------------------------------------------------
107 
108 enum DistanceMetric
109 {
110     HAMMING_DISTANCE,
111     EDIT_DISTANCE
112 };
113 
114 // ---------------------------------------------------------------------------
115 // Class BuildGoldStandardOptions
116 // ---------------------------------------------------------------------------
117 
118 struct BuildGoldStandardOptions
119 {
120     // Verbosity level, quiet: 0, normal: 1, verbose: 2, very verbose: 3.
121     int verbosity;
122 
123     // Whether N should match all characters without penalty.
124     bool matchN;
125 
126     // Whether or not to run in oracle mode.  The input SAM/BAM file must only contain the sample location for each read
127     // in this case.  This is used when building a GSI file for reads simulated by Mason where the original locations
128     // are written out in a SAM/BAM file.
129     bool oracleMode;
130 
131     // Maximal error rate in percent, relative to the read length.
132     int maxError;
133 
134     // Indicates whether a maximal error was set.
135     bool maxErrorSet;
136 
137     // Distance metric to use.
138     DistanceMetric distanceMetric;
139 
140     // Path to the output file that we will write the GSI to.
141     seqan::CharString outGsiPath;
142 
143     // Path to the reference contigs file.
144     seqan::CharString referencePath;
145 
146     // Exactly one of the following two has to be given.
147     //
148     // Path to the perfect input SAM/BAM file.
149     seqan::CharString inBamPath;
150 
BuildGoldStandardOptionsBuildGoldStandardOptions151     BuildGoldStandardOptions() :
152         verbosity(1),
153         matchN(false),
154         oracleMode(false),
155         maxError(0),
156         maxErrorSet(false),
157         distanceMetric(EDIT_DISTANCE)
158     {}
159 };
160 
161 // ============================================================================
162 // Metafunctions
163 // ============================================================================
164 
165 // ============================================================================
166 // Functions
167 // ============================================================================
168 
169 // ---------------------------------------------------------------------------
170 // Function metricName()
171 // ---------------------------------------------------------------------------
172 
metricName(DistanceMetric met)173 char const * metricName(DistanceMetric met)
174 {
175     if (met == EDIT_DISTANCE)
176         return "edit";
177     else  // if (met == HAMMING_DISTANCE)
178         return "hamming";
179 }
180 
181 // ---------------------------------------------------------------------------
182 // Function yesNo()
183 // ---------------------------------------------------------------------------
184 
yesNo(bool b)185 char const * yesNo(bool b)
186 {
187     if (b)
188         return "yes";
189     else
190         return "no";
191 }
192 
193 // ----------------------------------------------------------------------------
194 // Function trimSeqHeaderToId()
195 // ----------------------------------------------------------------------------
196 
197 // TODO(holtgrew): Such a method is already in the SeqAn library, but in extras. Remove here when it is in core.
trimSeqHeaderToId(seqan::CharString & header)198 void trimSeqHeaderToId(seqan::CharString & header)
199 {
200     unsigned i = 0;
201     for (; i < length(header); ++i)
202         if (isspace(header[i]))
203             break;
204     resize(header, i);
205 }
206 
207 // ----------------------------------------------------------------------------
208 // Function intervalizeAndDumpErrorCurves()
209 // ----------------------------------------------------------------------------
210 
211 // Build intervals from the error curves and print them to stream.
212 
213 template <typename TStream>
intervalizeAndDumpErrorCurves(TStream & stream,TErrorCurves const & errorCurves,String<int> const & readAlignmentDistances,StringSet<CharString> const & readNameStore,StringSet<CharString> const & contigNameStore,BuildGoldStandardOptions const & options,bool toStdout)214 int intervalizeAndDumpErrorCurves(TStream & stream,
215                                   TErrorCurves const & errorCurves,
216                                   String<int> const & readAlignmentDistances,
217                                   StringSet<CharString> const & readNameStore,
218                                   StringSet<CharString> const & contigNameStore,
219                                   BuildGoldStandardOptions const & options,
220                                   bool toStdout)
221 {
222     if (!toStdout)
223         std::cerr << "\n____POINT TO INTERVAL CONVERSION______________________________________________\n\n"
224                   << "Progress: ";
225 
226     // Write out the header.
227     GsiHeader header;
228     writeHeader(stream, header, Gsi());
229     writeRecord(stream, GSI_COLUMN_NAMES, Gsi());
230 
231     // Get a list of read ids, sorted by their read name.
232     String<unsigned> sortedReadIds;
233     reserve(sortedReadIds, length(readNameStore));
234     for (unsigned i = 0; i < length(readNameStore); ++i)
235         appendValue(sortedReadIds, i);
236     IntervalizeCmp cmp(readNameStore);
237     std::sort(begin(sortedReadIds, Standard()), end(sortedReadIds, Standard()), cmp);
238 
239     unsigned tenPercent = errorCurves.size() / 10 + 1;
240     typedef TErrorCurves::const_iterator TErrorCurvesIter;
241     for (unsigned i = 0; i < length(sortedReadIds); ++i)
242     {
243         TErrorCurvesIter it = errorCurves.find(sortedReadIds[i]);
244         if (it == errorCurves.end())
245         {
246             if (!toStdout)
247                 std::cerr << "WARNING: Something went wrong with read ids! This should not happen.\n";
248             continue;
249         }
250 
251         if (!toStdout)
252         {
253             if (tenPercent > 0u && i % tenPercent == 0u)
254                 std::cerr << i / tenPercent * 10 << '%';
255             else if (tenPercent > 5u && i % (tenPercent / 5) == 0u)
256                 std::cerr << '.';
257         }
258 
259         size_t readId = it->first;
260         TWeightedMatches const & matches = it->second;
261 
262         // Sort the matches.  Matches with high scores (negative score and low absolute value) come first.
263         TWeightedMatches sortedMatches(matches);
264 
265         // intervals[e] holds the intervals for error e of the current read.
266         String<String<ContigInterval> > intervals;
267         int maxError = options.oracleMode ? 0 : (int)options.maxError;
268         if (options.oracleMode)
269             for (unsigned j = 0; j < length(matches); ++j)
270                 maxError = std::max(maxError, matches[j].distance);
271         resize(intervals, maxError + 1);
272 
273         // Join the intervals stored in sortedMatches.
274         //
275         // The position of the previous match, so we can consider only the ones with the smallest error.
276         //
277         // The following two vars should be != first pos and contigId.
278         size_t previousPos = std::numeric_limits<size_t>::max();
279         size_t previousContigId = std::numeric_limits<size_t>::max();
280         typedef Iterator<TWeightedMatches>::Type TWeightedMatchesIter;
281         for (TWeightedMatchesIter it = begin(sortedMatches);
282              it != end(sortedMatches); ++it)
283         {
284             // Skip it if (it - 1) pointed to same pos (and must point to
285             // one with smaller absolute distance.
286             if (it->pos == previousPos && it->contigId == previousContigId)
287                 continue;
288             // Consider all currently open intervals with a greater error than the error in *it and extend them or
289             // create a new one.
290             int error = options.oracleMode ? 0 : abs(it->distance);
291             SEQAN_ASSERT_LEQ(error, maxError);
292             for (int e = error; e <= maxError; ++e)
293             {
294                 // Handle base case of no open interval:  Create new one.
295                 if (length(intervals[e]) == 0)
296                 {
297                     appendValue(intervals[e], ContigInterval(it->contigId, it->isForward, it->pos, it->pos));
298                     continue;
299                 }
300                 ContigInterval & interval = back(intervals[e]);
301                 // Either extend the interval or create a new one.
302                 if (interval.contigId == it->contigId && interval.isForward == it->isForward)
303                     SEQAN_ASSERT_LEQ(interval.last, it->pos);
304                 if (interval.contigId == it->contigId && interval.isForward == it->isForward &&
305                     interval.last + 1 == it->pos)
306                     back(intervals[e]).last += 1;
307                 else
308                     appendValue(intervals[e], ContigInterval(it->contigId, it->isForward, it->pos, it->pos));
309             }
310             // Book-keeping.
311             previousPos = it->pos;
312             previousContigId = it->contigId;
313         }
314 
315         // Print the resulting intervals.
316         typedef Iterator<String<String<ContigInterval> > >::Type TIntervalContainerIter;
317         int distance = 0;
318         for (TIntervalContainerIter it = begin(intervals);
319              it != end(intervals); ++it, ++distance)
320         {
321             typedef Iterator<String<ContigInterval> >::Type TIntervalIter;
322             for (TIntervalIter it2 = begin(*it); it2 != end(*it); ++it2)
323             {
324                 int flags = 0;
325                 // We appended custom prefixes to the read ids.  "/S" means single-end, "/0" means left mate, "/1" means
326                 // right mate.
327                 int mateNo = -1;
328                 if (back(readNameStore[readId]) == '0')
329                     mateNo = 0;
330                 if (back(readNameStore[readId]) == '1')
331                     mateNo = 1;
332                 SEQAN_ASSERT_EQ(readNameStore[readId][length(readNameStore[readId]) - 2], '/');
333                 if (mateNo == 0)
334                     flags = GsiRecord::FLAG_PAIRED | GsiRecord::FLAG_FIRST_MATE;
335                 else if (mateNo == 1)
336                     flags = GsiRecord::FLAG_PAIRED | GsiRecord::FLAG_SECOND_MATE;
337 
338                 int gsiDistance = options.oracleMode ? readAlignmentDistances[readId] : distance;
339                 if (options.oracleMode && options.maxErrorSet && gsiDistance > options.maxError)
340                     continue;  // Skip if cut off in Rabema oracle mode.
341                 GsiRecord gsiRecord(CharString(prefix(readNameStore[readId], length(readNameStore[readId]) - 2)), flags,
342                                     gsiDistance, contigNameStore[it2->contigId],
343                                     it2->isForward, it2->first, it2->last);
344                 writeRecord(stream, gsiRecord, Gsi());
345             }
346         }
347     }
348     if (!toStdout)
349         std::cerr << "100% DONE\n";
350 
351     return 0;
352 }
353 
354 // ----------------------------------------------------------------------------
355 // Function ceilAwayFromZero()
356 // ----------------------------------------------------------------------------
357 
358 // Ceil away from Zero.
359 //
360 // ceilAwayFromZero(-1.5) == -2
361 // ceilAwayFromZero(1.5) == 2
362 template <typename T>
ceilAwayFromZero(T x)363 inline T ceilAwayFromZero(T x)
364 {
365     if (x < 0)
366         return floor(x);
367 
368     return ceil(x);
369 }
370 
371 // ----------------------------------------------------------------------------
372 // Function buildErrorCurvePoint()
373 // ----------------------------------------------------------------------------
374 
375 // TODO(holtgrew): This function has still a lot of cleanup potential.
376 
377 // Build the error curve points around the end position of the given contig.
378 //
379 // This is pretty involved and this function easily is the most complex one.
380 //
381 // Returns rightmost border of the added points.
382 template <typename TContigSeq, typename TReadSeq, typename TPatternSpec, typename TReadNames>
buildErrorCurvePoints(String<WeightedMatch> & errorCurve,int & maxError,TContigSeq & contig,size_t contigId,bool isForward,TReadSeq & read,size_t readId,size_t endPos,TReadNames const & readNames,bool matchN,TPatternSpec const &)383 void buildErrorCurvePoints(String<WeightedMatch> & errorCurve,
384                            int & maxError,
385                            TContigSeq /*const*/ & contig,
386                            size_t contigId,
387                            bool isForward,
388                            TReadSeq /*const*/ & read,
389                            size_t readId,
390                            size_t endPos,
391                            TReadNames const & readNames,
392                            bool matchN,
393                            TPatternSpec const &)
394 {
395     typedef typename Position<TContigSeq>::Type TPosition;
396 
397     // In oracle Sam mode, the maximum error is the error at the position given in the Sam alignment.
398     bool oracleMode = false;
399     if (maxError == std::numeric_limits<int>::max())  // We are in oracle mode.
400     {
401         oracleMode = true;
402         Finder<TContigSeq> finder(contig);
403         Pattern<TReadSeq, TPatternSpec> pattern(read, -(int)length(read) * 40);
404         bool ret = setEndPosition(finder, pattern, endPos);
405         (void) ret; // If compiled without assertions.
406         SEQAN_ASSERT(ret);
407         maxError = -getScore(pattern);
408     }
409 
410     // Debug-adjustments.
411     #define ENABLE 0
412     #define ALL 1
413     #define READID 1
414 
415     //if (readNames[readId] == CharString("rabema_synthetic_ecoli_illumina_l100_100k_e3.000000376")) {
416     //      std::cerr << "**************** read id = " << readId << " readname = " << readNames[readId] << " read = " << read << " maxError == " << maxError << std::endl;
417     //}
418 
419     // The read maps with less than the given number of errors to [left, right]
420     // to the contig sequence, is forward strand iff is Forwar is true.
421     TPosition /*left = endPos,*/ right = endPos;
422 
423     // Skip this alignment position if not right of previously
424     // right border of the interval.
425     // TODO(holtgrew): Remove this piece, it's unused right now. We could/should also fix this to work for streaming over SAM files.
426     //if (readId == previousReadId && contigId == previousContigId && left <= previousRightBorder) {
427     //    //std::cerr << __FILE__  << ":" << __LINE__ << " Skipping because " << left << " <= " << previousRightBorder << std::endl;
428     //    return previousRightBorder;
429     //}
430 
431     bool ret;  // Flag used for assertions below.
432     (void) ret;  // If run without assertions.
433     int relativeMinScore = (int)ceilAwayFromZero(100.0 * -maxError / length(read));
434 
435     // Setup the finder and pattern.
436     Finder<TContigSeq> finder(contig);
437     Pattern<TReadSeq, TPatternSpec> pattern(read, -(int)length(read) * 40);
438     // If configured so, match N against all other values, otherwise match
439     // against none.
440     _patternMatchNOfPattern(pattern, matchN);
441     _patternMatchNOfFinder(pattern, matchN);
442     TPosition hitBeginPosition;
443 
444     if (ENABLE && (ALL || readId == READID))
445     {
446         std::cerr << "**************** read id = " << readId << " readname = " << readNames[readId]
447                   << " read = " << read << std::endl;
448         std::cerr << __FILE__ << ":" << __LINE__ << " maxError = " << maxError << std::endl;
449         std::cerr << __FILE__ << ":" << __LINE__ << " relative min score = " << relativeMinScore << std::endl;
450         std::cerr << __FILE__ << ":" << __LINE__ << " extending to the right." << std::endl;
451         std::cerr << __FILE__ << ":" << __LINE__ << " contig id == " << contigId << std::endl;
452         std::cerr << __FILE__ << ":" << __LINE__ << " is forward strand? " << isForward << std::endl;
453         std::cerr << __FILE__ << ":" << __LINE__ << " contig length = " << length(contig) << std::endl;
454         std::cerr << __FILE__ << ":" << __LINE__ << " endPos = " << endPos << std::endl;
455         std::cerr << __FILE__ << ":" << __LINE__ << " infix(contig, endPos - length(read), endPos) == "
456                   << infix(contig, endPos - length(read), endPos) << std::endl;
457     }
458 
459     // We will first gather all results in tempMatches.  Below, we
460     // will smooth the curve in these points and throw out too bad
461     // entries.  Then, we will append tempMatches to errorCurve.
462     String<WeightedMatch> tempMatches;
463 
464     // First, extend the interval to the right.
465     {
466         // Skip to original hit.
467         ret = setEndPosition(finder, pattern, endPos);
468         SEQAN_ASSERT(ret);
469         SEQAN_ASSERT_EQ(endPos, endPosition(finder));
470         SEQAN_ASSERT_GEQ(getScore(pattern), -maxError);
471         while (findBegin(finder, pattern, getScore(pattern)))
472             continue;  // Find leftmost begin position.
473         SEQAN_ASSERT_GT(endPos, beginPosition(finder));
474         SEQAN_ASSERT_EQ(getScore(pattern), getBeginScore(pattern));
475 
476         // Add original hit to the error curve points.
477         int relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
478         appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1, relativeScore,
479                                                beginPosition(finder)));
480         hitBeginPosition = beginPosition(finder);
481         if (ENABLE && (ALL || readId == READID))
482         {
483             std::cerr << __FILE__ << ":" << __LINE__ << " -- getScore(pattern) == " << getScore(pattern) << std::endl;
484             std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
485                       << " for read id " << readId << " (FIRST HIT)" << std::endl;
486             std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder) << " read " << read
487                       << " endPos = " << endPos << std::endl;
488         }
489 
490         // Now extend to the first hit with too low score.
491         bool foundWithTooLowScore = false;
492         while (find(finder, pattern))
493         {
494             while (findBegin(finder, pattern, getScore(pattern)))
495                 continue;  // Find leftmost begin position.
496             if (getScore(pattern) < -maxError && beginPosition(finder) != back(tempMatches).beginPos)
497             {
498                 foundWithTooLowScore = true;
499                 if (ENABLE && (ALL || readId == READID))
500                 {
501                     std::cerr << __FILE__ << ":" << __LINE__ << " -- Found too low score." << std::endl;
502                     std::cerr << __FILE__ << ":" << __LINE__ << " -- low scoring match was "
503                               << WeightedMatch(contigId, isForward, endPosition(finder) - 1, relativeScore,
504                                      beginPosition(finder)) << std::endl;
505                 }
506                 break;
507             }
508             int relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
509             appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1, relativeScore,
510                                                    beginPosition(finder)));
511             if (ENABLE && (ALL || readId == READID))
512             {
513                 std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
514                           << " for read id " << readId << std::endl;
515                 std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder)
516                           << " read " << read << std::endl;
517             }
518             right += 1;
519         }
520         // If we broke because of the score limit then collect the last not
521         // yet added hit and the ones right of it until the beginPosition
522         // changes.
523         if (foundWithTooLowScore)
524         {
525             if (beginPosition(finder) == hitBeginPosition)
526             {
527                 relativeScore = static_cast<int>(ceilAwayFromZero(100.0 * getScore(pattern) / length(read)));
528                 appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1,
529                                                        relativeScore, beginPosition(finder)));
530                 TPosition currentBeginPosition = beginPosition(finder);
531                 // Add the rest until we hit one with a different begin position.
532                 //
533                 // Loop at most length(read) times (this limit is here
534                 // for the quality based DP algorithm which can suffer
535                 // from "infinite inserts").
536                 for (unsigned i = 0; find(finder, pattern) && i < length(read); ++i)
537                 {
538                     while (findBegin(finder, pattern, getScore(pattern)))
539                         continue;  // Find leftmost begin position.
540                     SEQAN_ASSERT(ret);
541                     if (beginPosition(finder) != currentBeginPosition)
542                         break;
543                     relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
544                     appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1,
545                                                            relativeScore, beginPosition(finder)));
546                     if (ENABLE && (ALL || readId == READID))
547                     {
548                         std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
549                                   << " for read id " << readId << std::endl;
550                         std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder)
551                                   << " read " << read << std::endl;
552                     }
553                     right += 1;
554                 }
555             }
556         }
557 
558         if (ENABLE && (ALL || readId == READID))
559         {
560             std::cerr << __FILE__ << ":" << __LINE__ << " extending to the left." << std::endl;
561         }
562         // Then, extend the interval to the left.
563         {
564             // Length by which we extend the interval.  Note that this must be
565             // at least the length of the read + max error count so no special
566             // treatment of islands on the left side must be done.
567             TPosition kIntervalLen = length(read) + maxError;
568             // Tentatively extend the interval to the left.  We will
569             // extend until we hit a position with too low score.
570             TPosition tentativeLeft = endPos;
571             // Flag that indicates whether we found an entry with too low score.
572             bool foundTooLowScore = false;
573             // Flag that indicates whether we performed a loop iteration after having found a too low score.
574             bool loopedAfterFoundTooLowScore = false;
575             // Flag for breaking out of loop if we hit the right border of the last interval.
576             bool hitLastRight = false;
577             while (tentativeLeft > 0 && !loopedAfterFoundTooLowScore && !hitLastRight)
578             {
579                 // Stop if went went to the rightmost position of the
580                 // last interval with the previous loop iteration.
581                 //if (readId == previousReadId && contigId == previousContigId && tentativeLeft == previousRightBorder)
582                 //    break;
583                 loopedAfterFoundTooLowScore = foundTooLowScore;
584                 TPosition oldTentativeLeft = tentativeLeft;
585                 // Move the tentative left position left by kIntervalLen but not
586                 // further left than 0.
587                 if (ENABLE && (ALL || readId == READID))
588                     std::cerr << "tentativeLeft before = " << tentativeLeft << std::endl;
589                 tentativeLeft -= _min(kIntervalLen, tentativeLeft);
590                 if (ENABLE && (ALL || readId == READID))
591                     std::cerr << "tentativeLeft after = " << tentativeLeft << std::endl;
592                 // Do not go further than the previous right position.
593                 //if (readId == previousReadId && contigId == previousContigId && tentativeLeft <= previousRightBorder) {
594                 //    tentativeLeft = previousRightBorder;
595                 //    hitLastRight = true;
596                 //}
597                 if (ENABLE && (ALL || readId == READID))
598                 {
599                     std::cerr << __FILE__ << ":" << __LINE__ << " -- tentative left = " << tentativeLeft << std::endl;
600                 }
601                 // Search from tentative left position to the previous tentative left position.
602                 ret = setEndPosition(finder, pattern, tentativeLeft);
603                 SEQAN_ASSERT(ret);
604                 if (ENABLE && (ALL || readId == READID))
605                 {
606                     std::cerr << __FILE__ << ":" << __LINE__ << " -- endPosition(finder) = "
607                               << endPosition(finder) << std::endl;
608                 }
609                 if (endPosition(finder) > oldTentativeLeft)
610                     break;  // Could not set position of the finder left of old tentative left.
611                 while (findBegin(finder, pattern, getScore(pattern)))
612                     continue;  // Find leftmost begin position.
613                 int relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
614                 appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1, relativeScore,
615                                                        beginPosition(finder)));
616                 if (ENABLE && (ALL || readId == READID))
617                 {
618                     std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
619                               << " for read id " << readId << std::endl;
620                     std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder) << " read "
621                               << read << std::endl;
622                 }
623                 foundTooLowScore = foundTooLowScore || (relativeScore < relativeMinScore);
624                 while (find(finder, pattern) && endPosition(finder) != oldTentativeLeft)
625                 {
626                     while (findBegin(finder, pattern, getScore(pattern)))
627                         continue;  // Find leftmost begin position.
628                     SEQAN_ASSERT(ret);
629                     relativeScore = (int)ceilAwayFromZero(100.0 * getScore(pattern) / length(read));
630                     appendValue(tempMatches, WeightedMatch(contigId, isForward, endPosition(finder) - 1,
631                                                            relativeScore, beginPosition(finder)));
632                     if (ENABLE && (ALL || readId == READID))
633                     {
634                         std::cerr << __FILE__ << ":" << __LINE__ << " -- raw score is " << getScore(pattern)
635                                   << std::endl;
636                         std::cerr << __FILE__ << ":" << __LINE__ << " -- appended " << back(tempMatches)
637                                   << " for read id " << readId << std::endl;
638                         std::cerr << __FILE__ << ":" << __LINE__ << " -- infix " << infix(finder) << " read "
639                                   << read << std::endl;
640                     }
641                     foundTooLowScore = foundTooLowScore || (relativeScore < relativeMinScore);
642                 }
643             }
644             if (ENABLE && (ALL || readId == READID))
645             {
646                 std::cerr << __FILE__ << ":" << __LINE__ << " -- after loop" << std::endl;
647             }
648             /*
649             // Now we can be sure that the temporary matches contain an entry
650             // left of the first one with a too low score that has a different
651             // begin position than hitBeginPosition.
652             std::sort(begin(tempMatches, Standard()), end(tempMatches, Standard()));
653             ModifiedString<String<WeightedMatch>, ModReverse> rTempMatches(tempMatches);
654             TPosition prefixLength = 0;
655             TPosition i;
656             // Search up to the first too low score.
657             for (i = 0; i < length(rTempMatches); ++i) {
658                 if (ENABLE && (ALL || readId == READID)) {
659                     std::cerr << "Considering " << rTempMatches[i] << std::endl;
660                 }
661                 if (rTempMatches[i].distance < relativeMinScore) {
662                     if (rTempMatches[i].beginPos == hitBeginPosition) {
663                         prefixLength += 1;
664                         left -= 1;
665                         i += 1;
666                     }
667                     break;
668                 }
669                 prefixLength += 1;
670                 left -= 1;
671             }
672             if (ENABLE && (ALL || readId == READID)) {
673                 std::cerr << "prefixLength == " << prefixLength << std::endl;
674                 std::cerr << "i == " << i << std::endl;
675             }
676             // Then, append until we find a position with a different begin position.
677             for (; i < length(rTempMatches); ++i) {
678                 if (rTempMatches[i].beginPos == hitBeginPosition) {
679                     prefixLength += 1;
680                     left -= 1;
681                 } else {
682                     break;
683                 }
684             }
685             // Finally, append the prefix of the given length.
686             if (ENABLE && (ALL || readId == READID)) {
687                 std::cerr << "appending prefix(rTempMatches, " << prefixLength << ")" << std::endl;
688             }
689         */
690         }
691     }
692     // Postprocessing: Sorting, smoothing, filtering.
693     std::sort(begin(tempMatches, Standard()), end(tempMatches, Standard()));
694     smoothErrorCurve(tempMatches);
695     appendValue(tempMatches, WeightedMatch(0, 0, 0, relativeMinScore - 1, 0));  // Sentinel.
696     if (oracleMode)
697     {
698         // In oracle Sam mode, we only want the lake with last pos endPos-1.
699         String<WeightedMatch> buffer;
700         bool flag = false;
701         for (size_t i = 0; i < length(tempMatches); ++i)
702         {
703             if (tempMatches[i].distance < relativeMinScore)
704             {
705                 if (flag)
706                 {
707                     append(errorCurve, buffer);
708                     break;
709                 }
710                 else
711                 {
712                     clear(buffer);
713                 }
714             }
715             else
716             {
717                 appendValue(buffer, tempMatches[i]);
718                 if (tempMatches[i].pos == endPos - 1)
719                     flag = true;
720             }
721         }
722     }
723     else
724     {
725         for (size_t i = 0; i < length(tempMatches); ++i)
726         {
727             if (tempMatches[i].distance >= relativeMinScore)
728                 appendValue(errorCurve, tempMatches[i]);
729         }
730     }
731 
732 //     if (readId == READID) {
733 //         std::cerr << ",-- errorCurve is (read id = " << readId << " readname = " << readNames[readId] << ")" << std::endl;
734 //         for (unsigned i = 0; i < length(errorCurve); ++i) {
735 //             std::cerr << "| " << errorCurve[i] << std::endl;
736 //         }
737 //         std::cerr << "`--" << std::endl;
738 //     }
739 
740 //     std::cerr << __FILE__ << ":" << __LINE__ << " return " << right << std::endl;
741 }
742 
743 // ----------------------------------------------------------------------------
744 // Function matchesToErrorFunction()
745 // ----------------------------------------------------------------------------
746 
747 // Compute error curve from the reads and reference sequences.
748 
749 template <typename TPatternSpec>
matchesToErrorFunction(TErrorCurves & errorCurves,String<int> & readAlignmentDistances,BamFileIn & inBam,StringSet<CharString> & readNameStore,StringSet<CharString> const & contigNameStore,FaiIndex const & faiIndex,BuildGoldStandardOptions const & options,TPatternSpec const &)750 int matchesToErrorFunction(TErrorCurves & errorCurves,
751                            String<int> & readAlignmentDistances,  // only used in case of oracle mode
752                            BamFileIn & inBam,
753                            StringSet<CharString> & readNameStore,
754                            StringSet<CharString> const & contigNameStore,
755                            FaiIndex const & faiIndex,
756                            BuildGoldStandardOptions const & options,
757                            TPatternSpec const & /*patternTag*/)
758 {
759     double startTime = 0;
760 
761     if (atEnd(inBam))
762         return 0;  // Do nothing if there are no more alignments in the file.
763 
764     // We store the read names and append "/0" and "/1", depending on their flag value ("/0" for first, "/1" for last
765     // flag).  Sadly, there is no easy way out here.  Single-end reads are stored as "/S".
766     NameStoreCache<StringSet<CharString> > readNameStoreCache(readNameStore);
767     String<unsigned> readLengthStore;
768 
769     std::cerr << "\n____EXTENDING INTERVALS_______________________________________________________\n\n"
770               << "Each dot represents work for 100k nucleotides in the genome.\n";
771 
772     // Stream over SAM/BAM file.  The main assumption here is that the reads are sorted by coordinate, which is check
773     // when loading the header.
774     //
775     // For each read alignments:
776     //   If aligned on forward strand:
777     //     Build error curve for this read alignment on forward strand.
778     //   Else:
779     //     Build error curve for this read alignment on backward strand.
780 
781     startTime = sysTime();      // Time at beginning for total time display at end.
782     int prevRefId = -1;      // Previous contig id.
783     int prevPos = -1;
784     int posOnContig = 0;
785     BamAlignmentRecord record;  // Current read record.
786     Dna5String contig;
787     Dna5String rcContig;
788     Dna5String readSeq;
789     CharString readName;
790     while (!atEnd(inBam))
791     {
792         // -------------------------------------------------------------------
793         // Read SAM/BAM Record and retrieve name and sequence.
794         // -------------------------------------------------------------------
795 
796         // Read SAM/BAM record.
797         try
798         {
799             readRecord(record, inBam);
800         }
801         catch (seqan::ParseError const & ioErr)
802         {
803             std::cerr << "ERROR reading SAM/BAM record(" << ioErr.what() << ")!\n";
804             return 1;
805         }
806         // Break if we have an unaligned SAM/BAM record.
807         if (record.rID == -1)
808             break;  // Done!
809         // Check that the file is actually sorted by coordinate.
810         if (prevRefId != -1 && (prevRefId > record.rID || (prevRefId == record.rID && prevPos > record.beginPos)))
811         {
812             std::cerr << "ERROR: File was not sorted by coordinate!\n";
813             return 1;
814         }
815         // Get read name and sequence from record.
816         readSeq = record.seq;  // Convert read sequence to Dna5.
817         // Compute reverse complement since we align against reverse strand, SAM has aligned sequence against forward
818         // strand.
819         if (hasFlagRC(record))
820             reverseComplement(readSeq);
821         trimSeqHeaderToId(record.qName);  // Remove everything after the first whitespace.
822         if (!hasFlagMultiple(record))
823             append(record.qName, "/S");
824         if (hasFlagMultiple(record) && hasFlagFirst(record))
825             append(record.qName, "/0");
826         if (hasFlagMultiple(record) && hasFlagLast(record))
827             append(record.qName, "/1");
828         // Translate read to read id.
829         unsigned readId = 0;
830         if (!getIdByName(readId, readNameStoreCache, record.qName))
831         {
832             readId = length(readNameStore);
833             appendName(readNameStoreCache, record.qName);
834             appendValue(readLengthStore, length(record.seq));
835             if (options.oracleMode)
836                 appendValue(readAlignmentDistances, -1);
837         }
838 
839         // -------------------------------------------------------------------
840         // Handle Progress
841         // -------------------------------------------------------------------
842 
843         // Handle progress: Change of contig.
844         SEQAN_ASSERT_LEQ(prevRefId, record.rID);
845         if (prevRefId != record.rID)
846         {
847             for (int i = prevRefId + 1; i <= record.rID; ++i)
848             {
849                 if (i != prevRefId)
850                     std::cerr << "\n";
851                 std::cerr << contigNameStore[record.rID] << " (" << record.rID + 1 << "/"
852                           << length(contigNameStore) << ") ";
853             }
854             posOnContig = 0;
855 
856             // Load reference sequence on the fly using FAI index to save memory.
857             unsigned faiRefId = 0;
858             if (!getIdByName(faiRefId, faiIndex, contigNameStore[record.rID]))
859             {
860                 std::cerr << "Reference sequence " << contigNameStore[record.rID] << " not known in FAI file.\n";
861                 return 1;
862             }
863             try
864             {
865                 readSequence(contig, faiIndex, faiRefId);
866             }
867             catch (seqan::ParseError const & ioErr)
868             {
869                 std::cerr << "Could not load sequence " << contigNameStore[record.rID]
870                           << " from FASTA file with FAI (" << ioErr.what() << ").\n";
871                 return 1;
872             }
873             rcContig = contig;
874             reverseComplement(rcContig);
875         }
876         // Handle progress: Position on contig.
877         for (; posOnContig < record.beginPos; posOnContig += 100 * 1000)
878         {
879             if (posOnContig % (1000 * 1000) == 0 && posOnContig > 0)
880                 std::cerr << posOnContig / 1000 / 1000 << "M";
881             else
882                 std::cerr << ".";
883         }
884 
885         // -------------------------------------------------------------------
886         // Extend error curve points for read.
887         // -------------------------------------------------------------------
888 
889         // In oracle mode, set max error to -1, buildErrorCurvePoints() will use the error at the alignment position
890         // from the SAM/BAM file.  In normal mode, convert from error rate from options to error count.
891         int maxError = std::numeric_limits<int>::max();
892         if (!options.oracleMode)
893             maxError = static_cast<int>(floor(0.01 * options.maxError * length(record.seq)));
894 
895         // Compute end position of alignment.
896         int endPos = record.beginPos + getAlignmentLengthInRef(record) - countPaddings(record.cigar);
897 
898         if (!hasFlagRC(record))
899             buildErrorCurvePoints(errorCurves[readId], maxError, contig, record.rID, !hasFlagRC(record),
900                                   readSeq, readId, endPos, readNameStore, options.matchN, TPatternSpec());
901         else
902             buildErrorCurvePoints(errorCurves[readId], maxError, rcContig, record.rID, !hasFlagRC(record),
903                                   readSeq, readId, length(rcContig) - record.beginPos, readNameStore, options.matchN,
904                                   TPatternSpec());
905 #if ENABLE
906         std::cerr << "AFTER BUILD, readId == " << readId << ", read name == " << readNameStore[readId] << ", maxError == " << maxError << "\n";
907 #endif  // #if ENABLE
908         if (options.oracleMode)
909             readAlignmentDistances[readId] = maxError;
910 
911         // Update variables storing the previous read/contig id and position.
912         prevRefId = record.rID;
913         prevPos = record.beginPos;
914     }
915     std::cerr << "\n\nTook " << sysTime() - startTime << " s\n";
916 
917     // For all reads:
918     //   Sort all error curve points.
919     //   Fill gaps.
920     //   Smooth them.
921     //   Filter out low scoring ones.
922 
923     std::cerr << "\n____SMOOTHING ERROR CURVES____________________________________________________\n\n";
924     startTime = sysTime();
925     unsigned tenPercent = length(readLengthStore) / 10 + 1;
926     std::cerr << "Progress: ";
927     for (unsigned readId = 0; readId < length(readLengthStore); ++readId)
928     {
929         if (tenPercent > 0u && readId % tenPercent == 0u)
930             std::cerr << readId / tenPercent * 10 << '%';
931         else if (tenPercent > 5u && readId % (tenPercent / 5) == 0u)
932             std::cerr << '.';
933 
934         std::sort(begin(errorCurves[readId], Standard()), end(errorCurves[readId], Standard()));
935         fillGaps(errorCurves[readId]);
936         smoothErrorCurve(errorCurves[readId]);
937 
938         // Compute relative min score for the read.
939         String<WeightedMatch> filtered;
940         int maxError = (int)floor(options.maxError / 100.0 * readLengthStore[readId]);
941         if (options.oracleMode)
942         {
943             SEQAN_ASSERT_NEQ(readAlignmentDistances[readId], -1);
944             maxError = readAlignmentDistances[readId];
945             if (options.maxErrorSet && maxError > options.maxError)
946                 maxError = options.maxError;
947         }
948         int relativeMinScore = (int)ceilAwayFromZero(100.0 * -maxError / readLengthStore[readId]);
949 #if ENABLE
950         std::cerr << "AFTER SMOOTHING, readId == " << readId << ", read name == " << readNameStore[readId] << ", maxError == " << maxError << "\n";
951 #endif  // #if ENABLE
952 
953         // Filter out low scoring ones.
954         typedef typename Iterator<String<WeightedMatch> >::Type TIterator;
955         for (TIterator it = begin(errorCurves[readId]); it != end(errorCurves[readId]); ++it)
956         {
957             if (value(it).distance >= relativeMinScore)
958                 appendValue(filtered, value(it));
959         }
960         move(errorCurves[readId], filtered);
961     }
962     std::cerr << "100% DONE\n"
963               << "\nTook: " << sysTime() - startTime << " s\n";
964 
965     return 0;
966 }
967 
968 // ---------------------------------------------------------------------------
969 // Function parseCommandLine()
970 // ---------------------------------------------------------------------------
971 
972 seqan::ArgumentParser::ParseResult
parseCommandLine(BuildGoldStandardOptions & options,int argc,char const ** argv)973 parseCommandLine(BuildGoldStandardOptions & options, int argc, char const ** argv)
974 {
975     // -----------------------------------------------------------------------
976     // Parse Command Line Using ArgumentParser
977     // -----------------------------------------------------------------------
978 
979     seqan::ArgumentParser parser("rabema_build_gold_standard");
980     setShortDescription(parser, "RABEMA Gold Standard Builder");
981     setVersion(parser, SEQAN_APP_VERSION " [" SEQAN_REVISION "]");
982     setDate(parser, SEQAN_DATE);
983     setCategory(parser, "Benchmarking");
984 
985     addUsageLine(parser,
986                  "[\\fIOPTIONS\\fP] \\fB--out-gsi\\fP \\fIOUT.gsi\\fP \\fB--reference\\fP \\fIREF.fa\\fP "
987                  "\\fB--in-bam\\fP \\fIPERFECT.{sam,bam}\\fP");
988     addDescription(parser,
989                    "This program allows one to build a RABEMA gold standard.  The input is a reference FASTA file "
990                    "and a perfect SAM/BAM map (e.g. created using RazerS 3 in full-sensitivity mode).");
991     addDescription(parser,
992                    "The input SAM/BAM file must be \\fIsorted by coordinate\\fP.  The program will create a "
993                    "FASTA index file \\fIREF.fa.fai\\fP for fast random access to the reference.");
994 
995     addOption(parser, seqan::ArgParseOption("v", "verbose", "Enable verbose output."));
996     addOption(parser, seqan::ArgParseOption("vv", "very-verbose", "Enable even more verbose output."));
997 
998     addSection(parser, "Input / Output");
999     addOption(parser, seqan::ArgParseOption("o", "out-gsi", "Path to write the resulting GSI file to.",
1000                                             seqan::ArgParseArgument::OUTPUT_FILE, "GSI"));
1001     setValidValues(parser, "out-gsi", "gsi gsi.gz");
1002     setRequired(parser, "out-gsi", true);
1003     addOption(parser, seqan::ArgParseOption("r", "reference", "Path to load reference FASTA from.",
1004                                             seqan::ArgParseArgument::INPUT_FILE, "FASTA"));
1005     setRequired(parser, "reference", true);
1006     setValidValues(parser, "reference", seqan::SeqFileIn::getFileExtensions());
1007     addOption(parser, seqan::ArgParseOption("b", "in-bam", "Path to load the \"perfect\" SAM/BAM file from.",
1008                                             seqan::ArgParseArgument::INPUT_FILE, "BAM"));
1009     setValidValues(parser, "in-bam", BamFileIn::getFileExtensions());
1010 
1011     addSection(parser, "Gold Standard Parameters");
1012     addOption(parser, seqan::ArgParseOption("", "oracle-mode",
1013                                             "Enable oracle mode.  This is used for simulated data when the input "
1014                                             "SAM/BAM file gives exactly one position that is considered as the true "
1015                                             "sample position."));
1016     addOption(parser, seqan::ArgParseOption("", "match-N", "When set, N matches all characters without penalty."));
1017     addOption(parser, seqan::ArgParseOption("", "distance-metric",
1018                                             "Set distance metric.  Valid values: hamming, edit.  Default: edit.",
1019                                             seqan::ArgParseOption::STRING, "METRIC"));
1020     setValidValues(parser, "distance-metric", "hamming edit");
1021     setDefaultValue(parser, "distance-metric", "edit");
1022 
1023     addOption(parser, seqan::ArgParseOption("e", "max-error",
1024                                             "Maximal error rate to build gold standard for in percent.  This "
1025                                             "parameter is an integer and relative to the read length.  "
1026                                             "In case of oracle mode, the error rate for the read at the "
1027                                             "sampling position is used and \\fIRATE\\fP is used as a cutoff "
1028                                             "threshold.",
1029                                             seqan::ArgParseArgument::INTEGER, "RATE"));
1030     setDefaultValue(parser, "max-error", 0);
1031 
1032     addTextSection(parser, "Return Values");
1033     addText(parser, "A return value of 0 indicates success, any other value indicates an error.");
1034 
1035     addTextSection(parser, "Examples");
1036 
1037     addListItem(parser,
1038                 "\\fBrabema_build_gold_standard\\fP \\fB-e\\fP \\fI4\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-s\\fP "
1039                 "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1040                 "Build gold standard from a SAM file \\fIIN.sam\\fP with all mapping locations and a FASTA "
1041                 "reference \\fIREF.fa\\fP to GSI file \\fIOUT.gsi\\fP with a maximal error rate of \\fI4\\fP.");
1042     addListItem(parser,
1043                 "\\fBrabema_build_gold_standard\\fP \\fB--distance-metric\\fP \\fIedit\\fP \\fB-e\\fP \\fI4\\fP "
1044                 "\\fB-o\\fP \\fIOUT.gsi\\fP \\fB-b\\fP \\fIIN.bam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1045                 "Same as above, but using Hamming instead of edit distance and BAM as the input.");
1046     addListItem(parser,
1047                 "\\fBrabema_build_gold_standard\\fP \\fB--oracle-mode\\fP \\fB-o\\fP \\fIOUT.gsi\\fP \\fB-s\\fP "
1048                 "\\fIIN.sam\\fP \\fB-r\\fP \\fIREF.fa\\fP",
1049                 "Build gold standard from a SAM file \\fIIN.sam\\fP with the original sample position, e.g.  "
1050                 "as exported by read simulator Mason.");
1051 
1052     addTextSection(parser, "Memory Requirements");
1053     addText(parser,
1054             "From version 1.1, great care has been taken to keep the memory requirements as low as possible. "
1055             "There memory required is two times the size of the largest chromosome plus some constant memory "
1056             "for each match.");
1057     addText(parser,
1058             "For example, the memory usage for 100bp human genome reads at 5% error rate was 1.7GB. Of this, "
1059             "roughly 400GB came from the chromosome and 1.3GB from the matches.");
1060 
1061     addTextSection(parser, "References");
1062     addText(parser,
1063             "M. Holtgrewe, A.-K. Emde, D. Weese and K. Reinert.  A Novel And Well-Defined Benchmarking Method "
1064             "For Second Generation Read Mapping, BMC Bioinformatics 2011, 12:210.");
1065     addListItem(parser, "\\fIhttp://www.seqan.de/rabema\\fP", "RABEMA Homepage");
1066     addListItem(parser, "\\fIhttp://www.seqan.de/mason\\fP", "Mason Homepage");
1067 
1068     // Actually do the parsing and exit on error, help display etc.
1069     seqan::ArgumentParser::ParseResult res = parse(parser, argc, argv);
1070     if (res != seqan::ArgumentParser::PARSE_OK)
1071         return res;
1072 
1073     // -----------------------------------------------------------------------
1074     // Fill BuildGoldStandardOptions Object
1075     // -----------------------------------------------------------------------
1076 
1077     if (isSet(parser, "verbose"))
1078         options.verbosity = 2;
1079     if (isSet(parser, "very-verbose"))
1080         options.verbosity = 3;
1081 
1082     if (isSet(parser, "match-N"))
1083         options.matchN = true;
1084     if (isSet(parser, "oracle-mode"))
1085         options.oracleMode = true;
1086     if (isSet(parser, "max-error"))
1087     {
1088         getOptionValue(options.maxError, parser, "max-error");
1089         options.maxErrorSet = true;
1090     }
1091     CharString distanceMetric;
1092     getOptionValue(distanceMetric, parser, "distance-metric");
1093     if (distanceMetric == "hamming")
1094         options.distanceMetric = HAMMING_DISTANCE;
1095 
1096     if (isSet(parser, "out-gsi"))
1097         getOptionValue(options.outGsiPath, parser, "out-gsi");
1098     if (isSet(parser, "reference"))
1099         getOptionValue(options.referencePath, parser, "reference");
1100     if (isSet(parser, "in-bam"))
1101         getOptionValue(options.inBamPath, parser, "in-bam");
1102 
1103     return res;
1104 }
1105 
1106 // ----------------------------------------------------------------------------
1107 // Function main()
1108 // ----------------------------------------------------------------------------
1109 
main(int argc,char const ** argv)1110 int main(int argc, char const ** argv)
1111 {
1112     // Parse command line and store results in options.
1113     BuildGoldStandardOptions options;
1114     seqan::ArgumentParser::ParseResult parseRes = parseCommandLine(options, argc, argv);
1115     if (parseRes != seqan::ArgumentParser::PARSE_OK)
1116         return parseRes == seqan::ArgumentParser::PARSE_ERROR;
1117 
1118     double startTime = 0;  // For measuring time below.
1119 
1120     std::cerr << "==============================================================================\n"
1121               << "                RABEMA - Read Alignment BEnchMArk\n"
1122               << "==============================================================================\n"
1123               << "                      Building Gold Standard\n"
1124               << "==============================================================================\n"
1125               << "\n"
1126               << "____OPTIONS___________________________________________________________________\n\n";
1127 
1128     std::cerr << "Max error rate [%]    " << options.maxError << "\n"
1129               << "Oracle mode           " << yesNo(options.oracleMode) << '\n'
1130               << "Distance measure      " << metricName(options.distanceMetric) << "\n"
1131               << "Match Ns              " << yesNo(options.matchN) << '\n'
1132               << "GSI Output File       " << options.outGsiPath << '\n'
1133               << "SAM/BAM Input File    " << options.inBamPath << '\n'
1134               << "Reference File        " << options.referencePath << '\n'
1135               << "Verbosity             " << options.verbosity << "\n\n";
1136 
1137     std::cerr << "____LOADING FILES_____________________________________________________________\n\n";
1138 
1139     // =================================================================
1140     // Prepare File I/O.
1141     // =================================================================
1142 
1143     startTime = sysTime();
1144     std::cerr << "Reference Index       " << options.referencePath << ".fai ...";
1145     FaiIndex faiIndex;
1146     if (!open(faiIndex, toCString(options.referencePath)))
1147     {
1148         std::cerr << " FAILED (not fatal, we can just build it)\n";
1149         std::cerr << "Building Index        " << options.referencePath << ".fai ...";
1150         if (!build(faiIndex, toCString(options.referencePath)))
1151         {
1152             std::cerr << "Could not build FAI index.\n";
1153             return 1;
1154         }
1155         std::cerr << " OK\n";
1156         seqan::CharString faiPath = options.referencePath;
1157         append(faiPath, ".fai");
1158         std::cerr << "Reference Index       " << faiPath << " ...";
1159         try
1160         {
1161             save(faiIndex, toCString(faiPath));
1162         }
1163         catch (seqan::IOError const & ioErr)
1164         {
1165             std::cerr << "Could not write FAI index we just built (" << ioErr.what() << ").\n";
1166             return 1;
1167         }
1168         std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
1169     }
1170     std::cerr << " OK (" << length(faiIndex.indexEntryStore) << " seqs)\n";
1171 
1172     // Open SAM file and read in header.
1173     BamHeader bamHeader;
1174     BamFileIn inBam;
1175     if (!open(inBam, toCString(options.inBamPath)))
1176     {
1177         std::cerr << "Could not open SAM file.\n";
1178         return 1;
1179     }
1180     try
1181     {
1182         readHeader(bamHeader, inBam);
1183     }
1184     catch (seqan::ParseError const & ioErr)
1185     {
1186         std::cerr << "Could not read BAM header (" << ioErr.what() << ").\n";
1187         return 1;
1188     }
1189     std::cerr << " OK\n";
1190     std::cerr << "\nTook " << sysTime() - startTime << "s\n";
1191 
1192     // =================================================================
1193     // Build point-wise error curve.
1194     // =================================================================
1195     startTime = sysTime();
1196     TErrorCurves errorCurves;
1197     int res = 0;
1198     StringSet<CharString> readNameStore;
1199     // In oracle mode, we store the distance of the alignment from the SAM file for each read.  Otherwise, this variable
1200     // remains unused.
1201     String<int> readAlignmentDistances;
1202     if (options.distanceMetric == EDIT_DISTANCE)
1203         res = matchesToErrorFunction(errorCurves, readAlignmentDistances, inBam, readNameStore,
1204                                      contigNames(context(inBam)), faiIndex, options, MyersUkkonenReads());
1205     else // options.distanceMetric == DISTANCE_HAMMING
1206         res = matchesToErrorFunction(errorCurves, readAlignmentDistances, inBam, readNameStore,
1207                                      contigNames(context(inBam)), faiIndex, options, HammingSimple());
1208     if (res != 0)
1209         return 1;
1210 
1211     // =================================================================
1212     // Convert points in error curves to intervals and write them to
1213     // stdout or a file.
1214     // =================================================================
1215 
1216     // The two alternatives are equivalent after opening the file.
1217     startTime = sysTime();
1218     std::cerr << "\n____WRITING OUTPUT____________________________________________________________\n\n";
1219     VirtualStream<char, Output> gsiStream;
1220     if ((options.outGsiPath == "-" && !open(gsiStream, std::cout, Nothing())) ||
1221         !open(gsiStream, toCString(options.outGsiPath)))
1222     {
1223         std::cerr << "Could not open output file " << options.outGsiPath << ".\n";
1224         return 1;
1225     }
1226 
1227     if (options.outGsiPath == "-")
1228         std::cerr << "Writing to stdout ...";
1229     else
1230         std::cerr << "Writing to " << options.outGsiPath << " ...";
1231     intervalizeAndDumpErrorCurves(gsiStream, errorCurves, readAlignmentDistances, readNameStore,
1232                                   contigNames(context(inBam)), options, true);
1233     std::cerr << " DONE\n"
1234               << "\n Took " << sysTime() - startTime << " s\n";
1235 
1236     return 0;
1237 }
1238