1 /*==========================================================================
2 RazerS - Fast Read Mapping with Controlled Loss Rate
3 http://www.seqan.de/projects/razers.html
4
5 ============================================================================
6 Copyright (C) 2008 by David Weese
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your options) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ==========================================================================*/
21
22 #ifndef SEQAN_HEADER_OUTPUT_FORMAT_H
23 #define SEQAN_HEADER_OUTPUT_FORMAT_H
24
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28
29 #include "razers.h"
30 #include <seqan/align.h>
31
32 namespace seqan
33 {
34
35 //////////////////////////////////////////////////////////////////////////////
36 // Quality-based score
37
38 template <typename TQualityString = CharString>
39 struct Quality;
40
41 template <typename TValue, typename TQualityString>
42 class Score<TValue, Quality<TQualityString> >
43 {
44 public:
45 TValue data_match;
46 TValue data_mismatch;
47 TValue data_gap_extend;
48 TValue data_gap_open;
49
50 TQualityString const *data_qual;
51
52 public:
Score()53 Score():
54 data_match(0),
55 data_mismatch(-1),
56 data_gap_extend(-1),
57 data_gap_open(-1),
58 data_qual(NULL)
59 {
60 }
Score(TValue _match,TValue _mismatch,TValue _gap)61 Score(TValue _match, TValue _mismatch, TValue _gap):
62 data_match(_match),
63 data_mismatch(_mismatch),
64 data_gap_extend(_gap),
65 data_gap_open(_gap),
66 data_qual(NULL)
67 {
68 }
Score(TValue _match,TValue _mismatch,TValue _gap_extend,TValue _gap_open,TQualityString const & _qual)69 Score(TValue _match, TValue _mismatch, TValue _gap_extend, TValue _gap_open, TQualityString const &_qual):
70 data_match(_match),
71 data_mismatch(_mismatch),
72 data_gap_extend(_gap_extend),
73 data_gap_open(_gap_open),
74 data_qual(&_qual)
75 {
76 }
77
Score(Score const & other)78 Score(Score const & other):
79 data_match(other.data_match),
80 data_mismatch(other.data_mismatch),
81 data_gap_extend(other.data_gap_extend),
82 data_gap_open(other.data_gap_open),
83 data_qual(other.data_qual)
84 {
85 }
~Score()86 ~Score()
87 {
88 }
89
90 Score & operator = (Score const & other)
91 {
92 data_match = other.data_match;
93 data_mismatch = other.data_mismatch;
94 data_gap_extend = other.data_gap_extend;
95 data_gap_open = other.data_gap_open;
96 data_qual = other.data_qual;
97 return *this;
98 }
99 };
100
101 //////////////////////////////////////////////////////////////////////////////
102
103 template <typename TValue, typename TQualityString, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
104 inline TValue
score(Score<TValue,Quality<TQualityString>> const & me,TPos1 pos1,TPos2 pos2,TSeq1 const & seq1,TSeq2 const & seq2)105 score(Score<TValue, Quality<TQualityString> > const & me,
106 TPos1 pos1,
107 TPos2 pos2,
108 TSeq1 const &seq1,
109 TSeq2 const &seq2)
110 {
111 if (seq1[pos1] != seq2[pos2])
112 if (me.data_qual)
113 return (*me.data_qual)[pos2];
114 else
115 return scoreMismatch(me);
116 else
117 return scoreMatch(me);
118 }
119
120
121 //////////////////////////////////////////////////////////////////////////////
122 // Less-operators ...
123
124 // ... to sort matches and remove duplicates with equal gBegin
125 template <typename TReadMatch>
126 struct LessGPosRNo : public ::std::binary_function < TReadMatch, TReadMatch, bool >
127 {
operatorLessGPosRNo128 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
129 {
130 // genome position and orientation
131 if (a.gseqNo < b.gseqNo) return true;
132 if (a.gseqNo > b.gseqNo) return false;
133 if (a.gBegin < b.gBegin) return true;
134 if (a.gBegin > b.gBegin) return false;
135 if (a.orientation < b.orientation) return true;
136 if (a.orientation > b.orientation) return false;
137
138 // read number
139 if (a.rseqNo < b.rseqNo) return true;
140 if (a.rseqNo > b.rseqNo) return false;
141
142 // quality
143 return a.editDist < b.editDist;
144 }
145 };
146
147 //////////////////////////////////////////////////////////////////////////////
148 // Determine error distribution
149 template <typename TErrDistr, typename TMatches, typename TReads, typename TGenomes, typename TOptions>
150 inline unsigned
getErrorDistribution(TErrDistr & posError,TMatches & matches,TReads & reads,TGenomes & genomes,TOptions & options)151 getErrorDistribution(
152 TErrDistr &posError,
153 TMatches &matches,
154 TReads &reads,
155 TGenomes &genomes,
156 TOptions &options)
157 {
158 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
159 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
160
161 Dna5String genome;
162 unsigned unique = 0;
163 for (; it != itEnd; ++it)
164 {
165 if ((*it).orientation == '-') continue;
166
167 Dna5String const &read = reads[(*it).rseqNo];
168 genome = infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd);
169 if ((*it).orientation == 'R')
170 reverseComplement(genome);
171 for (unsigned i = 0; i < length(posError) && i < length(read); ++i)
172 if ((options.compMask[ordValue(genome[i])] & options.compMask[ordValue(read[i])]) == 0)
173 ++posError[i];
174 ++unique;
175 }
176 return unique;
177 }
178
179 template <typename TErrDistr, typename TCount1, typename TCount2, typename TMatches, typename TReads, typename TGenomes, typename TSpec>
180 inline unsigned
getErrorDistribution(TErrDistr & posError,TCount1 & insertions,TCount2 & deletions,TMatches & matches,TReads & reads,TGenomes & genomes,RazerSOptions<TSpec> & options)181 getErrorDistribution(
182 TErrDistr &posError,
183 TCount1 &insertions,
184 TCount2 &deletions,
185 TMatches &matches,
186 TReads &reads,
187 TGenomes &genomes,
188 RazerSOptions<TSpec> &options)
189 {
190 typedef Align<String<Dna5>, ArrayGaps> TAlign;
191 typedef typename Row<TAlign>::Type TRow;
192 typedef typename Iterator<TRow>::Type TIter;
193
194 //typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
195 typedef typename Position<TAlign>::Type TPosition;
196
197 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
198 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
199
200 Align<Dna5String, ArrayGaps> align;
201 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
202 if (options.hammingOnly)
203 scoreType.data_mismatch = -1;
204 resize(rows(align), 2);
205
206 unsigned unique = 0;
207 for (; it != itEnd; ++it)
208 {
209 if ((*it).orientation == '-') continue;
210
211 assignSource(row(align, 0), reads[(*it).rseqNo]);
212 assignSource(row(align, 1), infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd));
213 if ((*it).orientation == 'R')
214 reverseComplement(source(row(align, 1)));
215 globalAlignment(align, scoreType);
216
217 TRow& row0 = row(align, 0);
218 TRow& row1 = row(align, 1);
219
220 TPosition begin = beginPosition(cols(align));
221 TPosition end = endPosition(cols(align));
222
223 TIter it0 = iter(row0, begin);
224 TIter it1 = iter(row1, begin);
225 TIter end0 = iter(row0, end);
226
227 unsigned pos = 0;
228 for (; it0 != end0 && pos < length(posError); ++it0, ++it1)
229 {
230 if (isGap(it0))
231 ++insertions;
232 else
233 {
234 if (isGap(it1))
235 ++deletions;
236 else
237 if ((options.compMask[ordValue(getValue(it0))] & options.compMask[ordValue(getValue(it1))]) == 0)
238 ++posError[pos];
239 ++pos;
240 }
241 }
242 ++unique;
243 }
244 return unique;
245 }
246
247
248 //////////////////////////////////////////////////////////////////////////////
249 template <typename TFile, typename TSource, typename TSpec, typename TPosition>
250 inline void
dumpAlignment(TFile & target,Align<TSource,TSpec> const & source,TPosition begin_,TPosition end_)251 dumpAlignment(TFile & target, Align<TSource, TSpec> const & source, TPosition begin_, TPosition end_)
252 {
253 typedef Align<TSource, TSpec> const TAlign;
254 typedef typename Row<TAlign>::Type TRow;
255 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
256
257 TRowsPosition row_count = length(rows(source));
258
259 // Print sequences
260 for (TRowsPosition i = 0; i < row_count; ++i)
261 {
262 if (i == 0)
263 target << "#Read: ";
264 else
265 target << "#Genome: ";
266 TRow& row_ = row(source, i);
267 typedef typename Iterator<typename Row<TAlign>::Type const>::Type TIter;
268 TIter begin1_ = iter(row_, begin_);
269 TIter end1_ = iter(row_, end_);
270 for (; begin1_ != end1_; ++begin1_) {
271 if (isGap(begin1_))
272 target << gapValue<char>();
273 else
274 target << *begin1_;
275 }
276 target << '\n';
277 }
278 }
279
280 template<typename TMatches, typename TCounts, typename TOptions>
281 void
countCoocurrences(TMatches & matches,TCounts & cooc,TOptions & options)282 countCoocurrences(TMatches & matches, TCounts & cooc, TOptions & options)
283 {
284 clear(cooc);
285 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength) + 1;
286 resize(cooc,maxSeedErrors+1,0);
287 for (int i = 0; i < maxSeedErrors+1; ++i)
288 cooc[i] = 1;
289
290 int count = 0;
291 unsigned readNo = -1;
292 int preEditDist = -1;
293 typename Iterator<TMatches>::Type it = begin(matches,Standard());
294 typename Iterator<TMatches>::Type itEnd = end(matches,Standard());
295
296 for(; it != itEnd; ++it)
297 {
298 if ((*it).rseqNo == readNo)
299 {
300 if(preEditDist > 1) continue;// || dist > options.errorRate * maxReadLength + 1) continue;
301 int dist = (*it).seedEditDist - preEditDist;
302 if(dist > maxSeedErrors) continue;
303 if(dist < 0) ++cooc[0];
304 else ++cooc[dist];
305 }
306 else
307 {
308 readNo = (*it).rseqNo;
309 preEditDist = (*it).seedEditDist;
310 if(preEditDist <= 1) ++count;
311 }
312 }
313 for (unsigned i = 0; i < length(cooc); ++i)
314 {
315 cooc[i] = (int)(-4.343 * log((double)cooc[i]/count) );
316 if (cooc[i] < 0) cooc[i] = 0;
317 }
318 if(options._debugLevel > 1)
319 {
320 ::std::cerr << "[mapping_count] ";
321 for(unsigned j = 0; j < length(cooc); ++j)
322 ::std::cerr << cooc[j] << " ";
323 ::std::cerr << ::std::endl;
324 }
325
326 }
327
328
329 template<typename TAlign, typename TString, typename TIter>
330 void
getCigarLine(TAlign & align,TString & cigar,TString & mutations,TIter ali_it0,TIter ali_it0_stop,TIter ali_it1,TIter ali_it1_stop)331 getCigarLine(TAlign & align, TString & cigar, TString & mutations, TIter ali_it0, TIter ali_it0_stop, TIter ali_it1, TIter ali_it1_stop)
332 {
333
334 typedef typename Source<TAlign>::Type TSource;
335 typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
336
337 // typedef typename Row<TAlign>::Type TRow;
338 // typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
339
340 TStringIterator readBase = begin(source(row(align,0)));
341
342 int readPos = 0;
343 bool first = true;
344 int inserted = 0;
345 int deleted = 0;
346 while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
347 {
348 inserted = 0;
349 deleted = 0;
350 int matched = 0;
351 while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
352 {
353 ++readPos;
354 if(*ali_it1 != *ali_it0)
355 {
356 if(first) first = false;
357 else mutations << ",";
358 mutations << readPos <<*readBase;
359 }
360 ++readBase;
361 ++ali_it0;
362 ++ali_it1;
363 ++matched;
364 }
365 if(matched>0) cigar << matched<< "M" ;
366 while(ali_it0!=ali_it0_stop && isGap(ali_it0))
367 {
368 ++ali_it0;
369 ++ali_it1;
370 ++deleted;
371 }
372 if(deleted>0) cigar << deleted << "D";
373 while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
374 {
375 ++ali_it0;
376 ++ali_it1;
377 ++readPos;
378 if(first) first = false;
379 else mutations << ",";
380 mutations << readPos << *readBase;
381 ++readBase;
382 ++inserted;
383 }
384 if(inserted>0) cigar << inserted << "I";
385 }
386 // end gaps can happen in split mapping
387 while(ali_it0!=ali_it0_stop)
388 {
389 ++ali_it0;
390 ++deleted;
391 }
392 if(deleted>0) cigar << deleted << "D";
393 while(ali_it1 != ali_it1_stop)
394 {
395 ++ali_it1;
396 ++readPos;
397 if(first) first = false;
398 else mutations << ",";
399 mutations << readPos << *readBase;
400 ++inserted;
401 }
402 if(inserted>0) cigar << inserted << "I";
403
404 }
405
406 template<typename TAlign, typename TIter>
407 void
getCigarLine(TAlign & align,String<Pair<char,int>> & cigar,String<Pair<Dna5,int>> & mutations,int offset,TIter ali_it0,TIter ali_it0_stop,TIter ali_it1,TIter ali_it1_stop)408 getCigarLine(TAlign & align,
409 String<Pair<char,int> > & cigar,
410 String<Pair<Dna5,int> > & mutations,
411 int offset,
412 TIter ali_it0, TIter ali_it0_stop, TIter ali_it1, TIter ali_it1_stop)
413 {
414
415 typedef typename Source<TAlign>::Type TSource;
416 typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
417
418 // typedef typename Row<TAlign>::Type TRow;
419 // typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
420
421 TStringIterator readBase = begin(source(row(align,0)));
422
423 int readPos = 0;
424 int inserted = 0;
425 int deleted = 0;
426 while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
427 {
428 inserted = 0;
429 deleted = 0;
430 int matched = 0;
431 while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
432 {
433 ++readPos;
434 if(*ali_it1 != *ali_it0)
435 appendValue(mutations,Pair<Dna5,int>((Dna5)(*readBase),readPos + offset));
436 ++readBase;
437 ++ali_it0;
438 ++ali_it1;
439 ++matched;
440 }
441 if(matched>0) appendValue(cigar,Pair<char,int>('M',matched));
442 while(ali_it0!=ali_it0_stop && isGap(ali_it0))
443 {
444 ++ali_it0;
445 ++ali_it1;
446 ++deleted;
447 }
448 if(deleted>0) appendValue(cigar,Pair<char,int>('D',deleted));
449 while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
450 {
451 ++ali_it0;
452 ++ali_it1;
453 ++readPos;
454 appendValue(mutations,Pair<Dna5,int>((Dna5)(*readBase),readPos + offset));
455 ++readBase;
456 ++inserted;
457 }
458 if(inserted>0) appendValue(cigar,Pair<char,int>('I',inserted));
459 }
460 // end gaps can happen in split mapping
461 while(ali_it0!=ali_it0_stop)
462 {
463 ++ali_it0;
464 ++deleted;
465 }
466 if(deleted>0) appendValue(cigar,Pair<char,int>('D',deleted));
467 while(ali_it1 != ali_it1_stop)
468 {
469 ++ali_it1;
470 ++readPos;
471 appendValue(mutations,Pair<Dna5,int>((Dna5)(*readBase),readPos + offset));
472 ++inserted;
473 }
474 if(inserted>0) appendValue(cigar,Pair<char,int>('I',inserted + offset));
475
476 }
477
478
479
480 #ifdef RAZERS_DIRECT_MAQ_MAPPING
481 //////////////////////////////////////////////////////////////////////////////
482 // Assign mapping quality and remove suboptimal matches
483 template < typename TMatches, typename TReads, typename TCooc, typename TCounts, typename TSpec >
assignMappingQuality(TMatches & matches,TReads & reads,TCooc & cooc,TCounts & cnts,RazerSOptions<TSpec> & options)484 void assignMappingQuality(TMatches &matches, TReads & reads, TCooc & cooc, TCounts &cnts, RazerSOptions<TSpec> & options)
485 {
486 typedef typename Value<TMatches>::Type TMatch;
487 typedef typename Iterator<TMatches, Standard>::Type TIterator;
488
489 //matches are already sorted
490 //::std::sort(
491 // begin(matches, Standard()),
492 // end(matches, Standard()),
493 // LessRNoMQ<TMatch>());
494
495
496 int maxSeedErrors = (int)(options.errorRate*options.artSeedLength)+1;
497 unsigned readNo = -1;
498
499 TIterator it = begin(matches, Standard());
500 TIterator itEnd = end(matches, Standard());
501 TIterator dit = it;
502
503 int bestQualSum, secondBestQualSum;
504 int secondBestDist = -1 ,secondBestMatches = -1;
505 for (; it != itEnd; ++it)
506 {
507 if ((*it).orientation == '-') continue;
508 bool mappingQualityFound = false;
509 int mappingQuality = 0;
510 int qualTerm1,qualTerm2;
511
512 readNo = (*it).rseqNo;
513 bestQualSum = (*it).mScore;
514
515 if(++it!=itEnd && (*it).rseqNo==readNo)
516 {
517 secondBestQualSum = (*it).mScore;
518 secondBestDist = (*it).editDist;
519 secondBestDist = (*it).editDist;
520 secondBestMatches = cnts[1][readNo] >> 5;
521 //CHECKcnts secondBestMatches = cnts[secondBestDist][readNo];
522 // secondBestMatches = cnts[secondBestDist][readNo];
523 (*it).orientation = '-';
524 // if(secondBestDist<=bestDist) unique=0;
525 }
526 else secondBestQualSum = -1000;
527 --it; //it is now at best match of current rseqNo
528
529 int bestDist = (*it).editDist;
530 int kPrime = (*it).seedEditDist;
531 if((bestQualSum==secondBestQualSum) || (kPrime>maxSeedErrors))
532 mappingQualityFound = true; //mq=0
533 else{
534 if(secondBestQualSum == -1000) qualTerm1 = 99;
535 else
536 {
537 qualTerm1 = (int)(secondBestQualSum - bestQualSum - 4.343 * log((double)secondBestMatches));
538 //if (secondBestKPrime - kPrime <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual; //TODO abchecken was mehr sinn macht
539 if (secondBestDist - bestDist <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual;
540 }
541 float avgSeedQual = 0.0;
542 if(!mappingQualityFound)
543 {
544 //TODO!!! generalize and adapt to razers lossrates
545 // lossrate 0.42 => -10 log10(0.42) = 4
546 int kPrimeLoss = 4; // options.kPrimeLoss; // bezieht sich auf 3 fehler in 28bp
547 qualTerm2 = kPrimeLoss + cooc[maxSeedErrors-kPrime];
548
549 for(unsigned j = 0; j<options.artSeedLength; ++j)
550 {
551 int q = getQualityValue(reads[readNo][j]);//(int)((unsigned char)(reads[readNo][j])>>3);
552 if(q>options.mutationRateQual) q = options.mutationRateQual;
553 avgSeedQual+=q;
554 }
555 avgSeedQual/=options.artSeedLength;
556 //-10 log10(28-2) = 14;
557 //generalize to -10 log10(artSeedLength - maxSeedErrors +1 ) // 14 fits for seedlength 28 to 32 with 2 errors
558 if(avgSeedQual>14) qualTerm2 += (int)((maxSeedErrors-kPrime)*(avgSeedQual-14));
559 }
560 }
561 if (!mappingQualityFound) mappingQuality = (qualTerm1<qualTerm2) ? qualTerm1:qualTerm2;
562 if (mappingQuality < 0) mappingQuality = 0;
563 (*it).mScore = mappingQuality;
564
565 *dit = *it;
566 // if(secondBestQualSum != -1000) ++it;
567 ++dit;
568 }
569 resize(matches, dit - begin(matches, Standard()));
570 }
571 #endif
572
573
574 //////////////////////////////////////////////////////////////////////////////
575 // Dump an alignment
576 template <typename TFile, typename TSource, typename TSpec>
577 inline void
dumpAlignment(TFile & target,Align<TSource,TSpec> const & source)578 dumpAlignment(TFile & target, Align<TSource, TSpec> const & source)
579 {
580 typedef Align<TSource, TSpec> const TAlign;
581 typedef typename Row<TAlign>::Type TRow;
582 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
583 typedef typename Position<TAlign>::Type TPosition;
584
585 TRowsPosition row_count = length(rows(source));
586 TPosition begin_ = beginPosition(cols(source));
587 TPosition end_ = endPosition(cols(source));
588
589 // Print sequences
590 for(TRowsPosition i=0;i<row_count;++i) {
591 if (i == 0)
592 streamPut(target, "#Read: ");
593 else
594 streamPut(target, "#Genome: ");
595 TRow& row_ = row(source, i);
596 typedef typename Iterator<typename Row<TAlign>::Type const>::Type TIter;
597 TIter begin1_ = iter(row_, begin_);
598 TIter end1_ = iter(row_, end_);
599 for (; begin1_ != end1_; ++begin1_) {
600 if (isGap(begin1_))
601 streamPut(target, gapValue<char>());
602 else
603 streamPut(target, *begin1_);
604 }
605 streamPut(target, '\n');
606 }
607 }
608
609 template<typename TMatches, typename TReads, typename TReadNames, typename TSpec>
610 void
dumpUnmappedReads(TMatches & matches,TReads const & reads,TReadNames const & readIDs,RazerSOptions<TSpec> & options)611 dumpUnmappedReads(TMatches &matches,
612 TReads const &reads,
613 TReadNames const &readIDs,
614 RazerSOptions<TSpec> &options)
615 {
616 typedef typename Iterator<TMatches>::Type TIterator;
617
618 ::std::ofstream file;
619 CharString fileName = options.outputUnmapped;
620 if (empty(fileName))
621 return;
622
623 file.open(toCString(fileName), ::std::ios_base::out | ::std::ios_base::trunc);
624 if (!file.is_open()) {
625 ::std::cerr << "Failed to open output file" << ::std::endl;
626 return;
627 }
628
629 char headerChar = '>';
630 if(options.readsWithQualities || options.fastaIdQual)
631 headerChar = '@';
632
633 // sort according to read number
634 ::std::sort(begin(matches, Standard()), end(matches, Standard()), LessRNoGPos<TMatch>());
635
636 TIterator mIt = begin(matches,Standard());
637 TIterator mEnd = end(matches,Standard());
638
639 unsigned readNo = 0;
640 // output reads without any matches
641 while(mIt != mEnd && readNo < length(reads))
642 {
643 while((*mIt).rseqNo > readNo)
644 {
645 // std::cout << "readNo=" << readNo << " + ";
646 file << headerChar << readIDs[readNo] << std::endl << reads[readNo] << std::endl;
647 if(options.readsWithQualities || options.fastaIdQual)
648 {
649 file << '+' << std::endl;
650 for(unsigned j=0;j<length(reads[readNo]);++j)
651 file << (char)(getQualityValue(reads[readNo][j])+ 33);
652 file << std::endl;
653 }
654 ++readNo;
655 }
656 while((*mIt).rseqNo == readNo && mIt != mEnd)
657 {
658 ++mIt;
659 }
660 ++readNo;
661 }
662 // std::cout << std::endl << "readNoInBetween=" << readNo << " + " << std::endl;
663
664 while(readNo < length(reads))
665 {
666 // std::cout << "readNo=" << readNo << " + ";
667 file << headerChar << readIDs[readNo] << std::endl << reads[readNo] << std::endl;
668 if(options.readsWithQualities || options.fastaIdQual)
669 {
670 file << '+' << std::endl;
671 for(unsigned j=0;j<length(reads[readNo]);++j)
672 file << (char)(getQualityValue(reads[readNo][j])+ 33);
673 file << std::endl;
674 }
675 ++readNo;
676 }
677 return;
678
679
680 }
681
682
683 //////////////////////////////////////////////////////////////////////////////
684 // Output matches
685 template <
686 typename TMatches,
687 typename TGenomeNames,
688 typename TReads,
689 typename TReadRegions,
690 typename TReadNames,
691 typename TCounts,
692 typename TSpec
693 >
dumpMatches(TMatches & matches,TGenomeNames const & genomeIDs,StringSet<CharString> & genomeFileNameList,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReads const & reads,TReadRegions const & readRegions,TCounts & stats,TReadNames const & readIDs,CharString readFName,CharString errorPrbFileName,RazerSOptions<TSpec> & options)694 void dumpMatches(
695 TMatches &matches, // forward/reverse matches
696 TGenomeNames const &genomeIDs, // Genome names (read from Fasta file, currently unused)
697 StringSet<CharString> &genomeFileNameList, // list of genome names (e.g. {"hs_ref_chr1.fa","hs_ref_chr2.fa"})
698 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > &gnoToFileMap, //map to retrieve genome filename and sequence number within that file
699 TReads const &reads,
700 TReadRegions const &readRegions,
701 TCounts & stats, // Match statistics (possibly empty)
702 TReadNames const &readIDs, // Read names (read from Fasta file, currently unused)
703 CharString readFName, // read name (e.g. "reads.fa"), used for file/read naming
704 CharString errorPrbFileName,
705 RazerSOptions<TSpec> &options)
706 {
707 typedef typename Value<TMatches>::Type TMatch;
708 //typedef typename Value<TReads>::Type TRead;
709 //typedef typename Value<TGenomeSet>::Type TGenome;
710 typedef typename TMatch::TGPos TGPos;
711
712 //dump umapped reads in a separate file if filename was specified
713 dumpUnmappedReads(matches, reads, readIDs, options);
714
715 if (options.minMatchLen > 0 && options.outputFormat != 4) options.outputFormat = 33;
716
717 if (options.outputFormat == 2)
718 {
719 options.sortOrder = 0; // ... to count multiple matches
720 }
721
722 if (options.outputFormat == 2)
723 {
724 options.maxHits = 1; // Eland outputs at most one match
725 options.sortOrder = 0; // read numbers are increasing
726 options.positionFormat = 1; // bases in file are numbered starting at 1
727 options.dumpAlignment = options.hammingOnly;
728 }
729 #ifdef RAZERS_DIRECT_MAQ_MAPPING
730 if (options.maqMapping) options.outputFormat = 3;
731 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength); //without + 1 here, used to check whether match should be supported if noBelowIdentity option is switched on
732 #endif
733
734 if (options.outputFormat == 3)
735 {
736 options.sortOrder = 1; // sort according to gPos
737 options.positionFormat = 1; // bases in file are numbered starting at 1
738 //options.dumpAlignment = false;
739 }
740 if (options.outputFormat == 33 || options.outputFormat == 4)
741 {
742 options.positionFormat = 1; // bases in file are numbered starting at 1
743 options.dumpAlignment = false;
744 options.sortOrder = 1;
745 }
746
747
748 // error profile
749 unsigned maxReadLength = options.maxReadLength;
750
751 SEQAN_PROTIMESTART(dump_time);
752
753 // load Genome sequences for alignment dumps
754 TGenomeSet genomes;
755 if ((options.outputFormat == 0 && !options.hammingOnly) || options.dumpAlignment || !empty(errorPrbFileName) || options.outputFormat == 33 || options.outputFormat == 4)
756 if (!loadGenomes(genomes, genomeFileNameList)) {
757 ::std::cerr << "Failed to load genomes" << ::std::endl;
758 options.dumpAlignment = false;
759 }
760
761 // how many 0's should be padded?
762 int pzeros = 0;
763 for (unsigned l = length(reads); l > 9; l = l / 10)
764 ++pzeros;
765
766 int gzeros = 0;
767 for (unsigned l = length(genomes); l > 9; l = l / 10)
768 ++gzeros;
769
770 // remove the directory prefix of readFName
771 ::std::string _readName;
772 assign(_readName, readFName);
773 size_t lastPos = _readName.find_last_of('/') + 1;
774 if (lastPos == _readName.npos) lastPos = _readName.find_last_of('\\') + 1;
775 if (lastPos == _readName.npos) lastPos = 0;
776 CharString readName = _readName.substr(lastPos);
777
778
779 typedef Align<String<Dna5>, ArrayGaps> TAlign;
780 TAlign align, alignL, alignR;
781 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
782 // Score<int> scoreType(0,-1,-1,-1);
783
784 if (options.hammingOnly)
785 scoreType.data_mismatch = -1;
786 resize(rows(align), 2);
787 resize(rows(alignL), 2);
788 resize(rows(alignR), 2);
789
790 VirtualStream<char, Output> file;
791 bool success;
792 if (!isEqual(options.output, "-"))
793 success = open(file, toCString(options.output));
794 else
795 success = open(file, std::cout, Nothing());
796
797 if (!success) {
798 ::std::cerr << "Failed to open output file" << ::std::endl;
799 return;
800 }
801
802
803 String<short> ambiStates;
804 if(options.minMatchLen == 0)
805 maskDuplicates(matches);
806 if (options.outputFormat > 0
807 #ifdef RAZERS_DIRECT_MAQ_MAPPING
808 && !options.maqMapping
809 #endif
810 )
811 {
812 // match statistics
813 unsigned maxErrors = (int)(options.errorRate * maxReadLength);
814 //if (maxErrors > 10) maxErrors = 10;
815 resize(stats, maxErrors + 1);
816 for (unsigned i = 0; i <= maxErrors; ++i)
817 resize(stats[i], length(reads), 0);
818 countMatches(matches, stats);
819 }
820
821
822
823 Nothing nothing;
824 unsigned currSeqNo = 0;
825 #ifdef RAZERS_DIRECT_MAQ_MAPPING
826 if(options.maqMapping)
827 {
828 String<int> cooc;
829 compactMatches(matches, stats, options, true, nothing, false); //only best two matches per read are kept
830 countCoocurrences(matches,cooc,options); //coocurrence statistics are filled
831 assignMappingQuality(matches,reads,cooc,stats,options);//mapping qualities are assigned and only one match per read is kept
832 }
833 else
834 #endif
835
836 #ifdef RAZERS_MICRO_RNA
837 if(options.microRNA)purgeAmbiguousRnaMatches(matches,options);
838 else
839 #endif
840 {
841 if(options.minMatchLen>0)
842 compactAndCountSplicedMatches(matches, ambiStates, options, true);
843 else
844 compactMatches(matches, stats, options, true, nothing);
845 }
846 int jj = 0;
847 if(!(options.minMatchLen > 0 ))
848 switch (options.sortOrder) {
849 case 0:
850 ::std::sort(
851 begin(matches, Standard()),
852 end(matches, Standard()),
853 LessRNoGPos<TMatch>());
854 break;
855
856 case 1:
857 ::std::sort(
858 begin(matches, Standard()),
859 end(matches, Standard()),
860 LessGPosRNo<TMatch>());
861 break;
862
863 }
864
865 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
866 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
867
868
869 Dna5String gInf, gInfL, gInfR;
870 char _sep_ = '\t';
871 unsigned viewPosReadFirst = 0;
872 unsigned viewPosReadLast = 0;
873
874 switch (options.outputFormat)
875 {
876 case 0: // Razer Format
877 // _sep_ = ',';
878 for(; it != itEnd; ++it)
879 {
880 unsigned readLen = length(reads[(*it).rseqNo]);
881 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
882 #ifdef RAZERS_MICRO_RNA
883 percId = 100.0 * (1.0 - (double)(*it).editDist / (double) ((*it).mScore));
884 #endif
885 switch (options.readNaming)
886 {
887 // 0..filename is the read's Fasta id
888 case 0:
889 file << readIDs[(*it).rseqNo];
890 break;
891
892 // 1..filename is the read filename + seqNo
893 case 1:
894 file.fill('0');
895 file << readName << '#' << ::std::setw(pzeros) << (*it).rseqNo + 1;
896 break;
897
898 // 2..filename is the read sequence itself
899 case 2:
900 file << reads[(*it).rseqNo];
901 }
902
903 file << _sep_ << options.positionFormat << _sep_ << readLen << _sep_ << (*it).orientation << _sep_;
904
905 switch (options.genomeNaming)
906 {
907 // 0..filename is the read's Fasta id
908 case 0:
909 file << genomeIDs[(*it).gseqNo];
910 break;
911
912 // 1..filename is the read filename + seqNo
913 case 1:
914 file.fill('0');
915 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1;
916 }
917
918 // get alignment to dump or to fix end coordinates
919 if (options.dumpAlignment || !options.hammingOnly)
920 {
921 #ifdef RAZERS_MICRO_RNA
922 if(options.microRNA)
923 assignSource(row(align, 0), prefix(reads[(*it).rseqNo],(*it).mScore));
924 else
925 #endif
926 assignSource(row(align, 0), reads[(*it).rseqNo]);
927 assignSource(row(align, 1), infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd));
928 #ifdef RAZERS_MATEPAIRS
929 if ((*it).pairId != 0 && ((*it).rseqNo & 1))
930 reverseComplement(source(row(align, 0)));
931 #endif
932 if ((*it).orientation == 'R')
933 reverseComplement(source(row(align, 1)));
934
935 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
936
937 // transform first and last read character to genomic positions
938 viewPosReadFirst = toViewPosition(row(align, 0), 0);
939 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
940 unsigned genomePosReadFirst = toSourcePosition(row(align, 1), viewPosReadFirst);
941 unsigned genomePosReadLast = toSourcePosition(row(align, 1), viewPosReadLast);
942
943 if ((*it).orientation == 'R')
944 {
945 (*it).gBegin = (*it).gEnd - (genomePosReadLast + 1);
946 (*it).gEnd -= genomePosReadFirst;
947 }
948 else
949 {
950 (*it).gEnd = (*it).gBegin + (genomePosReadLast + 1);
951 (*it).gBegin += genomePosReadFirst;
952 }
953 }
954
955 file << _sep_ << ((*it).gBegin + options.positionFormat) << _sep_ << (*it).gEnd << _sep_ << ::std::setprecision(5) << percId;
956 #ifdef RAZERS_MICRO_RNA
957 if(options.microRNA) file << _sep_ << (*it).mScore;
958 #endif
959
960 #ifdef RAZERS_MATEPAIRS
961 if ((*it).pairId != 0)
962 file << _sep_ << (*it).pairId << _sep_ << (*it).pairScore << _sep_ << (*it).mateDelta;
963 #endif
964 file << ::std::endl;
965
966 if (options.dumpAlignment)
967 dumpAlignment(file, align, viewPosReadFirst, viewPosReadLast + 1);
968 }
969 break;
970
971
972 case 1: // Enhanced Fasta Format
973 _sep_ = ',';
974 for(unsigned matchReadNo = -1, matchReadCount = 0; it != itEnd; ++it)
975 {
976 unsigned readLen = length(reads[(*it).rseqNo]);
977 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
978
979 if (matchReadNo != (*it).rseqNo)
980 {
981 matchReadNo = (*it).rseqNo;
982 matchReadCount = 0;
983 } else
984 ++matchReadCount;
985
986 ::std::string fastaID;
987 assign(fastaID, readIDs[(*it).rseqNo]);
988
989 ::std::string id = fastaID;
990 int fragId = (*it).rseqNo;
991 bool appendMatchId = options.maxHits > 1;
992
993 size_t left = fastaID.find_first_of('[');
994 size_t right = fastaID.find_last_of(']');
995 if (left != fastaID.npos && right != fastaID.npos && left < right)
996 {
997 fastaID.erase(right);
998 fastaID.erase(0, left + 1);
999 replace(fastaID.begin(), fastaID.end(), ',', ' ');
1000 size_t pos = fastaID.find("id=");
1001 if (pos != fastaID.npos) {
1002 ::std::istringstream iss(fastaID.substr(pos + 3));
1003 iss >> id;
1004 // appendMatchId = false;
1005 }
1006 pos = fastaID.find("fragId=");
1007 if (pos != fastaID.npos) {
1008 ::std::istringstream iss(fastaID.substr(pos + 7));
1009 iss >> fragId;
1010 }
1011 }
1012
1013 if ((*it).orientation == 'F')
1014 // forward strand
1015 file << '>' << ((*it).gBegin + options.positionFormat) << _sep_ << (*it).gEnd;
1016 else
1017 // reverse strand (switch begin and end)
1018 file << '>' << (*it).gEnd << _sep_ << ((*it).gBegin + options.positionFormat);
1019
1020 unsigned ambig = 0;
1021 for (unsigned i = 0; i <= (*it).editDist && i < length(stats); ++i)
1022 ambig += stats[i][(*it).rseqNo];
1023
1024 file << "[id=" << id;
1025 if (appendMatchId) file << "_" << matchReadCount;
1026 file << ",fragId=" << fragId;
1027 file << ",contigId=" << genomeIDs[(*it).gseqNo];
1028 file << ",errors=" << (*it).editDist << ",percId=" << ::std::setprecision(5) << percId;
1029 file << ",ambiguity=" << ambig << ']' << ::std::endl;
1030
1031 file << reads[(*it).rseqNo] << ::std::endl;
1032 }
1033 break;
1034
1035
1036 case 2: // Eland Format
1037 _sep_ = '\t';
1038 for(unsigned readNo = 0; readNo < length(reads); ++readNo)
1039 {
1040 switch (options.readNaming)
1041 {
1042 // 0..filename is the read's Fasta id
1043 case 0:
1044 file << '>' << readIDs[readNo] << _sep_;
1045 break;
1046
1047 // 1..filename is the read filename + seqNo
1048 case 1:
1049 file.fill('0');
1050 file << readName << '#' << ::std::setw(pzeros) << readNo + 1 << _sep_;
1051 break;
1052 }
1053
1054 if (it == itEnd || readNo < (*it).rseqNo)
1055 {
1056 if (!empty(reads[readNo]))
1057 file << reads[readNo] << _sep_ << "NM" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << ::std::endl;
1058 else
1059 {
1060 for (unsigned i = 0; i < maxReadLength; ++i)
1061 file << '.';
1062 file << _sep_ << "QC" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << ::std::endl;
1063 }
1064 } else
1065 {
1066 file << reads[readNo] << _sep_;
1067 unsigned bestMatches = 1;
1068 if ((unsigned)(*it).editDist < length(stats))
1069 bestMatches = stats[(*it).editDist][readNo];
1070
1071 if (bestMatches == 0) file << '?'; // impossible
1072 if (bestMatches == 1) file << 'U'; // unique best match
1073 if (bestMatches > 1) file << 'R'; // non-unique best matches
1074
1075 file << (*it).editDist << _sep_ << stats[0][readNo] << _sep_ << stats[1][readNo] << _sep_ << stats[2][readNo];
1076
1077 if (bestMatches == 1)
1078 {
1079 file << _sep_;
1080 switch (options.genomeNaming)
1081 {
1082 // 0..filename is the read's Fasta id
1083 case 0:
1084 file << genomeIDs[(*it).gseqNo];
1085 break;
1086
1087 // 1..filename is the read filename + seqNo
1088 case 1:
1089 file.fill('0');
1090 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1;
1091 }
1092
1093 if ((*it).orientation == 'F')
1094 file << _sep_ << ((*it).gBegin + options.positionFormat) << _sep_ << 'F' << _sep_ << "..";
1095 else
1096 file << _sep_ << (*it).gEnd << _sep_ << 'R' << _sep_ << "..";
1097
1098 if ((*it).editDist > 0 && options.dumpAlignment && options.hammingOnly)
1099 {
1100 gInf = infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd);
1101 if ((*it).orientation == 'R')
1102 reverseComplement(gInf);
1103 for (unsigned i = 0; i < length(gInf); ++i)
1104 if ((options.compMask[ordValue(reads[readNo][i])] &
1105 options.compMask[ordValue(gInf[i])]) == 0)
1106 file << _sep_ << i + 1 << gInf[i];
1107 }
1108 }
1109 file << ::std::endl;
1110 ++it;
1111 }
1112 }
1113 break;
1114 case 3: // Gff: printf "$chr $name_$format read $pos %ld . $dir . ID=$col[0]$unique$rest\n",$pos+$len-1;
1115 // NOTE(marehr): filecount+=2 might be a potential bug [https://github.com/seqan/seqan/issues/2165]
1116 // In revision 4dbf27b55 and before, filecount was incremented twice at the
1117 // end of the for loop, which caused a compiler warning (once in the body
1118 // and once in the iteration_expression of the for loop). We kept this
1119 // behaviour, because we have no active maintainer for this app.
1120 for (unsigned filecount = 0; filecount < length(genomeFileNameList); filecount+=2)
1121 {
1122 // open genome file
1123 SeqFileIn gFile;
1124 if (!open(gFile, toCString(genomeFileNameList[filecount])))
1125 {
1126 std::cerr << "Couldn't open genome file." << std::endl;
1127 break;
1128 }
1129
1130 CharString currId;
1131 Dna5String currGenome;
1132
1133 // iterate over genome sequences
1134 for(; !atEnd(gFile); ++currSeqNo)
1135 {
1136 readRecord(currId, currGenome, gFile); // read Fasta sequence
1137 while(it != itEnd && (*it).gseqNo == currSeqNo)
1138 {
1139 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1140 if(options.maqMapping && options.noBelowIdentity && (*it).seedEditDist > maxSeedErrors)
1141 {
1142 ++it;
1143 continue;
1144 }
1145 #endif
1146
1147 unsigned currReadNo = (*it).rseqNo;
1148 int unique = 1;
1149 unsigned bestMatches = 0;
1150
1151 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1152 if(options.maqMapping)
1153 bestMatches = stats[0][currReadNo] >> 5;
1154 else
1155 #endif
1156 {
1157 if (bestMatches == 0 && (unsigned)(*it).editDist < length(stats))
1158 bestMatches = stats[(*it).editDist][currReadNo];
1159
1160 }
1161
1162 bool suboptimal = false;
1163 if (
1164 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1165 !options.maqMapping &&
1166 #endif
1167 (unsigned)(*it).editDist > 0 )
1168 {
1169 for(unsigned d = 0; d < (unsigned)(*it).editDist; ++d)
1170 if (stats[d][currReadNo]>0) suboptimal=true;
1171 }
1172
1173 if (bestMatches != 1)
1174 {
1175 unique = 0;
1176 if(options.purgeAmbiguous)
1177 {
1178 ++it;
1179 continue;
1180 }
1181
1182 // if((*it).mScore > 0) std::cout << (*it).mScore << "<-non uniq but score > 0\n";
1183 // ++it;
1184 // continue; // TODO: output non-unique matches (concerns maq mapping only)
1185 }
1186 unsigned readLen = length(reads[currReadNo]);
1187
1188 String<Dna5Q> readInf = infix(reads[currReadNo],0,readLen);
1189 switch (options.genomeNaming)
1190 {
1191 // 0..filename is the read's Fasta id
1192 case 0:
1193 file << genomeIDs[(*it).gseqNo] <<'\t';
1194 break;
1195 // 1..filename is the read filename + seqNo
1196 case 1:
1197 file.fill('0');
1198 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1 << '\t';
1199 break;
1200 }
1201
1202 // get alignment to dump or to fix end coordinates
1203 if (!options.hammingOnly)
1204 {
1205 assignSource(row(align, 0), readInf);
1206 //std::cout << "begin=" << (*it).gBegin <<" end="<< (*it).gEnd << std::endl;
1207 assignSource(row(align, 1), infix(currGenome, (*it).gBegin, (*it).gEnd));
1208 if ((*it).orientation == 'R')
1209 reverseComplement(source(row(align, 1)));
1210
1211 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
1212
1213 //std::cout << align << std::endl;
1214 //dumpAlignment(::std::cout, align);
1215
1216 // transform first and last read character to genomic positions
1217 viewPosReadFirst = toViewPosition(row(align, 0), 0);
1218 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
1219 TGPos genomePosReadFirst = toSourcePosition(row(align, 1), viewPosReadFirst);
1220 TGPos genomePosReadLast = toSourcePosition(row(align, 1), viewPosReadLast);
1221 if ((*it).orientation == 'R')
1222 {
1223 // watch out at chromosome borders
1224 (*it).gBegin = (*it).gEnd - _min((TGPos)(genomePosReadLast + 1),(*it).gEnd);
1225 (*it).gEnd -= genomePosReadFirst;
1226 }
1227 else
1228 {
1229 (*it).gEnd = (*it).gBegin + _min(genomePosReadLast + 1, static_cast<TGPos>(length(currGenome) - (*it).gBegin));
1230 (*it).gBegin += genomePosReadFirst;
1231 }
1232
1233 clear(row(align,1));
1234 clear(row(align,0));
1235 assignSource(row(align, 0), readInf);
1236 //std::cout << "begin=" << (*it).gBegin <<" end="<< (*it).gEnd << std::endl;
1237 assignSource(row(align, 1), infix(currGenome, (*it).gBegin, (*it).gEnd));
1238 if ((*it).orientation == 'R')
1239 reverseComplement(source(row(align, 1)));
1240
1241 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
1242
1243 //std::cout << align << std::endl;
1244 //dumpAlignment(::std::cout, align);
1245
1246 // transform first and last read character to genomic positions
1247 viewPosReadFirst = toViewPosition(row(align, 0), 0);
1248 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
1249
1250
1251 }
1252
1253 //file << options.runID << "_razers\tread";
1254 file << "razers\tread";
1255 file << '\t' << ((*it).gBegin + options.positionFormat) << '\t' << (*it).gEnd << '\t';
1256 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
1257 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1258 if(options.maqMapping)
1259 file << (*it).mScore << "\t";
1260 else
1261 #endif
1262 file << percId << "\t";
1263
1264 if ((*it).orientation == 'F')
1265 file << '+' << '\t' << '.' <<'\t';
1266 else
1267 file << '-' << '\t' << '.' <<'\t';
1268
1269 switch (options.readNaming)
1270 {
1271 // 0..filename is the read's Fasta id
1272 case 0:
1273 file << "ID=" <<readIDs[currReadNo];
1274 break;
1275
1276 // 1..filename is the read filename + seqNo
1277 case 1:
1278 file.fill('0');
1279 file << "ID=" << readName << '#' << ::std::setw(pzeros) << currReadNo + 1;
1280 break;
1281 }
1282 if(suboptimal) file << ";suboptimal";
1283 else
1284 {
1285 if(unique==1) file << ";unique";
1286 if(unique==0) file << ";multi";
1287 }
1288
1289
1290 if ((*it).editDist > 0)
1291 {
1292 if (options.hammingOnly)
1293 {
1294 gInf = infix(currGenome, (*it).gBegin, (*it).gEnd);
1295 if ((*it).orientation == 'R')
1296 reverseComplement(gInf);
1297 bool first = true;
1298 file << ";cigar=" << readLen << "M";
1299 file << ";mutations=";
1300 for (unsigned i = 0; i < length(gInf); ++i)
1301 if ((options.compMask[ordValue(readInf[i])] &
1302 options.compMask[ordValue(gInf[i])]) == 0)
1303 {
1304 if(first){ file << i + 1 << (Dna5)readInf[i]; first = false;}
1305 else file <<','<< i + 1 << (Dna5)readInf[i];
1306 }
1307
1308 }
1309 else
1310 {
1311 std::stringstream cigar, mutations;
1312 typedef typename Row<TAlign>::Type TRow;
1313 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
1314 TAlignIterator ali_it0 = iter(row(align,0),viewPosReadFirst);
1315 TAlignIterator ali_it1 = iter(row(align,1),viewPosReadFirst);
1316 TAlignIterator ali_it0stop = iter(row(align,0),viewPosReadLast + 1);
1317 TAlignIterator ali_it1stop = iter(row(align,1),viewPosReadLast + 1);
1318
1319 getCigarLine(align,cigar,mutations,ali_it0,ali_it0stop,ali_it1,ali_it1stop);
1320 file << ";cigar="<<cigar.str();
1321 if(length(mutations.str())>0)
1322 file << ";mutations=" << mutations.str();
1323 }
1324 }
1325
1326 if(
1327 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1328 options.maqMapping ||
1329 #endif
1330 options.fastaIdQual || options.readsWithQualities)
1331 {
1332 // file << ";read=";
1333 // for(unsigned j=0;j<length(reads[currReadNo]);++j)
1334 // {
1335 // file << (Dna5)reads[currReadNo][j];
1336 // }
1337 file << ";quality=";
1338 for(unsigned j=0;j<readLen;++j)
1339 {
1340 file << (char)(getQualityValue(readInf[j])+ 33);
1341 }
1342 }
1343
1344 file << ::std::endl;
1345 if(options.dumpAlignment)
1346 {
1347 if((*it).editDist > 0 && !options.hammingOnly)
1348 file << align;
1349 else
1350 {
1351 clear(row(align,1));
1352 clear(row(align,0));
1353 assignSource(row(align, 0), readInf);
1354 assignSource(row(align, 1), infix(currGenome, (*it).gBegin, (*it).gEnd));
1355 if ((*it).orientation == 'R')
1356 reverseComplement(source(row(align, 1)));
1357 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
1358 file << align;
1359
1360 }
1361 }
1362 ++it;
1363 }
1364 }
1365 close(gFile);
1366 }
1367 break;
1368 case 4: // SAM output for splitMatches
1369 while(it != itEnd)// && (*it).gseqNo == currSeqNo)
1370 {
1371 unsigned currReadNo = (*it).rseqNo;
1372 Dna5String &currGenome = genomes[(*it).gseqNo];
1373 unsigned readLen = length(reads[currReadNo]);
1374 TMatch& mL = *it;
1375 ++it;
1376 TMatch& mR = *it;
1377 unsigned readLenL = mL.mScore;
1378 unsigned readLenR = mR.mScore;
1379 String<Dna5Q> readInfL = infix(reads[currReadNo],0,readLenL);
1380 String<Dna5Q> readInfR = infix(reads[currReadNo],length(reads[currReadNo])-readLenR,length(reads[currReadNo]));
1381 int offsetL = 0;
1382 int offsetR = readLen - readLenR;
1383 if(mL.orientation == 'R')
1384 {
1385 offsetR = 0;
1386 offsetL = readLen - readLenL;
1387 readInfL = infix(reads[currReadNo],length(reads[currReadNo])-readLenL,length(reads[currReadNo]));
1388 readInfR = infix(reads[currReadNo],0,readLenR);
1389 }
1390
1391 // get alignment to dump and get cigar string
1392 if (!options.hammingOnly)
1393 {
1394 assignSource(row(alignL, 0), readInfL);
1395 assignSource(row(alignL, 1), infix(currGenome, mL.gBegin, mL.gEnd));
1396 if (mL.orientation == 'R')
1397 reverseComplement(source(row(alignL, 1)));
1398
1399 globalAlignment(alignL, scoreType, AlignConfig<false,false,false,false>(), Gotoh());
1400
1401 assignSource(row(alignR, 0), readInfR);
1402 assignSource(row(alignR, 1), infix(currGenome, mR.gBegin, mR.gEnd));
1403 if (mR.orientation == 'R')
1404 reverseComplement(source(row(alignR, 1)));
1405
1406 globalAlignment(alignR, scoreType, AlignConfig<false,false,false,false>(), Gotoh());
1407 }
1408
1409 switch (options.readNaming)
1410 {
1411 // 0..filename is the read's Fasta id
1412 case 0:
1413 file << "" <<readIDs[currReadNo];
1414 break;
1415 // 1..filename is the read filename + seqNo
1416 case 1:
1417 file.fill('0');
1418 file << "" << readName << '#' << ::std::setw(pzeros) << currReadNo + 1;
1419 break;
1420 }
1421 int samFlag = 0;
1422 if(ambiStates[jj] == 2) samFlag |= 0x100; //suboptimal
1423 if (mR.orientation == 'R') samFlag |= 0x10;
1424 if(!empty(readRegions) && options.anchored)
1425 {
1426 samFlag |= 1;
1427 if(readRegions[currReadNo].i2.i1 > 1) samFlag |= 0x10;
1428 else samFlag |= 0x20;
1429 if((readRegions[currReadNo].i2.i1 & 1) == 1) samFlag |= 0x80;
1430 else samFlag |= 0x40;
1431 }
1432 file << '\t' << samFlag << '\t';
1433
1434 switch (options.genomeNaming)
1435 {
1436 // 0..filename is the read's Fasta id
1437 case 0:
1438 file << genomeIDs[(*it).gseqNo] <<'\t';
1439 break;
1440 // 1..filename is the read filename + seqNo
1441 case 1:
1442 file.fill('0');
1443 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1 << '\t';
1444 break;
1445 }
1446
1447 file << (mL.gBegin + 1) << '\t' ;
1448
1449 //double percId = 100.0 * (1.0 - (double)(mL.editDist + mR.editDist) /(double)(mL.mScore+mR.mScore));
1450
1451 if(ambiStates[jj] == 0) file << 255 << '\t'; //unique
1452 if(ambiStates[jj] > 0) file << 0 << '\t'; //multi or suboptimal
1453 ++jj;
1454
1455 SEQAN_ASSERT(mL.pairId == mR.pairId);
1456
1457 int indelLen = mR.mScore + mL.mScore - readLen;
1458 if(mL.gEnd != mR.gBegin) indelLen = mR.gBegin - mL.gEnd;
1459
1460 String<Pair<Dna5,int> > mutationsStr, mutStrMid, mutStrL, mutStrR;
1461 String<Pair<char,int> > cigarStr, cigarMid, cigarL, cigarR;
1462 if(indelLen < 0)
1463 {
1464 int offset = readLenL;
1465 if(mL.orientation == 'R')
1466 offset = readLenR;
1467 String<Dna5Q> readInfMid = infix(reads[currReadNo],offset,offset - indelLen);
1468 for (int i = 0; i < -indelLen; ++i)
1469 appendValue(mutStrMid, Pair<Dna5,int>(readInfMid[i],i+offset+1));
1470 appendValue(cigarMid, Pair<char,int>('I',-indelLen));
1471
1472 }
1473 if(indelLen > 0) appendValue(cigarMid, Pair<char,int>('N',indelLen));
1474
1475 if (options.hammingOnly)
1476 {
1477 if(indelLen == 0)
1478 appendValue(cigarL, Pair<char,int>('M',readLen));
1479 else
1480 {
1481 appendValue(cigarL, Pair<char,int>('M',readLenL));
1482 appendValue(cigarR, Pair<char,int>('M',readLenR));
1483 }
1484
1485 //make left part of mutation string
1486 if (mL.editDist > 0)
1487 {
1488 gInfL = infix(currGenome, mL.gBegin, mL.gEnd);
1489 if (mL.orientation == 'R')
1490 reverseComplement(gInfL);
1491 for (unsigned i = 0; i < length(gInfL); ++i)
1492 if ((options.compMask[ordValue(readInfL[i])] & options.compMask[ordValue(gInfL[i])]) == 0)
1493 appendValue(mutStrL, Pair<Dna5,int>(readInfL[i],i+offsetL+1));
1494 }
1495
1496 //make right part of mutation string
1497 if (mR.editDist > 0)
1498 {
1499 gInfR = infix(currGenome, mR.gBegin, mR.gEnd);
1500 if (mR.orientation == 'R')
1501 reverseComplement(gInfR);
1502 for (unsigned i = 0; i < length(gInfR); ++i)
1503 if ((options.compMask[ordValue(readInfR[i])] & options.compMask[ordValue(gInfR[i])]) == 0)
1504 appendValue(mutStrR, Pair<Dna5,int>(readInfR[i],i+offsetR+1));
1505 }
1506 }
1507 else
1508 {
1509 typedef typename Row<TAlign>::Type TRow;
1510 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
1511
1512 //left business
1513 TAlignIterator aliL_it0 = begin(row(alignL,0));
1514 TAlignIterator aliL_it1 = begin(row(alignL,1));
1515 TAlignIterator aliL_it0stop = end(row(alignL,0));
1516 TAlignIterator aliL_it1stop = end(row(alignL,1));
1517 getCigarLine(alignL,cigarL,mutStrL,offsetL,aliL_it0,aliL_it0stop,aliL_it1,aliL_it1stop);
1518
1519 //right business
1520 TAlignIterator aliR_it0 = begin(row(alignR,0));
1521 TAlignIterator aliR_it1 = begin(row(alignR,1));
1522 TAlignIterator aliR_it0stop = end(row(alignR,0));
1523 TAlignIterator aliR_it1stop = end(row(alignR,1));
1524 getCigarLine(alignR,cigarR,mutStrR,offsetR,aliR_it0,aliR_it0stop,aliR_it1,aliR_it1stop);
1525 }
1526 //now plug together the parts: cigar
1527 if (mL.orientation == 'F')
1528 {
1529 append(cigarStr,cigarL);
1530 append(cigarStr,cigarMid);
1531 append(cigarStr,cigarR);
1532 append(mutationsStr,mutStrL);
1533 append(mutationsStr,mutStrMid);
1534 append(mutationsStr,mutStrR);
1535
1536 // print cigar string
1537 for (unsigned i = 0; i < length(cigarStr); ++i)
1538 file << cigarStr[i].i2 << cigarStr[i].i1;
1539 }
1540 else
1541 {
1542 append(cigarStr,cigarR);
1543 append(cigarStr,cigarMid);
1544 append(cigarStr,cigarL);
1545 append(mutationsStr,mutStrR);
1546 append(mutationsStr,mutStrMid);
1547 append(mutationsStr,mutStrL);
1548
1549 // print cigar string
1550 for (unsigned i = length(cigarStr); i != 0; )
1551 {
1552 --i;
1553 file << cigarStr[i].i2 << cigarStr[i].i1;
1554 }
1555 }
1556
1557 file << '\t';
1558
1559 if(!empty(readRegions) && options.anchored)
1560 {
1561
1562 switch (options.genomeNaming)
1563 {
1564 // 0..filename is the read's Fasta id
1565 case 0:
1566 file << genomeIDs[(*it).gseqNo] <<'\t';
1567 break;
1568 // 1..filename is the read filename + seqNo
1569 case 1:
1570 file.fill('0');
1571 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1 << '\t';
1572 break;
1573 }
1574
1575 file << readRegions[currReadNo].i2.i2 << '\t' ;
1576 //int libraryError;
1577 int templateLen = 0;
1578 if(readRegions[currReadNo].i2.i1 < 2)
1579 templateLen = readRegions[currReadNo].i2.i2 + readLen -(int) mL.gBegin;
1580 //libraryError = (int) mR.gEnd + readRegions[currReadNo].i2.i2 - options.libraryLength + 2*readLen;
1581 else
1582 templateLen = (int) mR.gEnd - readRegions[currReadNo].i2.i2;
1583 //libraryError = (int) mL.gBegin - readRegions[currReadNo].i2.i2 + options.libraryLength - readLen ;
1584 file << templateLen << '\t';
1585 }
1586 else file << "*\t*\t*\t";
1587
1588 if (mL.orientation == 'F')
1589 file << reads[currReadNo] << "\t*\t";
1590 else
1591 file << reverseComplementString(reads[currReadNo]) << "\t*\t";
1592 // quality
1593 if(options.fastaIdQual)
1594 {
1595 for(unsigned j=0;j<length(reads[currReadNo]);++j)
1596 {
1597 file << (char)(getQualityValue(reads[currReadNo][j])+ 33);
1598 }
1599 }
1600 file << "AS:i:" << mR.pairScore;
1601
1602 file << ::std::endl;
1603 ++it;
1604 }
1605 break;
1606 case 33: // special Gff for split reads
1607 while(it != itEnd)// && (*it).gseqNo == currSeqNo)
1608 {
1609 unsigned currReadNo = (*it).rseqNo;
1610 Dna5String &currGenome = genomes[(*it).gseqNo];
1611 unsigned readLen = length(reads[currReadNo]);
1612 TMatch& mL = *it;
1613 ++it;
1614 TMatch& mR = *it;
1615 unsigned readLenL = mL.mScore;
1616 unsigned readLenR = mR.mScore;
1617 SEQAN_ASSERT_LEQ(mL.mScore+mR.mScore,(int)readLen);
1618 String<Dna5Q> readInfL = infix(reads[currReadNo],0,readLenL);
1619 String<Dna5Q> readInfR = infix(reads[currReadNo],length(reads[currReadNo])-readLenR,length(reads[currReadNo]));
1620 int offsetL = 0;
1621 int offsetR = readLen - readLenR;
1622 if(mL.orientation == 'R')
1623 {
1624 offsetR = 0;
1625 offsetL = readLen - readLenL;
1626 readInfL = infix(reads[currReadNo],length(reads[currReadNo])-readLenL,length(reads[currReadNo]));
1627 readInfR = infix(reads[currReadNo],0,readLenR);
1628 }
1629
1630 switch (options.genomeNaming)
1631 {
1632 // 0..filename is the read's Fasta id
1633 case 0:
1634 file << genomeIDs[(*it).gseqNo] <<'\t';
1635 break;
1636 // 1..filename is the read filename + seqNo
1637 case 1:
1638 file.fill('0');
1639 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1 << '\t';
1640 break;
1641 }
1642
1643 // get alignment to dump and get cigar string
1644 if (!options.hammingOnly)
1645 {
1646 assignSource(row(alignL, 0), readInfL);
1647 assignSource(row(alignL, 1), infix(currGenome, mL.gBegin, mL.gEnd));
1648 if (mL.orientation == 'R')
1649 reverseComplement(source(row(alignL, 1)));
1650
1651 globalAlignment(alignL, scoreType, AlignConfig<false,false,false,false>(), Gotoh());
1652
1653 assignSource(row(alignR, 0), readInfR);
1654 assignSource(row(alignR, 1), infix(currGenome, mR.gBegin, mR.gEnd));
1655 if (mR.orientation == 'R')
1656 reverseComplement(source(row(alignR, 1)));
1657
1658 globalAlignment(alignR, scoreType, AlignConfig<false,false,false,false>(), Gotoh());
1659 }
1660
1661 //file << options.runID << "_razers\tread";
1662 file << "razers\tread";
1663 file << '\t' << (mL.gBegin + options.positionFormat) << '\t' << mR.gEnd << '\t';
1664 //?// double percId = 100.0 * (1.0 + (double)(mL.pairScore-mL.mScore-mR.mScore) / (double)readLen);
1665 double percId = 100.0 * (1.0 - (double)(mL.editDist + mR.editDist) /(double)(mL.mScore+mR.mScore));
1666 file << percId << "\t";
1667 // if(!empty(readRegions) && options.anchored)
1668 // {
1669 // if(readRegions[currReadNo].i2 < 0)
1670 // file << '-' << '\t' << '.' <<'\t';
1671 // else
1672 // file << '+' << '\t' << '.' <<'\t';
1673 // }
1674 // else
1675 {
1676 if (mL.orientation == 'F')
1677 file << '+' << '\t' << '.' <<'\t';
1678 else
1679 file << '-' << '\t' << '.' <<'\t';
1680 }
1681
1682 switch (options.readNaming)
1683 {
1684 // 0..filename is the read's Fasta id
1685 case 0:
1686 file << "ID=" <<readIDs[currReadNo];
1687 break;
1688 // 1..filename is the read filename + seqNo
1689 case 1:
1690 file.fill('0');
1691 file << "ID=" << readName << '#' << ::std::setw(pzeros) << currReadNo + 1;
1692 break;
1693 }
1694 if(ambiStates[jj] == 0) file << ";unique";
1695 if(ambiStates[jj] == 1) file << ";multi";
1696 if(ambiStates[jj] == 2) file << ";suboptimal";
1697 ++jj;
1698
1699 SEQAN_ASSERT(mL.pairId == mR.pairId);
1700
1701 int indelLen = mR.mScore + mL.mScore - readLen;
1702 if(mL.gEnd != mR.gBegin) indelLen = mR.gBegin - mL.gEnd;
1703
1704 file << ";pairScore=" << (unsigned int) mR.pairScore;
1705 if(!empty(readRegions) && options.anchored)
1706 {
1707 if(readRegions[currReadNo].i2.i1 < 2)
1708 file << ";libraryError=" << (int) readRegions[currReadNo].i2.i2 - mR.gEnd - (int) options.libraryLength + 2*readLen;
1709 else {
1710 int tmp = (int)mL.gBegin + readLen - (int)options.libraryLength - readRegions[currReadNo].i2.i2;
1711 file << ";libraryError=" << tmp ;
1712 }
1713
1714 }
1715
1716 if(mL.gBegin > mR.gBegin) std::cout << "falsch\n";
1717 // if(mR.traceExtension >= abs(indelLen))
1718 // file << ";traceExt=" << (unsigned int) mR.traceExtension;
1719
1720 String<Pair<Dna5,int> > mutationsStr, mutStrMid, mutStrL, mutStrR;
1721 String<Pair<char,int> > cigarStr, cigarMid, cigarL, cigarR;
1722 if(indelLen < 0)
1723 {
1724 int offset = readLenL;
1725 if(mL.orientation == 'R')
1726 offset = readLenR;
1727 String<Dna5Q> readInfMid = infix(reads[currReadNo],offset,offset - indelLen);
1728 for (int i = 0; i < -indelLen; ++i)
1729 appendValue(mutStrMid, Pair<Dna5,int>(readInfMid[i],i+offset+1));
1730 appendValue(cigarMid, Pair<char,int>('I',-indelLen));
1731
1732 }
1733 if(indelLen > 0) appendValue(cigarMid, Pair<char,int>('D',indelLen));
1734 if(indelLen != 0)
1735 file << ";split";
1736
1737 if (options.hammingOnly)
1738 {
1739 if(indelLen == 0)
1740 appendValue(cigarL, Pair<char,int>('M',readLen));
1741 else
1742 {
1743 appendValue(cigarL, Pair<char,int>('M',readLenL));
1744 appendValue(cigarR, Pair<char,int>('M',readLenR));
1745 }
1746
1747 //make left part of mutation string
1748 if (mL.editDist > 0)
1749 {
1750 gInfL = infix(currGenome, mL.gBegin, mL.gEnd);
1751 if (mL.orientation == 'R')
1752 reverseComplement(gInfL);
1753 for (unsigned i = 0; i < length(gInfL); ++i)
1754 if ((options.compMask[ordValue(readInfL[i])] & options.compMask[ordValue(gInfL[i])]) == 0)
1755 appendValue(mutStrL, Pair<Dna5,int>(readInfL[i],i+offsetL+1));
1756 }
1757
1758 //make right part of mutation string
1759 if (mR.editDist > 0)
1760 {
1761 gInfR = infix(currGenome, mR.gBegin, mR.gEnd);
1762 if (mR.orientation == 'R')
1763 reverseComplement(gInfR);
1764 for (unsigned i = 0; i < length(gInfR); ++i)
1765 if ((options.compMask[ordValue(readInfR[i])] & options.compMask[ordValue(gInfR[i])]) == 0)
1766 appendValue(mutStrR, Pair<Dna5,int>(readInfR[i],i+offsetR+1));
1767 }
1768 }
1769 else
1770 {
1771 typedef typename Row<TAlign>::Type TRow;
1772 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
1773
1774 //left business
1775 TAlignIterator aliL_it0 = begin(row(alignL,0));
1776 TAlignIterator aliL_it1 = begin(row(alignL,1));
1777 TAlignIterator aliL_it0stop = end(row(alignL,0));
1778 TAlignIterator aliL_it1stop = end(row(alignL,1));
1779 getCigarLine(alignL,cigarL,mutStrL,offsetL,aliL_it0,aliL_it0stop,aliL_it1,aliL_it1stop);
1780
1781 //right business
1782 TAlignIterator aliR_it0 = begin(row(alignR,0));
1783 TAlignIterator aliR_it1 = begin(row(alignR,1));
1784 TAlignIterator aliR_it0stop = end(row(alignR,0));
1785 TAlignIterator aliR_it1stop = end(row(alignR,1));
1786 getCigarLine(alignR,cigarR,mutStrR,offsetR,aliR_it0,aliR_it0stop,aliR_it1,aliR_it1stop);
1787 }
1788 //now plug together the parts: cigar
1789 if(mL.orientation=='F')
1790 {
1791 append(cigarStr,cigarL);
1792 append(cigarStr,cigarMid);
1793 append(cigarStr,cigarR);
1794 append(mutationsStr,mutStrL);
1795 append(mutationsStr,mutStrMid);
1796 append(mutationsStr,mutStrR);
1797 }
1798 else
1799 {
1800 append(cigarStr,cigarR);
1801 append(cigarStr,cigarMid);
1802 append(cigarStr,cigarL);
1803 append(mutationsStr,mutStrR);
1804 append(mutationsStr,mutStrMid);
1805 append(mutationsStr,mutStrL);
1806 }
1807 bool first = true;
1808 file << ";cigar=";// << cigarStr.str();
1809 for (unsigned i = 0; i < length(cigarStr); ++i)
1810 file << cigarStr[i].i2 << cigarStr[i].i1;
1811
1812 if(length(mutationsStr)>0)file << ";mutations=";
1813 for (unsigned i = 0; i < length(mutationsStr); ++i)
1814 {
1815 if(first)
1816 {
1817 first = false;
1818 file << mutationsStr[i].i2 << mutationsStr[i].i1;
1819 }
1820 else
1821 file << "," << mutationsStr[i].i2 << mutationsStr[i].i1;
1822 }
1823
1824 if(options.fastaIdQual)
1825 {
1826 file << ";quality=";
1827 for(unsigned j=0;j<length(reads[currReadNo]);++j)
1828 {
1829 file << (char)(getQualityValue(reads[currReadNo][j])+ 33);
1830 }
1831 }
1832 file << ::std::endl;
1833 ++it;
1834 }
1835 break;
1836
1837
1838 }
1839
1840 close(file);
1841
1842 // get empirical error distribution
1843 if (!empty(errorPrbFileName) && maxReadLength > 0)
1844 {
1845 std::ofstream file;
1846 file.open(toCString(errorPrbFileName), ::std::ios_base::out | ::std::ios_base::trunc);
1847 if (file.is_open())
1848 {
1849 String<long double> posError;
1850 unsigned unique = 0;
1851 unsigned insertions = 0;
1852 unsigned deletions = 0;
1853 resize(posError, maxReadLength, 0);
1854
1855 if (options.hammingOnly)
1856 unique = getErrorDistribution(posError, matches, reads, genomes, options);
1857 else
1858 {
1859 unique = getErrorDistribution(posError, insertions, deletions, matches, reads, genomes, options);
1860 ::std::cerr << "insertProb: " << (double)insertions / ((double)length(posError) * (double)unique) << ::std::endl;
1861 ::std::cerr << "deleteProb: " << (double)deletions / ((double)length(posError) * (double)unique) << ::std::endl;
1862 }
1863
1864 file << (double)posError[0] / (double)unique;
1865 for (unsigned i = 1; i < length(posError); ++i)
1866 file << '\t' << (double)posError[i] / (double)unique;
1867 file << ::std::endl;
1868 file.close();
1869 } else
1870 ::std::cerr << "Failed to open error distribution file" << ::std::endl;
1871 }
1872
1873 options.timeDumpResults = SEQAN_PROTIMEDIFF(dump_time);
1874
1875 if (options._debugLevel >= 1)
1876 ::std::cerr << "Dumping results took \t" << options.timeDumpResults << " seconds" << ::std::endl;
1877 }
1878
1879
1880 }
1881
1882 #endif
1883
1884