1 /*==========================================================================
2 RazerS - Fast Read Mapping with Controlled Loss Rate
3 http://www.seqan.de/projects/razers.html
4
5 ============================================================================
6 Copyright (C) 2008 by David Weese
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your options) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ==========================================================================*/
21
22 #ifndef SEQAN_HEADER_OUTPUT_FORMAT_H
23 #define SEQAN_HEADER_OUTPUT_FORMAT_H
24
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28
29 #include "razers.h"
30 #include <seqan/align.h>
31
32 namespace SEQAN_NAMESPACE_MAIN
33 {
34
35 //////////////////////////////////////////////////////////////////////////////
36 // Quality-based score
37
38 template <typename TQualityString = CharString>
39 struct Quality;
40
41 template <typename TValue, typename TQualityString>
42 class Score<TValue, Quality<TQualityString> >
43 {
44 public:
45 TValue data_match;
46 TValue data_mismatch;
47 TValue data_gap_extend;
48 TValue data_gap_open;
49
50 TQualityString const *data_qual;
51
52 public:
Score()53 Score():
54 data_match(0),
55 data_mismatch(-1),
56 data_gap_extend(-1),
57 data_gap_open(-1),
58 data_qual(NULL)
59 {
60 }
Score(TValue _match,TValue _mismatch,TValue _gap)61 Score(TValue _match, TValue _mismatch, TValue _gap):
62 data_match(_match),
63 data_mismatch(_mismatch),
64 data_gap_extend(_gap),
65 data_gap_open(_gap),
66 data_qual(NULL)
67 {
68 }
Score(TValue _match,TValue _mismatch,TValue _gap_extend,TValue _gap_open,TQualityString const & _qual)69 Score(TValue _match, TValue _mismatch, TValue _gap_extend, TValue _gap_open, TQualityString const &_qual):
70 data_match(_match),
71 data_mismatch(_mismatch),
72 data_gap_extend(_gap_extend),
73 data_gap_open(_gap_open),
74 data_qual(&_qual)
75 {
76 }
77
Score(Score const & other)78 Score(Score const & other):
79 data_match(other.data_match),
80 data_mismatch(other.data_mismatch),
81 data_gap_extend(other.data_gap_extend),
82 data_gap_open(other.data_gap_open),
83 data_qual(other.data_qual)
84 {
85 }
~Score()86 ~Score()
87 {
88 }
89
90 Score & operator = (Score const & other)
91 {
92 data_match = other.data_match;
93 data_mismatch = other.data_mismatch;
94 data_gap_extend = other.data_gap_extend;
95 data_gap_open = other.data_gap_open;
96 data_qual = other.data_qual;
97 return *this;
98 }
99 };
100
101 //////////////////////////////////////////////////////////////////////////////
102
103 template <typename TValue, typename TQualityString, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
104 inline TValue
score(Score<TValue,Quality<TQualityString>> const & me,TPos1 pos1,TPos2 pos2,TSeq1 const & seq1,TSeq2 const & seq2)105 score(Score<TValue, Quality<TQualityString> > const & me,
106 TPos1 pos1,
107 TPos2 pos2,
108 TSeq1 const &seq1,
109 TSeq2 const &seq2)
110 {
111 if (seq1[pos1] != seq2[pos2])
112 if (me.data_qual)
113 return (*me.data_qual)[pos2];
114 else
115 return scoreMismatch(me);
116 else
117 return scoreMatch(me);
118 }
119
120
121 //////////////////////////////////////////////////////////////////////////////
122 // Less-operators ...
123
124 // ... to sort matches and remove duplicates with equal gBegin
125 template <typename TReadMatch>
126 struct LessGPosRNo : public ::std::binary_function < TReadMatch, TReadMatch, bool >
127 {
operatorLessGPosRNo128 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
129 {
130 // genome position and orientation
131 if (a.gseqNo < b.gseqNo) return true;
132 if (a.gseqNo > b.gseqNo) return false;
133 if (a.gBegin < b.gBegin) return true;
134 if (a.gBegin > b.gBegin) return false;
135 if (a.orientation < b.orientation) return true;
136 if (a.orientation > b.orientation) return false;
137
138 // read number
139 if (a.rseqNo < b.rseqNo) return true;
140 if (a.rseqNo > b.rseqNo) return false;
141
142 // quality
143 return a.editDist < b.editDist;
144 }
145 };
146
147 //////////////////////////////////////////////////////////////////////////////
148 // Determine error distribution
149 template <typename TErrDistr, typename TMatches, typename TReads, typename TGenomes, typename TOptions>
150 inline unsigned
getErrorDistribution(TErrDistr & posError,TMatches & matches,TReads & reads,TGenomes & genomes,TOptions & options)151 getErrorDistribution(
152 TErrDistr &posError,
153 TMatches &matches,
154 TReads &reads,
155 TGenomes &genomes,
156 TOptions &options)
157 {
158 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
159 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
160
161 Dna5String genome;
162 unsigned unique = 0;
163 for (; it != itEnd; ++it)
164 {
165 if ((*it).orientation == '-') continue;
166
167 Dna5String const &read = reads[(*it).rseqNo];
168 genome = infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd);
169 if ((*it).orientation == 'R')
170 reverseComplement(genome);
171 for (unsigned i = 0; i < length(posError) && i < length(read); ++i)
172 if ((options.compMask[ordValue(genome[i])] & options.compMask[ordValue(read[i])]) == 0)
173 ++posError[i];
174 ++unique;
175 }
176 return unique;
177 }
178
179 template <typename TErrDistr, typename TCount1, typename TCount2, typename TMatches, typename TReads, typename TGenomes, typename TSpec>
180 inline unsigned
getErrorDistribution(TErrDistr & posError,TCount1 & insertions,TCount2 & deletions,TMatches & matches,TReads & reads,TGenomes & genomes,RazerSOptions<TSpec> & options)181 getErrorDistribution(
182 TErrDistr &posError,
183 TCount1 &insertions,
184 TCount2 &deletions,
185 TMatches &matches,
186 TReads &reads,
187 TGenomes &genomes,
188 RazerSOptions<TSpec> &options)
189 {
190 typedef Align<String<Dna5>, ArrayGaps> TAlign;
191 typedef typename Row<TAlign>::Type TRow;
192 typedef typename Iterator<TRow>::Type TIter;
193
194 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
195 typedef typename Position<TAlign>::Type TPosition;
196
197 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
198 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
199
200 Align<Dna5String, ArrayGaps> align;
201 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
202 if (options.hammingOnly)
203 scoreType.data_mismatch = -1;
204 resize(rows(align), 2);
205
206 unsigned unique = 0;
207 for (; it != itEnd; ++it)
208 {
209 if ((*it).orientation == '-') continue;
210
211 assignSource(row(align, 0), reads[(*it).rseqNo]);
212 assignSource(row(align, 1), infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd));
213 if ((*it).orientation == 'R')
214 reverseComplement(source(row(align, 1)));
215 globalAlignment(align, scoreType);
216
217 TRow& row0 = row(align, 0);
218 TRow& row1 = row(align, 1);
219
220 TPosition begin = beginPosition(cols(align));
221 TPosition end = endPosition(cols(align));
222
223 TIter it0 = iter(row0, begin);
224 TIter it1 = iter(row1, begin);
225 TIter end0 = iter(row0, end);
226
227 unsigned pos = 0;
228 for (; it0 != end0 && pos < length(posError); ++it0, ++it1)
229 {
230 if (isGap(it0))
231 ++insertions;
232 else
233 {
234 if (isGap(it1))
235 ++deletions;
236 else
237 if ((options.compMask[ordValue(getValue(it0))] & options.compMask[ordValue(getValue(it1))]) == 0)
238 ++posError[pos];
239 ++pos;
240 }
241 }
242 ++unique;
243 }
244 return unique;
245 }
246
247
248 //////////////////////////////////////////////////////////////////////////////
249 template <typename TFile, typename TSource, typename TSpec, typename TPosition>
250 inline void
dumpAlignment(TFile & target,Align<TSource,TSpec> const & source,TPosition begin_,TPosition end_)251 dumpAlignment(TFile & target, Align<TSource, TSpec> const & source, TPosition begin_, TPosition end_)
252 {
253 typedef Align<TSource, TSpec> const TAlign;
254 typedef typename Row<TAlign>::Type TRow;
255 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
256
257 TRowsPosition row_count = length(rows(source));
258
259 // Print sequences
260 for (TRowsPosition i = 0; i < row_count; ++i)
261 {
262 if (i == 0)
263 _streamWrite(target, "#Read: ");
264 else
265 _streamWrite(target, "#Genome: ");
266 TRow& row_ = row(source, i);
267 typedef typename Iterator<typename Row<TAlign>::Type const>::Type TIter;
268 TIter begin1_ = iter(row_, begin_);
269 TIter end1_ = iter(row_, end_);
270 for (; begin1_ != end1_; ++begin1_) {
271 if (isGap(begin1_)) _streamPut(target, gapValue<char>());
272 else _streamPut(target, *begin1_);
273 }
274 _streamPut(target, '\n');
275 }
276 }
277
278 template<typename TMatches, typename TCounts, typename TOptions>
279 void
countCoocurrences(TMatches & matches,TCounts & cooc,TOptions & options)280 countCoocurrences(TMatches & matches, TCounts & cooc, TOptions & options)
281 {
282 clear(cooc);
283 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength) + 1;
284 resize(cooc,maxSeedErrors+1,0);
285 for (int i = 0; i < maxSeedErrors+1; ++i)
286 cooc[i] = 1;
287
288 int count = 0;
289 unsigned readNo = -1;
290 int preEditDist = -1;
291 typename Iterator<TMatches>::Type it = begin(matches,Standard());
292 typename Iterator<TMatches>::Type itEnd = end(matches,Standard());
293
294 for(; it != itEnd; ++it)
295 {
296 if ((*it).rseqNo == readNo)
297 {
298 if(preEditDist > 1) continue;// || dist > options.errorRate * maxReadLength + 1) continue;
299 int dist = (*it).seedEditDist - preEditDist;
300 if(dist > maxSeedErrors) continue;
301 if(dist < 0) ++cooc[0];
302 else ++cooc[dist];
303 }
304 else
305 {
306 readNo = (*it).rseqNo;
307 preEditDist = (*it).seedEditDist;
308 if(preEditDist <= 1) ++count;
309 }
310 }
311 for (unsigned i = 0; i < length(cooc); ++i)
312 {
313 cooc[i] = (int)(-4.343 * log((double)cooc[i]/count) );
314 if (cooc[i] < 0) cooc[i] = 0;
315 }
316 if(options._debugLevel > 1)
317 {
318 ::std::cerr << "[mapping_count] ";
319 for(unsigned j = 0; j < length(cooc); ++j)
320 ::std::cerr << cooc[j] << " ";
321 ::std::cerr << ::std::endl;
322 }
323
324 }
325
326
327 template<typename TAlign, typename TString, typename TIter>
328 void
getCigarLine(TAlign & align,TString & cigar,TString & mutations,TIter ali_it0,TIter ali_it0_stop,TIter ali_it1,TIter ali_it1_stop)329 getCigarLine(TAlign & align, TString & cigar, TString & mutations, TIter ali_it0, TIter ali_it0_stop, TIter ali_it1, TIter ali_it1_stop)
330 {
331
332 typedef typename Source<TAlign>::Type TSource;
333 typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
334
335 // typedef typename Row<TAlign>::Type TRow;
336 // typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
337
338 TStringIterator readBase = begin(source(row(align,0)));
339
340 int readPos = 0;
341 bool first = true;
342 int inserted = 0;
343 int deleted = 0;
344 while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
345 {
346 inserted = 0;
347 deleted = 0;
348 int matched = 0;
349 while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
350 {
351 ++readPos;
352 if(*ali_it1 != *ali_it0)
353 {
354 if(first) first = false;
355 else mutations << ",";
356 mutations << readPos <<*readBase;
357 }
358 ++readBase;
359 ++ali_it0;
360 ++ali_it1;
361 ++matched;
362 }
363 if(matched>0) cigar << matched<< "M" ;
364 while(ali_it0!=ali_it0_stop && isGap(ali_it0))
365 {
366 ++ali_it0;
367 ++ali_it1;
368 ++deleted;
369 }
370 if(deleted>0) cigar << deleted << "D";
371 while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
372 {
373 ++ali_it0;
374 ++ali_it1;
375 ++readPos;
376 if(first) first = false;
377 else mutations << ",";
378 mutations << readPos << *readBase;
379 ++readBase;
380 ++inserted;
381 }
382 if(inserted>0) cigar << inserted << "I";
383 }
384 // end gaps can happen in split mapping
385 while(ali_it0!=ali_it0_stop)
386 {
387 ++ali_it0;
388 ++deleted;
389 }
390 if(deleted>0) cigar << deleted << "D";
391 while(ali_it1 != ali_it1_stop)
392 {
393 ++ali_it1;
394 ++readPos;
395 if(first) first = false;
396 else mutations << ",";
397 mutations << readPos << *readBase;
398 ++inserted;
399 }
400 if(inserted>0) cigar << inserted << "I";
401
402 }
403
404 template<typename TAlign, typename TIter>
405 void
getCigarLine(TAlign & align,String<Pair<char,int>> & cigar,String<Pair<Dna5,int>> & mutations,int offset,TIter ali_it0,TIter ali_it0_stop,TIter ali_it1,TIter ali_it1_stop)406 getCigarLine(TAlign & align,
407 String<Pair<char,int> > & cigar,
408 String<Pair<Dna5,int> > & mutations,
409 int offset,
410 TIter ali_it0, TIter ali_it0_stop, TIter ali_it1, TIter ali_it1_stop)
411 {
412
413 typedef typename Source<TAlign>::Type TSource;
414 typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
415
416 // typedef typename Row<TAlign>::Type TRow;
417 // typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
418
419 TStringIterator readBase = begin(source(row(align,0)));
420
421 int readPos = 0;
422 int inserted = 0;
423 int deleted = 0;
424 while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
425 {
426 inserted = 0;
427 deleted = 0;
428 int matched = 0;
429 while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
430 {
431 ++readPos;
432 if(*ali_it1 != *ali_it0)
433 appendValue(mutations,Pair<Dna5,int>((Dna5)(*readBase),readPos + offset));
434 ++readBase;
435 ++ali_it0;
436 ++ali_it1;
437 ++matched;
438 }
439 if(matched>0) appendValue(cigar,Pair<char,int>('M',matched));
440 while(ali_it0!=ali_it0_stop && isGap(ali_it0))
441 {
442 ++ali_it0;
443 ++ali_it1;
444 ++deleted;
445 }
446 if(deleted>0) appendValue(cigar,Pair<char,int>('D',deleted));
447 while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
448 {
449 ++ali_it0;
450 ++ali_it1;
451 ++readPos;
452 appendValue(mutations,Pair<Dna5,int>((Dna5)(*readBase),readPos + offset));
453 ++readBase;
454 ++inserted;
455 }
456 if(inserted>0) appendValue(cigar,Pair<char,int>('I',inserted));
457 }
458 // end gaps can happen in split mapping
459 while(ali_it0!=ali_it0_stop)
460 {
461 ++ali_it0;
462 ++deleted;
463 }
464 if(deleted>0) appendValue(cigar,Pair<char,int>('D',deleted));
465 while(ali_it1 != ali_it1_stop)
466 {
467 ++ali_it1;
468 ++readPos;
469 appendValue(mutations,Pair<Dna5,int>((Dna5)(*readBase),readPos + offset));
470 ++inserted;
471 }
472 if(inserted>0) appendValue(cigar,Pair<char,int>('I',inserted + offset));
473
474 }
475
476
477
478 #ifdef RAZERS_DIRECT_MAQ_MAPPING
479 //////////////////////////////////////////////////////////////////////////////
480 // Assign mapping quality and remove suboptimal matches
481 template < typename TMatches, typename TReads, typename TCooc, typename TCounts, typename TSpec >
assignMappingQuality(TMatches & matches,TReads & reads,TCooc & cooc,TCounts & cnts,RazerSOptions<TSpec> & options)482 void assignMappingQuality(TMatches &matches, TReads & reads, TCooc & cooc, TCounts &cnts, RazerSOptions<TSpec> & options)
483 {
484 typedef typename Value<TMatches>::Type TMatch;
485 typedef typename Iterator<TMatches, Standard>::Type TIterator;
486
487 //matches are already sorted
488 //::std::sort(
489 // begin(matches, Standard()),
490 // end(matches, Standard()),
491 // LessRNoMQ<TMatch>());
492
493
494 int maxSeedErrors = (int)(options.errorRate*options.artSeedLength)+1;
495 unsigned readNo = -1;
496
497 TIterator it = begin(matches, Standard());
498 TIterator itEnd = end(matches, Standard());
499 TIterator dit = it;
500
501 int bestQualSum, secondBestQualSum;
502 int secondBestDist = -1 ,secondBestMatches = -1;
503 for (; it != itEnd; ++it)
504 {
505 if ((*it).orientation == '-') continue;
506 bool mappingQualityFound = false;
507 int mappingQuality = 0;
508 int qualTerm1,qualTerm2;
509
510 readNo = (*it).rseqNo;
511 bestQualSum = (*it).mScore;
512
513 if(++it!=itEnd && (*it).rseqNo==readNo)
514 {
515 secondBestQualSum = (*it).mScore;
516 secondBestDist = (*it).editDist;
517 secondBestDist = (*it).editDist;
518 secondBestMatches = cnts[1][readNo] >> 5;
519 //CHECKcnts secondBestMatches = cnts[secondBestDist][readNo];
520 // secondBestMatches = cnts[secondBestDist][readNo];
521 (*it).orientation = '-';
522 // if(secondBestDist<=bestDist) unique=0;
523 }
524 else secondBestQualSum = -1000;
525 --it; //it is now at best match of current rseqNo
526
527 int bestDist = (*it).editDist;
528 int kPrime = (*it).seedEditDist;
529 if((bestQualSum==secondBestQualSum) || (kPrime>maxSeedErrors))
530 mappingQualityFound = true; //mq=0
531 else{
532 if(secondBestQualSum == -1000) qualTerm1 = 99;
533 else
534 {
535 qualTerm1 = (int)(secondBestQualSum - bestQualSum - 4.343 * log((double)secondBestMatches));
536 //if (secondBestKPrime - kPrime <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual; //TODO abchecken was mehr sinn macht
537 if (secondBestDist - bestDist <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual;
538 }
539 float avgSeedQual = 0.0;
540 if(!mappingQualityFound)
541 {
542 //TODO!!! generalize and adapt to razers lossrates
543 // lossrate 0.42 => -10 log10(0.42) = 4
544 int kPrimeLoss = 4; // options.kPrimeLoss; // bezieht sich auf 3 fehler in 28bp
545 qualTerm2 = kPrimeLoss + cooc[maxSeedErrors-kPrime];
546
547 for(unsigned j = 0; j<options.artSeedLength; ++j)
548 {
549 int q = getQualityValue(reads[readNo][j]);//(int)((unsigned char)(reads[readNo][j])>>3);
550 if(q>options.mutationRateQual) q = options.mutationRateQual;
551 avgSeedQual+=q;
552 }
553 avgSeedQual/=options.artSeedLength;
554 //-10 log10(28-2) = 14;
555 //generalize to -10 log10(artSeedLength - maxSeedErrors +1 ) // 14 fits for seedlength 28 to 32 with 2 errors
556 if(avgSeedQual>14) qualTerm2 += (int)((maxSeedErrors-kPrime)*(avgSeedQual-14));
557 }
558 }
559 if (!mappingQualityFound) mappingQuality = (qualTerm1<qualTerm2) ? qualTerm1:qualTerm2;
560 if (mappingQuality < 0) mappingQuality = 0;
561 (*it).mScore = mappingQuality;
562
563 *dit = *it;
564 // if(secondBestQualSum != -1000) ++it;
565 ++dit;
566 }
567 resize(matches, dit - begin(matches, Standard()));
568 }
569 #endif
570
571
572 //////////////////////////////////////////////////////////////////////////////
573 // Dump an alignment
574 template <typename TFile, typename TSource, typename TSpec>
575 inline void
dumpAlignment(TFile & target,Align<TSource,TSpec> const & source)576 dumpAlignment(TFile & target, Align<TSource, TSpec> const & source)
577 {
578 typedef Align<TSource, TSpec> const TAlign;
579 typedef typename Row<TAlign>::Type TRow;
580 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
581 typedef typename Position<TAlign>::Type TPosition;
582
583 TRowsPosition row_count = length(rows(source));
584 TPosition begin_ = beginPosition(cols(source));
585 TPosition end_ = endPosition(cols(source));
586
587 // Print sequences
588 for(TRowsPosition i=0;i<row_count;++i) {
589 if (i == 0)
590 _streamWrite(target, "#Read: ");
591 else
592 _streamWrite(target, "#Genome: ");
593 TRow& row_ = row(source, i);
594 typedef typename Iterator<typename Row<TAlign>::Type const>::Type TIter;
595 TIter begin1_ = iter(row_, begin_);
596 TIter end1_ = iter(row_, end_);
597 for (; begin1_ != end1_; ++begin1_) {
598 if (isGap(begin1_)) _streamPut(target, gapValue<char>());
599 else _streamPut(target, *begin1_);
600 }
601 _streamPut(target, '\n');
602 }
603 }
604
605
606 //////////////////////////////////////////////////////////////////////////////
607 // Output matches
608 template <
609 typename TMatches,
610 typename TGenomeNames,
611 typename TReads,
612 typename TReadNames,
613 typename TCounts,
614 typename TSpec
615 >
dumpMatches(TMatches & matches,TGenomeNames const & genomeIDs,StringSet<CharString> & genomeFileNameList,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReads const & reads,TCounts & stats,TReadNames const & readIDs,CharString readFName,CharString errorPrbFileName,RazerSOptions<TSpec> & options)616 void dumpMatches(
617 TMatches &matches, // forward/reverse matches
618 TGenomeNames const &genomeIDs, // Genome names (read from Fasta file, currently unused)
619 StringSet<CharString> &genomeFileNameList, // list of genome names (e.g. {"hs_ref_chr1.fa","hs_ref_chr2.fa"})
620 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > &gnoToFileMap, //map to retrieve genome filename and sequence number within that file
621 TReads const &reads,
622 TCounts & stats, // Match statistics (possibly empty)
623 TReadNames const &readIDs, // Read names (read from Fasta file, currently unused)
624 CharString readFName, // read name (e.g. "reads.fa"), used for file/read naming
625 CharString errorPrbFileName,
626 RazerSOptions<TSpec> &options)
627 {
628 typedef typename Value<TMatches>::Type TMatch;
629 typedef typename Value<TReads>::Type TRead;
630 typedef typename Value<TGenomeSet>::Type TGenome;
631 typedef typename TMatch::TGPos TGPos;
632
633 if (options.minMatchLen > 0) options.outputFormat = 33;
634
635 if (options.outputFormat == 2)
636 {
637 options.sortOrder = 0; // ... to count multiple matches
638 }
639
640 if (options.outputFormat == 2)
641 {
642 options.maxHits = 1; // Eland outputs at most one match
643 options.sortOrder = 0; // read numbers are increasing
644 options.positionFormat = 1; // bases in file are numbered starting at 1
645 options.dumpAlignment = options.hammingOnly;
646 }
647 #ifdef RAZERS_DIRECT_MAQ_MAPPING
648 if (options.maqMapping) options.outputFormat = 3;
649 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength); //without + 1 here, used to check whether match should be supported if noBelowIdentity option is switched on
650 #endif
651
652 if (options.outputFormat == 3)
653 {
654 options.sortOrder = 1; // sort according to gPos
655 options.positionFormat = 1; // bases in file are numbered starting at 1
656 //options.dumpAlignment = false;
657 }
658 if (options.outputFormat == 33)
659 {
660 options.positionFormat = 1; // bases in file are numbered starting at 1
661 options.dumpAlignment = false;
662 options.sortOrder = 1;
663 }
664
665
666 // error profile
667 unsigned maxReadLength = options.maxReadLength;
668
669 SEQAN_PROTIMESTART(dump_time);
670
671 // load Genome sequences for alignment dumps
672 TGenomeSet genomes;
673 if ((options.outputFormat == 0 && !options.hammingOnly) || options.dumpAlignment || !empty(errorPrbFileName) || options.outputFormat == 33)
674 if (!loadGenomes(genomes, genomeFileNameList)) {
675 ::std::cerr << "Failed to load genomes" << ::std::endl;
676 options.dumpAlignment = false;
677 }
678
679 // how many 0's should be padded?
680 int pzeros = 0;
681 for (unsigned l = length(reads); l > 9; l = l / 10)
682 ++pzeros;
683
684 int gzeros = 0;
685 for (unsigned l = length(genomes); l > 9; l = l / 10)
686 ++gzeros;
687
688 // remove the directory prefix of readFName
689 ::std::string _readName;
690 assign(_readName, readFName);
691 size_t lastPos = _readName.find_last_of('/') + 1;
692 if (lastPos == _readName.npos) lastPos = _readName.find_last_of('\\') + 1;
693 if (lastPos == _readName.npos) lastPos = 0;
694 CharString readName = _readName.substr(lastPos);
695
696
697 typedef Align<String<Dna5>, ArrayGaps> TAlign;
698 TAlign align, alignL, alignR;
699 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
700 // Score<int> scoreType(0,-1,-1,-1);
701
702 if (options.hammingOnly)
703 scoreType.data_mismatch = -1;
704 resize(rows(align), 2);
705 resize(rows(alignL), 2);
706 resize(rows(alignR), 2);
707
708 ::std::ofstream file;
709 CharString fileName = options.output;
710 if (empty(fileName))
711 {
712 fileName = readFName;
713 append(fileName, ".result");
714 }
715
716 file.open(toCString(fileName), ::std::ios_base::out | ::std::ios_base::trunc);
717 if (!file.is_open()) {
718 ::std::cerr << "Failed to open output file" << ::std::endl;
719 return;
720 }
721
722
723 String<short> ambiStates;
724 if(options.minMatchLen == 0)
725 maskDuplicates(matches);
726 if (options.outputFormat > 0
727 #ifdef RAZERS_DIRECT_MAQ_MAPPING
728 && !options.maqMapping
729 #endif
730 )
731 {
732 // match statistics
733 unsigned maxErrors = (int)(options.errorRate * maxReadLength);
734 //if (maxErrors > 10) maxErrors = 10;
735 resize(stats, maxErrors + 1);
736 for (unsigned i = 0; i <= maxErrors; ++i)
737 resize(stats[i], length(reads), 0);
738 countMatches(matches, stats);
739 }
740
741
742
743 Nothing nothing;
744 unsigned currSeqNo = 0;
745 #ifdef RAZERS_DIRECT_MAQ_MAPPING
746 if(options.maqMapping)
747 {
748 String<int> cooc;
749 compactMatches(matches, stats, options, true, nothing, false); //only best two matches per read are kept
750 countCoocurrences(matches,cooc,options); //coocurrence statistics are filled
751 assignMappingQuality(matches,reads,cooc,stats,options);//mapping qualities are assigned and only one match per read is kept
752 }
753 else
754 #endif
755
756 #ifdef RAZERS_MICRO_RNA
757 if(options.microRNA)purgeAmbiguousRnaMatches(matches,options);
758 else
759 #endif
760 {
761 if(options.minMatchLen>0)
762 compactAndCountSplicedMatches(matches, ambiStates, options, true);
763 else
764 compactMatches(matches, stats, options, true, nothing);
765 }
766 int jj = 0;
767 if(!(options.minMatchLen > 0 ))
768 switch (options.sortOrder) {
769 case 0:
770 ::std::sort(
771 begin(matches, Standard()),
772 end(matches, Standard()),
773 LessRNoGPos<TMatch>());
774 break;
775
776 case 1:
777 ::std::sort(
778 begin(matches, Standard()),
779 end(matches, Standard()),
780 LessGPosRNo<TMatch>());
781 break;
782
783 }
784
785 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
786 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
787
788
789 Dna5String gInf, gInfL, gInfR;
790 char _sep_ = '\t';
791 unsigned viewPosReadFirst = 0;
792 unsigned viewPosReadLast = 0;
793
794 switch (options.outputFormat)
795 {
796 case 0: // Razer Format
797 // _sep_ = ',';
798 for(; it != itEnd; ++it)
799 {
800 unsigned readLen = length(reads[(*it).rseqNo]);
801 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
802 #ifdef RAZERS_MICRO_RNA
803 percId = 100.0 * (1.0 - (double)(*it).editDist / (double) ((*it).mScore));
804 #endif
805 switch (options.readNaming)
806 {
807 // 0..filename is the read's Fasta id
808 case 0:
809 file << readIDs[(*it).rseqNo];
810 break;
811
812 // 1..filename is the read filename + seqNo
813 case 1:
814 file.fill('0');
815 file << readName << '#' << ::std::setw(pzeros) << (*it).rseqNo + 1;
816 break;
817
818 // 2..filename is the read sequence itself
819 case 2:
820 file << reads[(*it).rseqNo];
821 }
822
823 file << _sep_ << options.positionFormat << _sep_ << readLen << _sep_ << (*it).orientation << _sep_;
824
825 switch (options.genomeNaming)
826 {
827 // 0..filename is the read's Fasta id
828 case 0:
829 file << genomeIDs[(*it).gseqNo];
830 break;
831
832 // 1..filename is the read filename + seqNo
833 case 1:
834 file.fill('0');
835 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1;
836 }
837
838 // get alignment to dump or to fix end coordinates
839 if (options.dumpAlignment || !options.hammingOnly)
840 {
841 #ifdef RAZERS_MICRO_RNA
842 if(options.microRNA)
843 assignSource(row(align, 0), prefix(reads[(*it).rseqNo],(*it).mScore));
844 else
845 #endif
846 assignSource(row(align, 0), reads[(*it).rseqNo]);
847 assignSource(row(align, 1), infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd));
848 #ifdef RAZERS_MATEPAIRS
849 if ((*it).pairId != 0 && ((*it).rseqNo & 1))
850 reverseComplement(source(row(align, 0)));
851 #endif
852 if ((*it).orientation == 'R')
853 reverseComplement(source(row(align, 1)));
854
855 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
856
857 // transform first and last read character to genomic positions
858 viewPosReadFirst = toViewPosition(row(align, 0), 0);
859 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
860 unsigned genomePosReadFirst = toSourcePosition(row(align, 1), viewPosReadFirst);
861 unsigned genomePosReadLast = toSourcePosition(row(align, 1), viewPosReadLast);
862
863 if ((*it).orientation == 'R')
864 {
865 (*it).gBegin = (*it).gEnd - (genomePosReadLast + 1);
866 (*it).gEnd -= genomePosReadFirst;
867 }
868 else
869 {
870 (*it).gEnd = (*it).gBegin + (genomePosReadLast + 1);
871 (*it).gBegin += genomePosReadFirst;
872 }
873 }
874
875 file << _sep_ << ((*it).gBegin + options.positionFormat) << _sep_ << (*it).gEnd << _sep_ << ::std::setprecision(5) << percId;
876 #ifdef RAZERS_MICRO_RNA
877 if(options.microRNA) file << _sep_ << (*it).mScore;
878 #endif
879
880 #ifdef RAZERS_MATEPAIRS
881 if ((*it).pairId != 0)
882 file << _sep_ << (*it).pairId << _sep_ << (*it).pairScore << _sep_ << (*it).mateDelta;
883 #endif
884 file << ::std::endl;
885
886 if (options.dumpAlignment)
887 dumpAlignment(file, align, viewPosReadFirst, viewPosReadLast + 1);
888 }
889 break;
890
891
892 case 1: // Enhanced Fasta Format
893 _sep_ = ',';
894 for(unsigned matchReadNo = -1, matchReadCount = 0; it != itEnd; ++it)
895 {
896 unsigned readLen = length(reads[(*it).rseqNo]);
897 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
898
899 if (matchReadNo != (*it).rseqNo)
900 {
901 matchReadNo = (*it).rseqNo;
902 matchReadCount = 0;
903 } else
904 ++matchReadCount;
905
906 ::std::string fastaID;
907 assign(fastaID, readIDs[(*it).rseqNo]);
908
909 ::std::string id = fastaID;
910 int fragId = (*it).rseqNo;
911 bool appendMatchId = options.maxHits > 1;
912
913 size_t left = fastaID.find_first_of('[');
914 size_t right = fastaID.find_last_of(']');
915 if (left != fastaID.npos && right != fastaID.npos && left < right)
916 {
917 fastaID.erase(right);
918 fastaID.erase(0, left + 1);
919 replace(fastaID.begin(), fastaID.end(), ',', ' ');
920 size_t pos = fastaID.find("id=");
921 if (pos != fastaID.npos) {
922 ::std::istringstream iss(fastaID.substr(pos + 3));
923 iss >> id;
924 // appendMatchId = false;
925 }
926 pos = fastaID.find("fragId=");
927 if (pos != fastaID.npos) {
928 ::std::istringstream iss(fastaID.substr(pos + 7));
929 iss >> fragId;
930 }
931 }
932
933 if ((*it).orientation == 'F')
934 // forward strand
935 file << '>' << ((*it).gBegin + options.positionFormat) << _sep_ << (*it).gEnd;
936 else
937 // reverse strand (switch begin and end)
938 file << '>' << (*it).gEnd << _sep_ << ((*it).gBegin + options.positionFormat);
939
940 unsigned ambig = 0;
941 for (unsigned i = 0; i <= (*it).editDist && i < length(stats); ++i)
942 ambig += stats[i][(*it).rseqNo];
943
944 file << "[id=" << id;
945 if (appendMatchId) file << "_" << matchReadCount;
946 file << ",fragId=" << fragId;
947 file << ",contigId=" << genomeIDs[(*it).gseqNo];
948 file << ",errors=" << (*it).editDist << ",percId=" << ::std::setprecision(5) << percId;
949 file << ",ambiguity=" << ambig << ']' << ::std::endl;
950
951 file << reads[(*it).rseqNo] << ::std::endl;
952 }
953 break;
954
955
956 case 2: // Eland Format
957 _sep_ = '\t';
958 for(unsigned readNo = 0; readNo < length(reads); ++readNo)
959 {
960 switch (options.readNaming)
961 {
962 // 0..filename is the read's Fasta id
963 case 0:
964 file << '>' << readIDs[readNo] << _sep_;
965 break;
966
967 // 1..filename is the read filename + seqNo
968 case 1:
969 file.fill('0');
970 file << readName << '#' << ::std::setw(pzeros) << readNo + 1 << _sep_;
971 break;
972 }
973
974 if (it == itEnd || readNo < (*it).rseqNo)
975 {
976 if (!empty(reads[readNo]))
977 file << reads[readNo] << _sep_ << "NM" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << ::std::endl;
978 else
979 {
980 for (unsigned i = 0; i < maxReadLength; ++i)
981 file << '.';
982 file << _sep_ << "QC" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << ::std::endl;
983 }
984 } else
985 {
986 file << reads[readNo] << _sep_;
987 unsigned bestMatches = 1;
988 if ((unsigned)(*it).editDist < length(stats))
989 bestMatches = stats[(*it).editDist][readNo];
990
991 if (bestMatches == 0) file << '?'; // impossible
992 if (bestMatches == 1) file << 'U'; // unique best match
993 if (bestMatches > 1) file << 'R'; // non-unique best matches
994
995 file << (*it).editDist << _sep_ << stats[0][readNo] << _sep_ << stats[1][readNo] << _sep_ << stats[2][readNo];
996
997 if (bestMatches == 1)
998 {
999 file << _sep_;
1000 switch (options.genomeNaming)
1001 {
1002 // 0..filename is the read's Fasta id
1003 case 0:
1004 file << genomeIDs[(*it).gseqNo];
1005 break;
1006
1007 // 1..filename is the read filename + seqNo
1008 case 1:
1009 file.fill('0');
1010 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1;
1011 }
1012
1013 if ((*it).orientation == 'F')
1014 file << _sep_ << ((*it).gBegin + options.positionFormat) << _sep_ << 'F' << _sep_ << "..";
1015 else
1016 file << _sep_ << (*it).gEnd << _sep_ << 'R' << _sep_ << "..";
1017
1018 if ((*it).editDist > 0 && options.dumpAlignment && options.hammingOnly)
1019 {
1020 gInf = infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd);
1021 if ((*it).orientation == 'R')
1022 reverseComplement(gInf);
1023 for (unsigned i = 0; i < length(gInf); ++i)
1024 if ((options.compMask[ordValue(reads[readNo][i])] &
1025 options.compMask[ordValue(gInf[i])]) == 0)
1026 file << _sep_ << i + 1 << gInf[i];
1027 }
1028 }
1029 file << ::std::endl;
1030 ++it;
1031 }
1032 }
1033 break;
1034 case 3: // Gff: printf "$chr $name_$format read $pos %ld . $dir . ID=$col[0]$unique$rest\n",$pos+$len-1;
1035 for (unsigned filecount = 0; filecount < length(genomeFileNameList); ++filecount)
1036 {
1037 // open genome file
1038 ::std::ifstream gFile;
1039 gFile.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary);
1040 if (!gFile.is_open())
1041 {
1042 std::cerr << "Couldn't open genome file." << std::endl;
1043 break;
1044 }
1045
1046 Dna5String currGenome;
1047
1048 // iterate over genome sequences
1049 for(; !_streamEOF(gFile); ++currSeqNo)
1050 {
1051 read(gFile, currGenome, Fasta()); // read Fasta sequence
1052 while(it != itEnd && (*it).gseqNo == currSeqNo)
1053 {
1054 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1055 if(options.maqMapping && options.noBelowIdentity && (*it).seedEditDist > maxSeedErrors)
1056 {
1057 ++it;
1058 continue;
1059 }
1060 #endif
1061
1062 unsigned currReadNo = (*it).rseqNo;
1063 int unique = 1;
1064 unsigned bestMatches = 0;
1065
1066 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1067 if(options.maqMapping)
1068 bestMatches = stats[0][currReadNo] >> 5;
1069 else
1070 #endif
1071 {
1072 if (bestMatches == 0 && (unsigned)(*it).editDist < length(stats))
1073 bestMatches = stats[(*it).editDist][currReadNo];
1074
1075 }
1076
1077 bool suboptimal = false;
1078 if (
1079 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1080 !options.maqMapping &&
1081 #endif
1082 (unsigned)(*it).editDist > 0 )
1083 {
1084 for(unsigned d = 0; d < (unsigned)(*it).editDist; ++d)
1085 if (stats[d][currReadNo]>0) suboptimal=true;
1086 }
1087
1088 if (bestMatches != 1)
1089 {
1090 unique = 0;
1091 if(options.purgeAmbiguous)
1092 {
1093 ++it;
1094 continue;
1095 }
1096
1097 // if((*it).mScore > 0) std::cout << (*it).mScore << "<-non uniq but score > 0\n";
1098 // ++it;
1099 // continue; // TODO: output non-unique matches (concerns maq mapping only)
1100 }
1101 unsigned readLen = length(reads[currReadNo]);
1102
1103 String<Dna5Q> readInf = infix(reads[currReadNo],0,readLen);
1104 switch (options.genomeNaming)
1105 {
1106 // 0..filename is the read's Fasta id
1107 case 0:
1108 file << genomeIDs[(*it).gseqNo] <<'\t';
1109 break;
1110 // 1..filename is the read filename + seqNo
1111 case 1:
1112 file.fill('0');
1113 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1 << '\t';
1114 break;
1115 }
1116
1117 // get alignment to dump or to fix end coordinates
1118 if (!options.hammingOnly)
1119 {
1120 assignSource(row(align, 0), readInf);
1121 //std::cout << "begin=" << (*it).gBegin <<" end="<< (*it).gEnd << std::endl;
1122 assignSource(row(align, 1), infix(currGenome, (*it).gBegin, (*it).gEnd));
1123 if ((*it).orientation == 'R')
1124 reverseComplement(source(row(align, 1)));
1125
1126 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
1127
1128 //std::cout << align << std::endl;
1129 //dumpAlignment(::std::cout, align);
1130
1131 // transform first and last read character to genomic positions
1132 viewPosReadFirst = toViewPosition(row(align, 0), 0);
1133 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
1134 TGPos genomePosReadFirst = toSourcePosition(row(align, 1), viewPosReadFirst);
1135 TGPos genomePosReadLast = toSourcePosition(row(align, 1), viewPosReadLast);
1136 if ((*it).orientation == 'R')
1137 {
1138 // watch out at chromosome borders
1139 (*it).gBegin = (*it).gEnd - _min((TGPos)(genomePosReadLast + 1),(*it).gEnd);
1140 (*it).gEnd -= genomePosReadFirst;
1141 }
1142 else
1143 {
1144 (*it).gEnd = (*it).gBegin + _min(genomePosReadLast + 1, static_cast<TGPos>(length(currGenome) - (*it).gBegin));
1145 (*it).gBegin += genomePosReadFirst;
1146 }
1147
1148 clear(row(align,1));
1149 clear(row(align,0));
1150 assignSource(row(align, 0), readInf);
1151 //std::cout << "begin=" << (*it).gBegin <<" end="<< (*it).gEnd << std::endl;
1152 assignSource(row(align, 1), infix(currGenome, (*it).gBegin, (*it).gEnd));
1153 if ((*it).orientation == 'R')
1154 reverseComplement(source(row(align, 1)));
1155
1156 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
1157
1158 //std::cout << align << std::endl;
1159 //dumpAlignment(::std::cout, align);
1160
1161 // transform first and last read character to genomic positions
1162 viewPosReadFirst = toViewPosition(row(align, 0), 0);
1163 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
1164
1165
1166 }
1167
1168 //file << options.runID << "_razers\tread";
1169 file << "razers\tread";
1170 file << '\t' << ((*it).gBegin + options.positionFormat) << '\t' << (*it).gEnd << '\t';
1171 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
1172 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1173 if(options.maqMapping)
1174 file << (*it).mScore << "\t";
1175 else
1176 #endif
1177 file << percId << "\t";
1178
1179 if ((*it).orientation == 'F')
1180 file << '+' << '\t' << '.' <<'\t';
1181 else
1182 file << '-' << '\t' << '.' <<'\t';
1183
1184 switch (options.readNaming)
1185 {
1186 // 0..filename is the read's Fasta id
1187 case 0:
1188 file << "ID=" <<readIDs[currReadNo];
1189 break;
1190
1191 // 1..filename is the read filename + seqNo
1192 case 1:
1193 file.fill('0');
1194 file << "ID=" << readName << '#' << ::std::setw(pzeros) << currReadNo + 1;
1195 break;
1196 }
1197 if(suboptimal) file << ";suboptimal";
1198 else
1199 {
1200 if(unique==1) file << ";unique";
1201 if(unique==0) file << ";multi";
1202 }
1203
1204
1205 if ((*it).editDist > 0)
1206 {
1207 if (options.hammingOnly)
1208 {
1209 gInf = infix(currGenome, (*it).gBegin, (*it).gEnd);
1210 if ((*it).orientation == 'R')
1211 reverseComplement(gInf);
1212 bool first = true;
1213 file << ";cigar=" << readLen << "M";
1214 file << ";mutations=";
1215 for (unsigned i = 0; i < length(gInf); ++i)
1216 if ((options.compMask[ordValue(readInf[i])] &
1217 options.compMask[ordValue(gInf[i])]) == 0)
1218 {
1219 if(first){ file << i + 1 << (Dna5)readInf[i]; first = false;}
1220 else file <<','<< i + 1 << (Dna5)readInf[i];
1221 }
1222
1223 }
1224 else
1225 {
1226 std::stringstream cigar, mutations;
1227 typedef typename Row<TAlign>::Type TRow;
1228 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
1229 TAlignIterator ali_it0 = iter(row(align,0),viewPosReadFirst);
1230 TAlignIterator ali_it1 = iter(row(align,1),viewPosReadFirst);
1231 TAlignIterator ali_it0stop = iter(row(align,0),viewPosReadLast + 1);
1232 TAlignIterator ali_it1stop = iter(row(align,1),viewPosReadLast + 1);
1233
1234 getCigarLine(align,cigar,mutations,ali_it0,ali_it0stop,ali_it1,ali_it1stop);
1235 file << ";cigar="<<cigar.str();
1236 if(length(mutations.str())>0)
1237 file << ";mutations=" << mutations.str();
1238 }
1239 }
1240
1241
1242 if(
1243 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1244 options.maqMapping ||
1245 #endif
1246 options.fastaIdQual)
1247 {
1248 // file << ";read=";
1249 // for(unsigned j=0;j<length(reads[currReadNo]);++j)
1250 // {
1251 // file << (Dna5)reads[currReadNo][j];
1252 // }
1253 file << ";quality=";
1254 for(unsigned j=0;j<readLen;++j)
1255 {
1256 file << (char)(getQualityValue(readInf[j])+ 33);
1257 }
1258 }
1259
1260 file << ::std::endl;
1261 if(options.dumpAlignment)
1262 {
1263 if((*it).editDist > 0 && !options.hammingOnly)
1264 file << align;
1265 else
1266 {
1267 clear(row(align,1));
1268 clear(row(align,0));
1269 assignSource(row(align, 0), readInf);
1270 assignSource(row(align, 1), infix(currGenome, (*it).gBegin, (*it).gEnd));
1271 if ((*it).orientation == 'R')
1272 reverseComplement(source(row(align, 1)));
1273 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
1274 file << align;
1275
1276 }
1277 }
1278 ++it;
1279 }
1280 }
1281 gFile.close();
1282 ++filecount;
1283 }
1284 break;
1285 case 33: // special Gff for split reads
1286 while(it != itEnd)// && (*it).gseqNo == currSeqNo)
1287 {
1288 unsigned currReadNo = (*it).rseqNo;
1289 Dna5String &currGenome = genomes[(*it).gseqNo];
1290 unsigned readLen = length(reads[currReadNo]);
1291 TMatch& mL = *it;
1292 ++it;
1293 TMatch& mR = *it;
1294 unsigned readLenL = mL.mScore;
1295 unsigned readLenR = mR.mScore;
1296 String<Dna5Q> readInfL = infix(reads[currReadNo],0,readLenL);
1297 String<Dna5Q> readInfR = infix(reads[currReadNo],length(reads[currReadNo])-readLenR,length(reads[currReadNo]));
1298 int offsetL = 0;
1299 int offsetR = readLen - readLenR;
1300 if(mL.orientation == 'R')
1301 {
1302 offsetR = 0;
1303 offsetL = readLen - readLenL;
1304 readInfL = infix(reads[currReadNo],length(reads[currReadNo])-readLenL,length(reads[currReadNo]));
1305 readInfR = infix(reads[currReadNo],0,readLenR);
1306 }
1307
1308 switch (options.genomeNaming)
1309 {
1310 // 0..filename is the read's Fasta id
1311 case 0:
1312 file << genomeIDs[(*it).gseqNo] <<'\t';
1313 break;
1314 // 1..filename is the read filename + seqNo
1315 case 1:
1316 file.fill('0');
1317 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1 << '\t';
1318 break;
1319 }
1320
1321 // get alignment to dump and get cigar string
1322 if (!options.hammingOnly)
1323 {
1324 assignSource(row(alignL, 0), readInfL);
1325 assignSource(row(alignL, 1), infix(currGenome, mL.gBegin, mL.gEnd));
1326 if (mL.orientation == 'R')
1327 reverseComplement(source(row(alignL, 1)));
1328
1329 globalAlignment(alignL, scoreType, AlignConfig<false,false,false,false>(), Gotoh());
1330
1331 assignSource(row(alignR, 0), readInfR);
1332 assignSource(row(alignR, 1), infix(currGenome, mR.gBegin, mR.gEnd));
1333 if (mR.orientation == 'R')
1334 reverseComplement(source(row(alignR, 1)));
1335
1336 globalAlignment(alignR, scoreType, AlignConfig<false,false,false,false>(), Gotoh());
1337 }
1338
1339 //file << options.runID << "_razers\tread";
1340 file << "razers\tread";
1341 file << '\t' << (mL.gBegin + options.positionFormat) << '\t' << mR.gEnd << '\t';
1342 double percId = 100.0 * (1.0 + (double)(mL.pairScore-mL.mScore-mR.mScore) / (double)readLen);
1343 file << percId << "\t";
1344
1345 if (mL.orientation == 'F')
1346 file << '+' << '\t' << '.' <<'\t';
1347 else
1348 file << '-' << '\t' << '.' <<'\t';
1349
1350 switch (options.readNaming)
1351 {
1352 // 0..filename is the read's Fasta id
1353 case 0:
1354 file << "ID=" <<readIDs[currReadNo];
1355 break;
1356 // 1..filename is the read filename + seqNo
1357 case 1:
1358 file.fill('0');
1359 file << "ID=" << readName << '#' << ::std::setw(pzeros) << currReadNo + 1;
1360 break;
1361 }
1362 if(ambiStates[jj] == 0) file << ";unique";
1363 if(ambiStates[jj] == 1) file << ";multi";
1364 if(ambiStates[jj] == 2) file << ";suboptimal";
1365 ++jj;
1366
1367 SEQAN_ASSERT_TRUE(mL.pairId == mR.pairId);
1368
1369 int indelLen = mR.mScore + mL.mScore - readLen;
1370 if(mL.gEnd != mR.gBegin) indelLen = mR.gBegin - mL.gEnd;
1371
1372 file << ";pairScore=" << (unsigned int) mR.pairScore;
1373
1374 // if(mR.traceExtension >= abs(indelLen))
1375 // file << ";traceExt=" << (unsigned int) mR.traceExtension;
1376
1377 String<Pair<Dna5,int> > mutationsStr, mutStrMid, mutStrL, mutStrR;
1378 String<Pair<char,int> > cigarStr, cigarMid, cigarL, cigarR;
1379 if(indelLen < 0)
1380 {
1381 int offset = readLenL;
1382 if(mL.orientation == 'R')
1383 offset = readLenR;
1384 String<Dna5Q> readInfMid = infix(reads[currReadNo],offset,offset - indelLen);
1385 for (int i = 0; i < -indelLen; ++i)
1386 appendValue(mutStrMid, Pair<Dna5,int>(readInfMid[i],i+offset+1));
1387 appendValue(cigarMid, Pair<char,int>('I',-indelLen));
1388
1389 }
1390 if(indelLen > 0) appendValue(cigarMid, Pair<char,int>('D',indelLen));
1391 if(indelLen != 0)
1392 file << ";split";
1393
1394 if (options.hammingOnly)
1395 {
1396 if(indelLen == 0)
1397 appendValue(cigarL, Pair<char,int>('M',readLen));
1398 else
1399 {
1400 appendValue(cigarL, Pair<char,int>('M',readLenL));
1401 appendValue(cigarR, Pair<char,int>('M',readLenR));
1402 }
1403
1404 //make left part of mutation string
1405 if (mL.editDist > 0)
1406 {
1407 gInfL = infix(currGenome, mL.gBegin, mL.gEnd);
1408 if (mL.orientation == 'R')
1409 reverseComplement(gInfL);
1410 for (unsigned i = 0; i < length(gInfL); ++i)
1411 if ((options.compMask[ordValue(readInfL[i])] & options.compMask[ordValue(gInfL[i])]) == 0)
1412 appendValue(mutStrL, Pair<Dna5,int>(readInfL[i],i+offsetL+1));
1413 }
1414
1415 //make right part of mutation string
1416 if (mR.editDist > 0)
1417 {
1418 gInfR = infix(currGenome, mR.gBegin, mR.gEnd);
1419 if (mR.orientation == 'R')
1420 reverseComplement(gInfR);
1421 for (unsigned i = 0; i < length(gInfR); ++i)
1422 if ((options.compMask[ordValue(readInfR[i])] & options.compMask[ordValue(gInfR[i])]) == 0)
1423 appendValue(mutStrR, Pair<Dna5,int>(readInfR[i],i+offsetR+1));
1424 }
1425 }
1426 else
1427 {
1428 typedef typename Row<TAlign>::Type TRow;
1429 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
1430
1431 //left business
1432 TAlignIterator aliL_it0 = begin(row(alignL,0));
1433 TAlignIterator aliL_it1 = begin(row(alignL,1));
1434 TAlignIterator aliL_it0stop = end(row(alignL,0));
1435 TAlignIterator aliL_it1stop = end(row(alignL,1));
1436 getCigarLine(alignL,cigarL,mutStrL,offsetL,aliL_it0,aliL_it0stop,aliL_it1,aliL_it1stop);
1437
1438 //right business
1439 TAlignIterator aliR_it0 = begin(row(alignR,0));
1440 TAlignIterator aliR_it1 = begin(row(alignR,1));
1441 TAlignIterator aliR_it0stop = end(row(alignR,0));
1442 TAlignIterator aliR_it1stop = end(row(alignR,1));
1443 getCigarLine(alignR,cigarR,mutStrR,offsetR,aliR_it0,aliR_it0stop,aliR_it1,aliR_it1stop);
1444 }
1445 //now plug together the parts: cigar
1446 if(mL.orientation=='F')
1447 {
1448 append(cigarStr,cigarL);
1449 append(cigarStr,cigarMid);
1450 append(cigarStr,cigarR);
1451 append(mutationsStr,mutStrL);
1452 append(mutationsStr,mutStrMid);
1453 append(mutationsStr,mutStrR);
1454 }
1455 else
1456 {
1457 append(cigarStr,cigarR);
1458 append(cigarStr,cigarMid);
1459 append(cigarStr,cigarL);
1460 append(mutationsStr,mutStrR);
1461 append(mutationsStr,mutStrMid);
1462 append(mutationsStr,mutStrL);
1463 }
1464 bool first = true;
1465 file << ";cigar=";// << cigarStr.str();
1466 for (unsigned i = 0; i < length(cigarStr); ++i)
1467 file << cigarStr[i].i2 << cigarStr[i].i1;
1468
1469 if(length(mutationsStr)>0)file << ";mutations=";
1470 for (unsigned i = 0; i < length(mutationsStr); ++i)
1471 {
1472 if(first)
1473 {
1474 first = false;
1475 file << mutationsStr[i].i2 << mutationsStr[i].i1;
1476 }
1477 else
1478 file << "," << mutationsStr[i].i2 << mutationsStr[i].i1;
1479 }
1480
1481 if(options.fastaIdQual)
1482 {
1483 file << ";quality=";
1484 for(unsigned j=0;j<length(reads[currReadNo]);++j)
1485 {
1486 file << (char)(getQualityValue(reads[currReadNo][j])+ 33);
1487 }
1488 }
1489 file << ::std::endl;
1490 ++it;
1491 }
1492 break;
1493
1494
1495 }
1496
1497 file.close();
1498
1499 // get empirical error distribution
1500 if (!empty(errorPrbFileName) && maxReadLength > 0)
1501 {
1502 file.open(toCString(errorPrbFileName), ::std::ios_base::out | ::std::ios_base::trunc);
1503 if (file.is_open())
1504 {
1505 String<long double> posError;
1506 unsigned unique = 0;
1507 unsigned insertions = 0;
1508 unsigned deletions = 0;
1509 resize(posError, maxReadLength, 0);
1510
1511 if (options.hammingOnly)
1512 unique = getErrorDistribution(posError, matches, reads, genomes, options);
1513 else
1514 {
1515 unique = getErrorDistribution(posError, insertions, deletions, matches, reads, genomes, options);
1516 ::std::cerr << "insertProb: " << (double)insertions / ((double)length(posError) * (double)unique) << ::std::endl;
1517 ::std::cerr << "deleteProb: " << (double)deletions / ((double)length(posError) * (double)unique) << ::std::endl;
1518 }
1519
1520 file << (double)posError[0] / (double)unique;
1521 for (unsigned i = 1; i < length(posError); ++i)
1522 file << '\t' << (double)posError[i] / (double)unique;
1523 file << ::std::endl;
1524 file.close();
1525 } else
1526 ::std::cerr << "Failed to open error distribution file" << ::std::endl;
1527 }
1528
1529 options.timeDumpResults = SEQAN_PROTIMEDIFF(dump_time);
1530
1531 if (options._debugLevel >= 1)
1532 ::std::cerr << "Dumping results took \t" << options.timeDumpResults << " seconds" << ::std::endl;
1533 }
1534
1535
1536 }
1537
1538 #endif
1539
1540