1 /*==========================================================================
2 RazerS - Fast Read Mapping with Controlled Loss Rate
3 http://www.seqan.de/projects/razers.html
4
5 ============================================================================
6 Copyright (C) 2008 by David Weese
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your options) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ==========================================================================*/
21
22 #ifndef SEQAN_HEADER_OUTPUT_FORMAT_H
23 #define SEQAN_HEADER_OUTPUT_FORMAT_H
24
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28
29 #include "razers.h"
30 #include <seqan/align.h>
31
32 namespace seqan
33 {
34
35 //////////////////////////////////////////////////////////////////////////////
36 // Quality-based score
37
38 template <typename TQualityString = CharString>
39 struct Quality;
40
41 template <typename TValue, typename TQualityString>
42 class Score<TValue, Quality<TQualityString> >
43 {
44 public:
45 TValue data_match;
46 TValue data_mismatch;
47 TValue data_gap_extend;
48 TValue data_gap_open;
49
50 TQualityString const *data_qual;
51
52 public:
Score()53 Score():
54 data_match(0),
55 data_mismatch(-1),
56 data_gap_extend(-1),
57 data_gap_open(-1),
58 data_qual(NULL)
59 {
60 }
Score(TValue _match,TValue _mismatch,TValue _gap)61 Score(TValue _match, TValue _mismatch, TValue _gap):
62 data_match(_match),
63 data_mismatch(_mismatch),
64 data_gap_extend(_gap),
65 data_gap_open(_gap),
66 data_qual(NULL)
67 {
68 }
Score(TValue _match,TValue _mismatch,TValue _gap_extend,TValue _gap_open,TQualityString const & _qual)69 Score(TValue _match, TValue _mismatch, TValue _gap_extend, TValue _gap_open, TQualityString const &_qual):
70 data_match(_match),
71 data_mismatch(_mismatch),
72 data_gap_extend(_gap_extend),
73 data_gap_open(_gap_open),
74 data_qual(&_qual)
75 {
76 }
77
Score(Score const & other)78 Score(Score const & other):
79 data_match(other.data_match),
80 data_mismatch(other.data_mismatch),
81 data_gap_extend(other.data_gap_extend),
82 data_gap_open(other.data_gap_open),
83 data_qual(other.data_qual)
84 {
85 }
~Score()86 ~Score()
87 {
88 }
89
90 Score & operator = (Score const & other)
91 {
92 data_match = other.data_match;
93 data_mismatch = other.data_mismatch;
94 data_gap_extend = other.data_gap_extend;
95 data_gap_open = other.data_gap_open;
96 data_qual = other.data_qual;
97 return *this;
98 }
99 };
100
101 //////////////////////////////////////////////////////////////////////////////
102
103 template <typename TValue, typename TQualityString, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
104 inline TValue
score(Score<TValue,Quality<TQualityString>> const & me,TPos1 pos1,TPos2 pos2,TSeq1 const & seq1,TSeq2 const & seq2)105 score(Score<TValue, Quality<TQualityString> > const & me,
106 TPos1 pos1,
107 TPos2 pos2,
108 TSeq1 const &seq1,
109 TSeq2 const &seq2)
110 {
111 if (seq1[pos1] != seq2[pos2])
112 if (me.data_qual)
113 return (*me.data_qual)[pos2];
114 else
115 return scoreMismatch(me);
116 else
117 return scoreMatch(me);
118 }
119
120
121 //////////////////////////////////////////////////////////////////////////////
122 // Less-operators ...
123
124 // ... to sort matches and remove duplicates with equal gBegin
125 template <typename TReadMatch>
126 struct LessGPosRNo : public ::std::binary_function < TReadMatch, TReadMatch, bool >
127 {
operatorLessGPosRNo128 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
129 {
130 // genome position and orientation
131 if (a.gseqNo < b.gseqNo) return true;
132 if (a.gseqNo > b.gseqNo) return false;
133 if (a.gBegin < b.gBegin) return true;
134 if (a.gBegin > b.gBegin) return false;
135 if (a.orientation < b.orientation) return true;
136 if (a.orientation > b.orientation) return false;
137
138 // read number
139 if (a.rseqNo < b.rseqNo) return true;
140 if (a.rseqNo > b.rseqNo) return false;
141
142 // pair match id
143 if (a.pairId < b.pairId) return true;
144 if (a.pairId > b.pairId) return false;
145
146 // quality
147 return a.editDist < b.editDist;
148 }
149 };
150
151 //////////////////////////////////////////////////////////////////////////////
152 // Determine error distribution
153 template <typename TErrDistr, typename TMatches, typename TReads, typename TGenomes, typename TOptions>
154 inline unsigned
getErrorDistribution(TErrDistr & posError,TMatches & matches,TReads & reads,TGenomes & genomes,TOptions & options)155 getErrorDistribution(
156 TErrDistr &posError,
157 TMatches &matches,
158 TReads &reads,
159 TGenomes &genomes,
160 TOptions &options)
161 {
162 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
163 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
164
165 Dna5String genome;
166 unsigned unique = 0;
167 for (; it != itEnd; ++it)
168 {
169 if ((*it).orientation == '-') continue;
170
171 Dna5String const &read = reads[(*it).rseqNo];
172 genome = infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd);
173 if ((*it).orientation == 'R')
174 reverseComplement(genome);
175 for (unsigned i = 0; i < length(posError) && i < length(read); ++i)
176 if ((options.compMask[ordValue(genome[i])] & options.compMask[ordValue(read[i])]) == 0)
177 ++posError[i];
178 ++unique;
179 }
180 return unique;
181 }
182
183 template <typename TErrDistr, typename TCount1, typename TCount2, typename TMatches, typename TReads, typename TGenomes, typename TSpec>
184 inline unsigned
getErrorDistribution(TErrDistr & posError,TCount1 & insertions,TCount2 & deletions,TMatches & matches,TReads & reads,TGenomes & genomes,RazerSOptions<TSpec> & options)185 getErrorDistribution(
186 TErrDistr &posError,
187 TCount1 &insertions,
188 TCount2 &deletions,
189 TMatches &matches,
190 TReads &reads,
191 TGenomes &genomes,
192 RazerSOptions<TSpec> &options)
193 {
194 typedef Align<String<Dna5>, ArrayGaps> TAlign;
195 typedef typename Row<TAlign>::Type TRow;
196 typedef typename Iterator<TRow>::Type TIter;
197
198 //typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
199 typedef typename Position<TAlign>::Type TPosition;
200
201 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
202 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
203
204 Align<Dna5String, ArrayGaps> align;
205 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
206 if (options.hammingOnly)
207 scoreType.data_mismatch = -1;
208 resize(rows(align), 2);
209
210 unsigned unique = 0;
211 for (; it != itEnd; ++it)
212 {
213 if ((*it).orientation == '-') continue;
214
215 assignSource(row(align, 0), reads[(*it).rseqNo]);
216 assignSource(row(align, 1), infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd));
217 if ((*it).orientation == 'R')
218 reverseComplement(source(row(align, 1)));
219 globalAlignment(align, scoreType);
220
221 TRow& row0 = row(align, 0);
222 TRow& row1 = row(align, 1);
223
224 TPosition begin = beginPosition(cols(align));
225 TPosition end = endPosition(cols(align));
226
227 TIter it0 = iter(row0, begin);
228 TIter it1 = iter(row1, begin);
229 TIter end0 = iter(row0, end);
230
231 unsigned pos = 0;
232 for (; it0 != end0 && pos < length(posError); ++it0, ++it1)
233 {
234 if (isGap(it0))
235 ++insertions;
236 else
237 {
238 if (isGap(it1))
239 ++deletions;
240 else
241 if ((options.compMask[ordValue(getValue(it0))] & options.compMask[ordValue(getValue(it1))]) == 0)
242 ++posError[pos];
243 ++pos;
244 }
245 }
246 ++unique;
247 }
248 return unique;
249 }
250
251
252 //////////////////////////////////////////////////////////////////////////////
253 template <typename TFile, typename TSource, typename TSpec, typename TPosition>
254 inline void
dumpAlignment(TFile & target,Align<TSource,TSpec> const & source,TPosition begin_,TPosition end_)255 dumpAlignment(TFile & target, Align<TSource, TSpec> const & source, TPosition begin_, TPosition end_)
256 {
257 typedef Align<TSource, TSpec> const TAlign;
258 typedef typename Row<TAlign>::Type TRow;
259 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
260
261 TRowsPosition row_count = length(rows(source));
262
263 // Print sequences
264 for (TRowsPosition i = 0; i < row_count; ++i)
265 {
266 if (i == 0)
267 target << "#Read: ";
268 else
269 target << "#Genome: ";
270 TRow& row_ = row(source, i);
271 typedef typename Iterator<typename Row<TAlign>::Type const>::Type TIter;
272 TIter begin1_ = iter(row_, begin_);
273 TIter end1_ = iter(row_, end_);
274 for (; begin1_ != end1_; ++begin1_) {
275 if (isGap(begin1_))
276 target << gapValue<char>();
277 else
278 target << *begin1_;
279 }
280 target << '\n';
281 }
282 }
283
284 template<typename TMatches, typename TCounts, typename TOptions>
285 void
countCoocurrences(TMatches & matches,TCounts & cooc,TOptions & options)286 countCoocurrences(TMatches & matches, TCounts & cooc, TOptions & options)
287 {
288 clear(cooc);
289 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength) + 1;
290 resize(cooc,maxSeedErrors+1,0);
291 for (int i = 0; i < maxSeedErrors+1; ++i)
292 cooc[i] = 1;
293
294 int count = 0;
295 unsigned readNo = -1;
296 int preEditDist = -1;
297 typename Iterator<TMatches>::Type it = begin(matches,Standard());
298 typename Iterator<TMatches>::Type itEnd = end(matches,Standard());
299
300 for(; it != itEnd; ++it)
301 {
302 if ((*it).rseqNo == readNo)
303 {
304 if(preEditDist > 1) continue;// || dist > options.errorRate * maxReadLength + 1) continue;
305 int dist = (*it).seedEditDist - preEditDist;
306 if(dist > maxSeedErrors) continue;
307 if(dist < 0) ++cooc[0];
308 else ++cooc[dist];
309 }
310 else
311 {
312 readNo = (*it).rseqNo;
313 preEditDist = (*it).seedEditDist;
314 if(preEditDist <= 1) ++count;
315 }
316 }
317 for (unsigned i = 0; i < length(cooc); ++i)
318 {
319 cooc[i] = (int)(-4.343 * log((double)cooc[i]/count) );
320 if (cooc[i] < 0) cooc[i] = 0;
321 }
322 if(options._debugLevel > 1)
323 {
324 ::std::cerr << "[mapping_count] ";
325 for(unsigned j = 0; j < length(cooc); ++j)
326 ::std::cerr << cooc[j] << " ";
327 ::std::cerr << ::std::endl;
328 }
329
330 }
331
332
333 template<typename TAlign, typename TString, typename TIter>
334 void
getCigarLine(TAlign & align,TString & cigar,TString & mutations,TIter ali_it0,TIter ali_it0_stop,TIter ali_it1,TIter ali_it1_stop)335 getCigarLine(TAlign & align, TString & cigar, TString & mutations, TIter ali_it0, TIter ali_it0_stop, TIter ali_it1, TIter ali_it1_stop)
336 {
337
338 typedef typename Source<TAlign>::Type TSource;
339 typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
340
341 // typedef typename Row<TAlign>::Type TRow;
342 // typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
343
344 TStringIterator readBase = begin(source(row(align,0)));
345
346 int readPos = 0;
347 bool first = true;
348 int inserted = 0;
349 int deleted = 0;
350 while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
351 {
352 inserted = 0;
353 deleted = 0;
354 int matched = 0;
355 while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
356 {
357 ++readPos;
358 if(*ali_it1 != *ali_it0)
359 {
360 if(first) first = false;
361 else mutations << ",";
362 mutations << readPos <<*readBase;
363 }
364 ++readBase;
365 ++ali_it0;
366 ++ali_it1;
367 ++matched;
368 }
369 if(matched>0) cigar << matched<< "M" ;
370 while(ali_it0!=ali_it0_stop && isGap(ali_it0))
371 {
372 ++ali_it0;
373 ++ali_it1;
374 ++deleted;
375 }
376 if(deleted>0) cigar << deleted << "D";
377 while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
378 {
379 ++ali_it0;
380 ++ali_it1;
381 ++readPos;
382 if(first) first = false;
383 else mutations << ",";
384 mutations << readPos << *readBase;
385 ++readBase;
386 ++inserted;
387 }
388 if(inserted>0) cigar << inserted << "I";
389 }
390 // end gaps can happen in split mapping
391 while(ali_it0!=ali_it0_stop)
392 {
393 ++ali_it0;
394 ++deleted;
395 }
396 if(deleted>0) cigar << deleted << "D";
397 while(ali_it1 != ali_it1_stop)
398 {
399 ++ali_it1;
400 ++readPos;
401 if(first) first = false;
402 else mutations << ",";
403 mutations << readPos << *readBase;
404 ++inserted;
405 }
406 if(inserted>0) cigar << inserted << "I";
407
408 }
409
410
411 #ifdef RAZERS_DIRECT_MAQ_MAPPING
412 //////////////////////////////////////////////////////////////////////////////
413 // Assign mapping quality and remove suboptimal matches
414 template < typename TMatches, typename TReads, typename TCooc, typename TCounts, typename TSpec >
assignMappingQuality(TMatches & matches,TReads & reads,TCooc & cooc,TCounts & cnts,RazerSOptions<TSpec> & options)415 void assignMappingQuality(TMatches &matches, TReads & reads, TCooc & cooc, TCounts &cnts, RazerSOptions<TSpec> & options)
416 {
417 typedef typename Value<TMatches>::Type TMatch;
418 typedef typename Iterator<TMatches, Standard>::Type TIterator;
419
420 //matches are already sorted
421 //::std::sort(
422 // begin(matches, Standard()),
423 // end(matches, Standard()),
424 // LessRNoMQ<TMatch>());
425
426
427 int maxSeedErrors = (int)(options.errorRate*options.artSeedLength)+1;
428 unsigned readNo = -1;
429
430 TIterator it = begin(matches, Standard());
431 TIterator itEnd = end(matches, Standard());
432 TIterator dit = it;
433
434 int bestQualSum, secondBestQualSum;
435 int secondBestDist = -1 ,secondBestMatches = -1;
436 for (; it != itEnd; ++it)
437 {
438 if ((*it).orientation == '-') continue;
439 bool mappingQualityFound = false;
440 int mappingQuality = 0;
441 int qualTerm1,qualTerm2;
442
443 readNo = (*it).rseqNo;
444 bestQualSum = (*it).mScore;
445
446 if(++it!=itEnd && (*it).rseqNo==readNo)
447 {
448 secondBestQualSum = (*it).mScore;
449 secondBestDist = (*it).editDist;
450 secondBestDist = (*it).editDist;
451 secondBestMatches = cnts[1][readNo] >> 5;
452 //CHECKcnts secondBestMatches = cnts[secondBestDist][readNo];
453 // secondBestMatches = cnts[secondBestDist][readNo];
454 (*it).orientation = '-';
455 // if(secondBestDist<=bestDist) unique=0;
456 }
457 else secondBestQualSum = -1000;
458 --it; //it is now at best match of current rseqNo
459
460 int bestDist = (*it).editDist;
461 int kPrime = (*it).seedEditDist;
462 if((bestQualSum==secondBestQualSum) || (kPrime>maxSeedErrors))
463 mappingQualityFound = true; //mq=0
464 else{
465 if(secondBestQualSum == -1000) qualTerm1 = 99;
466 else
467 {
468 qualTerm1 = (int)(secondBestQualSum - bestQualSum - 4.343 * log((double)secondBestMatches));
469 //if (secondBestKPrime - kPrime <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual; //TODO abchecken was mehr sinn macht
470 if (secondBestDist - bestDist <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual;
471 }
472 float avgSeedQual = 0.0;
473 if(!mappingQualityFound)
474 {
475 //TODO!!! generalize and adapt to razers lossrates
476 // lossrate 0.42 => -10 log10(0.42) = 4
477 int kPrimeLoss = 4; // options.kPrimeLoss; // bezieht sich auf 3 fehler in 28bp
478 qualTerm2 = kPrimeLoss + cooc[maxSeedErrors-kPrime];
479
480 for(unsigned j = 0; j<options.artSeedLength; ++j)
481 {
482 int q = getQualityValue(reads[readNo][j]);//(int)((unsigned char)(reads[readNo][j])>>3);
483 if(q>options.mutationRateQual) q = options.mutationRateQual;
484 avgSeedQual+=q;
485 }
486 avgSeedQual/=options.artSeedLength;
487 //-10 log10(28-2) = 14;
488 //generalize to -10 log10(artSeedLength - maxSeedErrors +1 ) // 14 fits for seedlength 28 to 32 with 2 errors
489 if(avgSeedQual>14) qualTerm2 += (int)((maxSeedErrors-kPrime)*(avgSeedQual-14));
490 }
491 }
492 if (!mappingQualityFound) mappingQuality = (qualTerm1<qualTerm2) ? qualTerm1:qualTerm2;
493 if (mappingQuality < 0) mappingQuality = 0;
494 (*it).mScore = mappingQuality;
495
496 *dit = *it;
497 // if(secondBestQualSum != -1000) ++it;
498 ++dit;
499 }
500 resize(matches, dit - begin(matches, Standard()));
501 }
502 #endif
503
504
505 //////////////////////////////////////////////////////////////////////////////
506 // Output matches
507 template <
508 typename TMatches,
509 typename TGenomeNames,
510 typename TReads,
511 typename TReadNames,
512 typename TCounts,
513 typename TSpec
514 >
dumpMatches(TMatches & matches,TGenomeNames const & genomeIDs,StringSet<CharString> & genomeFileNameList,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReads const & reads,TCounts & stats,TReadNames const & readIDs,CharString readFName,CharString errorPrbFileName,RazerSOptions<TSpec> & options)515 void dumpMatches(
516 TMatches &matches, // forward/reverse matches
517 TGenomeNames const &genomeIDs, // Genome names (read from Fasta file, currently unused)
518 StringSet<CharString> &genomeFileNameList, // list of genome names (e.g. {"hs_ref_chr1.fa","hs_ref_chr2.fa"})
519 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > &gnoToFileMap, //map to retrieve genome filename and sequence number within that file
520 TReads const &reads,
521 TCounts & stats, // Match statistics (possibly empty)
522 TReadNames const &readIDs, // Read names (read from Fasta file, currently unused)
523 CharString readFName, // read name (e.g. "reads.fa"), used for file/read naming
524 CharString errorPrbFileName,
525 RazerSOptions<TSpec> &options)
526 {
527 typedef typename Value<TMatches>::Type TMatch;
528 //typedef typename Value<TReads>::Type TRead;
529 //typedef typename Value<TGenomeSet>::Type TGenome;
530 //typedef typename TMatch::TGPos TGPos;
531
532 if (options.outputFormat == 2)
533 {
534 options.sortOrder = 0; // ... to count multiple matches
535 }
536
537 if (options.outputFormat == 2)
538 {
539 options.maxHits = 1; // Eland outputs at most one match
540 options.sortOrder = 0; // read numbers are increasing
541 options.positionFormat = 1; // bases in file are numbered starting at 1
542 options.dumpAlignment = options.hammingOnly;
543 }
544 #ifdef RAZERS_DIRECT_MAQ_MAPPING
545 if (options.maqMapping) options.outputFormat = 3;
546 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength); //without + 1 here, used to check whether match should be supported if noBelowIdentity option is switched on
547 #endif
548 if (options.outputFormat == 3)
549 {
550 options.sortOrder = 1; // sort according to gPos
551 options.positionFormat = 1; // bases in file are numbered starting at 1
552 options.dumpAlignment = false;
553 }
554
555 #ifdef RAZERS_SPLICED
556 if(options.minMatchLen > 0)
557 options.sortOrder = 1;
558 #endif
559
560 // error profile
561 unsigned maxReadLength = 0;
562 for (unsigned i = 0; i < length(reads); ++i)
563 if (maxReadLength < length(reads[i]))
564 maxReadLength = length(reads[i]);
565
566 SEQAN_PROTIMESTART(dump_time);
567
568 // load Genome sequences for alignment dumps
569 TGenomeSet genomes;
570 if ((options.outputFormat == 0 && !options.hammingOnly) || options.dumpAlignment || !empty(errorPrbFileName))
571 if (!loadGenomes(genomes, genomeFileNameList)) {
572 ::std::cerr << "Failed to load genomes" << ::std::endl;
573 options.dumpAlignment = false;
574 }
575
576 // how many 0's should be padded?
577 int pzeros = 0;
578 for (unsigned l = length(reads); l > 9; l = l / 10)
579 ++pzeros;
580
581 int gzeros = 0;
582 for (unsigned l = length(genomes); l > 9; l = l / 10)
583 ++gzeros;
584
585 // remove the directory prefix of readFName
586 ::std::string _readName;
587 assign(_readName, readFName);
588 size_t lastPos = _readName.find_last_of('/') + 1;
589 if (lastPos == _readName.npos) lastPos = _readName.find_last_of('\\') + 1;
590 if (lastPos == _readName.npos) lastPos = 0;
591 CharString readName = _readName.substr(lastPos);
592
593
594 typedef Align<String<Dna5>, ArrayGaps> TAlign;
595 TAlign align;
596 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
597 // Score<int> scoreType(0,-1,-1,-1);
598
599 if (options.hammingOnly)
600 scoreType.data_mismatch = -1;
601 resize(rows(align), 2);
602
603 VirtualStream<char, Output> file;
604 bool success;
605 if (!isEqual(options.output, "-"))
606 success = open(file, toCString(options.output));
607 else
608 success = open(file, std::cout, Nothing());
609
610 if (!success)
611 {
612 ::std::cerr << "Failed to open output file" << ::std::endl;
613 return;
614 }
615
616 #ifdef RAZERS_SPLICED
617 //maskPairDuplicates(matches);
618 #else
619 maskDuplicates(matches, options);
620 #endif
621 if (options.outputFormat > 0
622 #ifdef RAZERS_DIRECT_MAQ_MAPPING
623 && !options.maqMapping
624 #endif
625 )
626 {
627 // match statistics
628 unsigned maxErrors = (int)(options.errorRate * maxReadLength);
629 //if (maxErrors > 10) maxErrors = 10;
630 if (maxErrors < 2 && options.outputFormat == 2)
631 maxErrors = 2;
632 resize(stats, maxErrors + 1);
633 for (unsigned i = 0; i <= maxErrors; ++i)
634 resize(stats[i], length(reads), 0);
635 #ifdef RAZERS_SPLICED
636 if(options.minMatchLen > 0) countSplitMatches(matches, stats);
637 else
638 #endif
639 countMatches(matches, stats);
640 }
641
642
643
644 Nothing nothing;
645 unsigned currSeqNo = 0;
646 #ifdef RAZERS_DIRECT_MAQ_MAPPING
647 if(options.maqMapping)
648 {
649 String<int> cooc;
650 compactMatches(matches, stats, options, true, nothing, false); //only best two matches per read are kept
651 countCoocurrences(matches,cooc,options); //coocurrence statistics are filled
652 assignMappingQuality(matches,reads,cooc,stats,options);//mapping qualities are assigned and only one match per read is kept
653 }
654 else
655 #endif
656
657 #ifdef RAZERS_MICRO_RNA
658 if(options.microRNA)purgeAmbiguousRnaMatches(matches,options);
659 else
660 #endif
661 {
662 #ifdef RAZERS_SPLICED
663 if(options.minMatchLen>0)
664 compactSplicedMatches(matches, stats, options, true, nothing,nothing);
665 else
666 #endif
667 compactMatches(matches, stats, options, true, nothing);
668 }
669
670 switch (options.sortOrder) {
671 case 0:
672 ::std::sort(
673 begin(matches, Standard()),
674 end(matches, Standard()),
675 LessRNoGPos<TMatch>());
676 break;
677
678 case 1:
679 ::std::sort(
680 begin(matches, Standard()),
681 end(matches, Standard()),
682 LessGPosRNo<TMatch>());
683 break;
684
685 }
686
687 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
688 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
689
690
691 Dna5String gInf;
692 char _sep_ = '\t';
693 unsigned viewPosReadFirst = 0;
694 unsigned viewPosReadLast = 0;
695
696 switch (options.outputFormat)
697 {
698 case 0: // Razer Format
699 // _sep_ = ',';
700 for(; it != itEnd; ++it)
701 {
702 unsigned readLen = length(reads[(*it).rseqNo]);
703 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
704 #ifdef RAZERS_MICRO_RNA
705 percId = 100.0 * (1.0 - (double)(*it).editDist / (double) ((*it).mScore));
706 #endif
707 switch (options.readNaming)
708 {
709 // 0..filename is the read's Fasta id
710 case 0:
711 file << readIDs[(*it).rseqNo];
712 break;
713
714 // 1..filename is the read filename + seqNo
715 case 1:
716 file.fill('0');
717 file << readName << '#' << ::std::setw(pzeros) << (*it).rseqNo + 1;
718 break;
719
720 // 2..filename is the read sequence itself
721 case 2:
722 file << reads[(*it).rseqNo];
723 }
724
725 file << _sep_ << options.positionFormat << _sep_ << readLen << _sep_ << (*it).orientation << _sep_;
726
727 switch (options.genomeNaming)
728 {
729 // 0..filename is the read's Fasta id
730 case 0:
731 file << genomeIDs[(*it).gseqNo];
732 break;
733
734 // 1..filename is the read filename + seqNo
735 case 1:
736 file.fill('0');
737 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1;
738 }
739
740 // get alignment to dump or to fix end coordinates
741 if (options.dumpAlignment || !options.hammingOnly)
742 {
743 #ifdef RAZERS_MICRO_RNA
744 if(options.microRNA)
745 assignSource(row(align, 0), prefix(reads[(*it).rseqNo],(*it).mScore));
746 else
747 #endif
748 assignSource(row(align, 0), reads[(*it).rseqNo]);
749 assignSource(row(align, 1), infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd));
750 #ifdef RAZERS_MATEPAIRS
751 if ((*it).pairId != 0 && ((*it).rseqNo & 1))
752 reverseComplement(source(row(align, 0)));
753 #endif
754 if ((*it).orientation == 'R')
755 reverseComplement(source(row(align, 1)));
756
757 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
758
759 // transform first and last read character to genomic positions
760 viewPosReadFirst = toViewPosition(row(align, 0), 0);
761 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
762 unsigned genomePosReadFirst = toSourcePosition(row(align, 1), viewPosReadFirst);
763 unsigned genomePosReadLast = toSourcePosition(row(align, 1), viewPosReadLast);
764
765 if ((*it).orientation == 'R')
766 {
767 (*it).gBegin = (*it).gEnd - (genomePosReadLast + 1);
768 (*it).gEnd -= genomePosReadFirst;
769 }
770 else
771 {
772 (*it).gEnd = (*it).gBegin + (genomePosReadLast + 1);
773 (*it).gBegin += genomePosReadFirst;
774 }
775 }
776
777 file << _sep_ << ((*it).gBegin + options.positionFormat) << _sep_ << (*it).gEnd << _sep_ << ::std::setprecision(5) << percId;
778 #ifdef RAZERS_MICRO_RNA
779 if(options.microRNA) file << _sep_ << (*it).mScore;
780 #endif
781
782 #ifdef RAZERS_MATEPAIRS
783 if ((*it).pairId != 0)
784 file << _sep_ << (*it).pairId << _sep_ << (*it).pairScore << _sep_ << (*it).mateDelta;
785 #endif
786 file << ::std::endl;
787
788 if (options.dumpAlignment)
789 dumpAlignment(file, align, viewPosReadFirst, viewPosReadLast + 1);
790 }
791 break;
792
793
794 case 1: // Enhanced Fasta Format
795 _sep_ = ',';
796 for(unsigned matchReadNo = -1, matchReadCount = 0; it != itEnd; ++it)
797 {
798 unsigned readLen = length(reads[(*it).rseqNo]);
799 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
800
801 if (matchReadNo != (*it).rseqNo)
802 {
803 matchReadNo = (*it).rseqNo;
804 matchReadCount = 0;
805 } else
806 ++matchReadCount;
807
808 ::std::string fastaID;
809 assign(fastaID, readIDs[(*it).rseqNo]);
810
811 ::std::string id = fastaID;
812 int fragId = (*it).rseqNo;
813 bool appendMatchId = options.maxHits > 1;
814
815 size_t left = fastaID.find_first_of('[');
816 size_t right = fastaID.find_last_of(']');
817 if (left != fastaID.npos && right != fastaID.npos && left < right)
818 {
819 fastaID.erase(right);
820 fastaID.erase(0, left + 1);
821 replace(fastaID.begin(), fastaID.end(), ',', ' ');
822 size_t pos = fastaID.find("id=");
823 if (pos != fastaID.npos) {
824 ::std::istringstream iss(fastaID.substr(pos + 3));
825 iss >> id;
826 // appendMatchId = false;
827 }
828 pos = fastaID.find("fragId=");
829 if (pos != fastaID.npos) {
830 ::std::istringstream iss(fastaID.substr(pos + 7));
831 iss >> fragId;
832 }
833 }
834
835 if ((*it).orientation == 'F')
836 // forward strand
837 file << '>' << ((*it).gBegin + options.positionFormat) << _sep_ << (*it).gEnd;
838 else
839 // reverse strand (switch begin and end)
840 file << '>' << (*it).gEnd << _sep_ << ((*it).gBegin + options.positionFormat);
841
842 unsigned ambig = 0;
843 for (unsigned i = 0; i <= (*it).editDist && i < length(stats); ++i)
844 ambig += stats[i][(*it).rseqNo];
845
846 file << "[id=" << id;
847 if (appendMatchId) file << "_" << matchReadCount;
848 file << ",fragId=" << fragId;
849 file << ",contigId=" << genomeIDs[(*it).gseqNo];
850 file << ",errors=" << (*it).editDist << ",percId=" << ::std::setprecision(5) << percId;
851 file << ",ambiguity=" << ambig << ']' << ::std::endl;
852
853 file << reads[(*it).rseqNo] << ::std::endl;
854 }
855 break;
856
857
858 case 2: // Eland Format
859 _sep_ = '\t';
860 for(unsigned readNo = 0; readNo < length(reads); ++readNo)
861 {
862 switch (options.readNaming)
863 {
864 // 0..filename is the read's Fasta id
865 case 0:
866 file << '>' << readIDs[readNo] << _sep_;
867 break;
868
869 // 1..filename is the read filename + seqNo
870 case 1:
871 file.fill('0');
872 file << readName << '#' << ::std::setw(pzeros) << readNo + 1 << _sep_;
873 break;
874 }
875
876 if (it == itEnd || readNo < (*it).rseqNo)
877 {
878 if (!empty(reads[readNo]))
879 file << reads[readNo] << _sep_ << "NM" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << ::std::endl;
880 else
881 {
882 for (unsigned i = 0; i < maxReadLength; ++i)
883 file << '.';
884 file << _sep_ << "QC" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << ::std::endl;
885 }
886 } else
887 {
888 file << reads[readNo] << _sep_;
889 unsigned bestMatches = 1;
890 if ((unsigned)(*it).editDist < length(stats))
891 bestMatches = stats[(*it).editDist][readNo];
892
893 if (bestMatches == 0) file << '?'; // impossible
894 if (bestMatches == 1) file << 'U'; // unique best match
895 if (bestMatches > 1) file << 'R'; // non-unique best matches
896
897 file << (*it).editDist << _sep_ << stats[0][readNo] << _sep_ << stats[1][readNo] << _sep_ << stats[2][readNo];
898
899 if (bestMatches == 1)
900 {
901 file << _sep_;
902 switch (options.genomeNaming)
903 {
904 // 0..filename is the read's Fasta id
905 case 0:
906 file << genomeIDs[(*it).gseqNo];
907 break;
908
909 // 1..filename is the read filename + seqNo
910 case 1:
911 file.fill('0');
912 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1;
913 }
914
915 if ((*it).orientation == 'F')
916 file << _sep_ << ((*it).gBegin + options.positionFormat) << _sep_ << 'F' << _sep_ << "..";
917 else
918 file << _sep_ << (*it).gEnd << _sep_ << 'R' << _sep_ << "..";
919
920 if ((*it).editDist > 0 && options.dumpAlignment && options.hammingOnly)
921 {
922 gInf = infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd);
923 if ((*it).orientation == 'R')
924 reverseComplement(gInf);
925 for (unsigned i = 0; i < length(gInf); ++i)
926 if ((options.compMask[ordValue(reads[readNo][i])] &
927 options.compMask[ordValue(gInf[i])]) == 0)
928 file << _sep_ << i + 1 << gInf[i];
929 }
930 }
931 file << ::std::endl;
932 ++it;
933 }
934 }
935 break;
936 case 3: // Gff: printf "$chr $name_$format read $pos %ld . $dir . ID=$col[0]$unique$rest\n",$pos+$len-1;
937 // NOTE(marehr): filecount+=2 might be a potential bug [https://github.com/seqan/seqan/issues/2165]
938 // In revision 4dbf27b55 and before, filecount was incremented twice at the
939 // end of the for loop, which caused a compiler warning (once in the body
940 // and once in the iteration_expression of the for loop). We kept this
941 // behaviour, because we have no active maintainer for this app.
942 for (unsigned filecount = 0; filecount < length(genomeFileNameList); filecount+=2)
943 {
944 // open genome file
945 SeqFileIn gFile;
946 if (!open(gFile, toCString(genomeFileNameList[filecount])))
947 {
948 std::cerr << "Couldn't open genome file." << std::endl;
949 break;
950 }
951
952 CharString currId;
953 Dna5String currGenome;
954
955 // iterate over genome sequences
956 for(; !atEnd(gFile); ++currSeqNo)
957 {
958 readRecord(currId, currGenome, gFile); // read Fasta sequence
959 while(it != itEnd && (*it).gseqNo == currSeqNo)
960 {
961 #ifdef RAZERS_DIRECT_MAQ_MAPPING
962 if(options.maqMapping && options.noBelowIdentity && (*it).seedEditDist > maxSeedErrors)
963 {
964 ++it;
965 continue;
966 }
967 #endif
968
969 unsigned currReadNo = (*it).rseqNo;
970 int unique = 1;
971 unsigned bestMatches = 0;
972
973 #ifdef RAZERS_DIRECT_MAQ_MAPPING
974 if(options.maqMapping)
975 bestMatches = stats[0][currReadNo] >> 5;
976 else
977 #endif
978 {
979 #ifdef RAZERS_SPLICED
980 if (options.minMatchLen > 0)
981 {
982 if( -(*it).pairScore < (int)length(stats))
983 bestMatches = stats[-(*it).pairScore][currReadNo]/2;
984 }
985 else
986 #endif
987 if (bestMatches == 0 && (unsigned)(*it).editDist < length(stats))
988 bestMatches = stats[(*it).editDist][currReadNo];
989
990 }
991
992 bool suboptimal = false;
993 if (
994 #ifdef RAZERS_DIRECT_MAQ_MAPPING
995 !options.maqMapping &&
996 #endif
997 (unsigned)(*it).editDist > 0
998 #ifdef RAZERS_SPLICED
999 && options.minMatchLen == 0
1000 #endif
1001 )
1002 {
1003 for(unsigned d = 0; d < (unsigned)(*it).editDist; ++d)
1004 if (stats[d][currReadNo]>0) suboptimal=true;
1005 }
1006 #ifdef RAZERS_SPLICED
1007 if(options.minMatchLen > 0 && -(*it).pairScore > 0)
1008 {
1009 for(unsigned d = 0; d < -(unsigned)(*it).pairScore; ++d)
1010 if (stats[d][currReadNo]>0) suboptimal=true;
1011 }
1012 #endif
1013
1014 if (bestMatches != 1)
1015 {
1016 unique = 0;
1017 if(options.purgeAmbiguous)
1018 {
1019 ++it;
1020 continue;
1021 }
1022
1023 // if((*it).mScore > 0) std::cout << (*it).mScore << "<-non uniq but score > 0\n";
1024 // ++it;
1025 // continue; // TODO: output non-unique matches (concerns maq mapping only)
1026 }
1027 unsigned readLen = length(reads[currReadNo]);
1028 #ifdef RAZERS_SPLICED
1029 if(options.minMatchLen > 0)
1030 readLen = (*it).mScore;
1031 #endif
1032 String<Dna5Q> readInf = infix(reads[currReadNo],0,readLen);
1033 #ifdef RAZERS_SPLICED
1034 if(options.minMatchLen > 0)
1035 {
1036 if(((*it).orientation == 'F' && (*it).mateDelta < 0)
1037 || ((*it).orientation == 'R' && (*it).mateDelta > 0))
1038 {
1039 readInf = infix(reads[currReadNo],
1040 length(reads[currReadNo])-readLen,
1041 length(reads[currReadNo]));
1042 }
1043 }
1044 #endif
1045
1046 switch (options.genomeNaming)
1047 {
1048 // 0..filename is the read's Fasta id
1049 case 0:
1050 file << genomeIDs[(*it).gseqNo] <<'\t';
1051 break;
1052 // 1..filename is the read filename + seqNo
1053 case 1:
1054 file.fill('0');
1055 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1 << '\t';
1056 break;
1057 }
1058
1059 // get alignment to dump or to fix end coordinates
1060 if (!options.hammingOnly)
1061 {
1062 assignSource(row(align, 0), readInf);
1063 assignSource(row(align, 1), infix(currGenome, (*it).gBegin, (*it).gEnd));
1064 if ((*it).orientation == 'R')
1065 reverseComplement(source(row(align, 1)));
1066
1067 #ifdef RAZERS_SPLICED
1068 if(options.minMatchLen > 0)
1069 globalAlignment(align, scoreType, AlignConfig<false,false,false,false>(), Gotoh());
1070 else
1071 #endif
1072 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
1073
1074 // transform first and last read character to genomic positions
1075 viewPosReadFirst = toViewPosition(row(align, 0), 0);
1076 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
1077 unsigned genomePosReadFirst = toSourcePosition(row(align, 1), viewPosReadFirst);
1078 unsigned genomePosReadLast = toSourcePosition(row(align, 1), viewPosReadLast);
1079 #ifdef RAZERS_SPLICED
1080 if(options.minMatchLen == 0)
1081 #endif
1082 {
1083 if ((*it).orientation == 'R')
1084 {
1085 (*it).gBegin = (*it).gEnd - (genomePosReadLast + 1);
1086 (*it).gEnd -= genomePosReadFirst;
1087 }
1088 else
1089 {
1090 (*it).gEnd = (*it).gBegin + (genomePosReadLast + 1);
1091 (*it).gBegin += genomePosReadFirst;
1092 }
1093 }
1094 }
1095
1096 //file << options.runID << "_razers\tread";
1097 file << "razers\tread";
1098 file << '\t' << ((*it).gBegin + options.positionFormat) << '\t' << (*it).gEnd << '\t';
1099 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
1100 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1101 if(options.maqMapping)
1102 file << (*it).mScore << "\t";
1103 else
1104 #endif
1105 file << percId << "\t";
1106
1107 if ((*it).orientation == 'F')
1108 file << '+' << '\t' << '.' <<'\t';
1109 else
1110 file << '-' << '\t' << '.' <<'\t';
1111
1112 switch (options.readNaming)
1113 {
1114 // 0..filename is the read's Fasta id
1115 case 0:
1116 file << "ID=" <<readIDs[currReadNo];
1117 break;
1118
1119 // 1..filename is the read filename + seqNo
1120 case 1:
1121 file.fill('0');
1122 file << "ID=" << readName << '#' << ::std::setw(pzeros) << currReadNo + 1;
1123 break;
1124 }
1125 if(suboptimal) file << ";suboptimal";
1126 else
1127 {
1128 if(unique==1) file << ";unique";
1129 if(unique==0) file << ";multi";
1130 }
1131
1132 #ifdef RAZERS_SPLICED
1133 if(options.minMatchLen > 0)
1134 {
1135 file << ";delta=" << (*it).mateDelta << ";pairId="<<(*it).pairId;
1136 file << ";pairErr=" << -(*it).pairScore;
1137 }
1138 #endif
1139
1140 if ((*it).editDist > 0)
1141 {
1142 if (options.hammingOnly)
1143 {
1144 gInf = infix(currGenome, (*it).gBegin, (*it).gEnd);
1145 if ((*it).orientation == 'R')
1146 reverseComplement(gInf);
1147 bool first = true;
1148 file << ";cigar=" << readLen << "M";
1149 file << ";mutations=";
1150 for (unsigned i = 0; i < length(gInf); ++i)
1151 if ((options.compMask[ordValue(readInf[i])] &
1152 options.compMask[ordValue(gInf[i])]) == 0)
1153 {
1154 if(first){ file << i + 1 << (Dna5)readInf[i]; first = false;}
1155 else file <<','<< i + 1 << (Dna5)readInf[i];
1156 }
1157
1158 }
1159 else
1160 {
1161 std::stringstream cigar, mutations;
1162 typedef typename Row<TAlign>::Type TRow;
1163 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
1164 TAlignIterator ali_it0 = iter(row(align,0),viewPosReadFirst);
1165 TAlignIterator ali_it1 = iter(row(align,1),viewPosReadFirst);
1166 TAlignIterator ali_it0stop = iter(row(align,0),viewPosReadLast + 1);
1167 TAlignIterator ali_it1stop = iter(row(align,1),viewPosReadLast + 1);
1168
1169 #ifdef RAZERS_SPLICED
1170 if(options.minMatchLen > 0)
1171 {
1172 ali_it0 = begin(row(align,0));
1173 ali_it1 = begin(row(align,1));
1174 ali_it0stop = end(row(align,0));
1175 ali_it1stop = end(row(align,1));
1176 }
1177 #endif
1178 getCigarLine(align,cigar,mutations,ali_it0,ali_it0stop,ali_it1,ali_it1stop);
1179 file << ";cigar="<<cigar.str();
1180 if(length(mutations.str())>0)
1181 file << ";mutations=" << mutations.str();
1182 }
1183 }
1184
1185
1186 if(
1187 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1188 options.maqMapping ||
1189 #endif
1190 options.fastaIdQual)
1191 {
1192 // file << ";read=";
1193 // for(unsigned j=0;j<length(reads[currReadNo]);++j)
1194 // {
1195 // file << (Dna5)reads[currReadNo][j];
1196 // }
1197 file << ";quality=";
1198 for(unsigned j=0;j<readLen;++j)
1199 {
1200 file << (char)(getQualityValue(readInf[j])+ 33);
1201 }
1202 }
1203
1204 file << ::std::endl;
1205 ++it;
1206 }
1207 }
1208 close(gFile);
1209 }
1210 break;
1211 }
1212
1213 close(file);
1214
1215 // get empirical error distribution
1216 if (!empty(errorPrbFileName) && maxReadLength > 0)
1217 {
1218 std::ofstream file;
1219 file.open(toCString(errorPrbFileName), ::std::ios_base::out | ::std::ios_base::trunc);
1220 if (file.is_open())
1221 {
1222 String<long double> posError;
1223 unsigned unique = 0;
1224 unsigned insertions = 0;
1225 unsigned deletions = 0;
1226 resize(posError, maxReadLength, 0);
1227
1228 if (options.hammingOnly)
1229 unique = getErrorDistribution(posError, matches, reads, genomes, options);
1230 else
1231 {
1232 unique = getErrorDistribution(posError, insertions, deletions, matches, reads, genomes, options);
1233 ::std::cerr << "insertProb: " << (double)insertions / ((double)length(posError) * (double)unique) << ::std::endl;
1234 ::std::cerr << "deleteProb: " << (double)deletions / ((double)length(posError) * (double)unique) << ::std::endl;
1235 }
1236
1237 file << (double)posError[0] / (double)unique;
1238 for (unsigned i = 1; i < length(posError); ++i)
1239 file << '\t' << (double)posError[i] / (double)unique;
1240 file << ::std::endl;
1241 file.close();
1242 } else
1243 ::std::cerr << "Failed to open error distribution file" << ::std::endl;
1244 }
1245
1246 options.timeDumpResults = SEQAN_PROTIMEDIFF(dump_time);
1247
1248 if (options._debugLevel >= 1)
1249 ::std::cerr << "Dumping results took \t" << options.timeDumpResults << " seconds" << ::std::endl;
1250 }
1251
1252
1253 }
1254
1255 #endif
1256
1257