1 /*==========================================================================
2 RazerS - Fast Read Mapping with Controlled Loss Rate
3 http://www.seqan.de/projects/razers.html
4
5 ============================================================================
6 Copyright (C) 2008 by David Weese
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your options) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ==========================================================================*/
21
22 #ifndef SEQAN_HEADER_OUTPUT_FORMAT_H
23 #define SEQAN_HEADER_OUTPUT_FORMAT_H
24
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28
29 #include "razers.h"
30 #include <seqan/align.h>
31
32 namespace SEQAN_NAMESPACE_MAIN
33 {
34
35 //////////////////////////////////////////////////////////////////////////////
36 // Quality-based score
37
38 template <typename TQualityString = CharString>
39 struct Quality;
40
41 template <typename TValue, typename TQualityString>
42 class Score<TValue, Quality<TQualityString> >
43 {
44 public:
45 TValue data_match;
46 TValue data_mismatch;
47 TValue data_gap_extend;
48 TValue data_gap_open;
49
50 TQualityString const *data_qual;
51
52 public:
Score()53 Score():
54 data_match(0),
55 data_mismatch(-1),
56 data_gap_extend(-1),
57 data_gap_open(-1),
58 data_qual(NULL)
59 {
60 }
Score(TValue _match,TValue _mismatch,TValue _gap)61 Score(TValue _match, TValue _mismatch, TValue _gap):
62 data_match(_match),
63 data_mismatch(_mismatch),
64 data_gap_extend(_gap),
65 data_gap_open(_gap),
66 data_qual(NULL)
67 {
68 }
Score(TValue _match,TValue _mismatch,TValue _gap_extend,TValue _gap_open,TQualityString const & _qual)69 Score(TValue _match, TValue _mismatch, TValue _gap_extend, TValue _gap_open, TQualityString const &_qual):
70 data_match(_match),
71 data_mismatch(_mismatch),
72 data_gap_extend(_gap_extend),
73 data_gap_open(_gap_open),
74 data_qual(&_qual)
75 {
76 }
77
Score(Score const & other)78 Score(Score const & other):
79 data_match(other.data_match),
80 data_mismatch(other.data_mismatch),
81 data_gap_extend(other.data_gap_extend),
82 data_gap_open(other.data_gap_open),
83 data_qual(other.data_qual)
84 {
85 }
~Score()86 ~Score()
87 {
88 }
89
90 Score & operator = (Score const & other)
91 {
92 data_match = other.data_match;
93 data_mismatch = other.data_mismatch;
94 data_gap_extend = other.data_gap_extend;
95 data_gap_open = other.data_gap_open;
96 data_qual = other.data_qual;
97 return *this;
98 }
99 };
100
101 //////////////////////////////////////////////////////////////////////////////
102
103 template <typename TValue, typename TQualityString, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
104 inline TValue
score(Score<TValue,Quality<TQualityString>> const & me,TPos1 pos1,TPos2 pos2,TSeq1 const & seq1,TSeq2 const & seq2)105 score(Score<TValue, Quality<TQualityString> > const & me,
106 TPos1 pos1,
107 TPos2 pos2,
108 TSeq1 const &seq1,
109 TSeq2 const &seq2)
110 {
111 if (seq1[pos1] != seq2[pos2])
112 if (me.data_qual)
113 return (*me.data_qual)[pos2];
114 else
115 return scoreMismatch(me);
116 else
117 return scoreMatch(me);
118 }
119
120
121 //////////////////////////////////////////////////////////////////////////////
122 // Less-operators ...
123
124 // ... to sort matches and remove duplicates with equal gBegin
125 template <typename TReadMatch>
126 struct LessGPosRNo : public ::std::binary_function < TReadMatch, TReadMatch, bool >
127 {
operatorLessGPosRNo128 inline bool operator() (TReadMatch const &a, TReadMatch const &b) const
129 {
130 // genome position and orientation
131 if (a.gseqNo < b.gseqNo) return true;
132 if (a.gseqNo > b.gseqNo) return false;
133 if (a.gBegin < b.gBegin) return true;
134 if (a.gBegin > b.gBegin) return false;
135 if (a.orientation < b.orientation) return true;
136 if (a.orientation > b.orientation) return false;
137
138 // read number
139 if (a.rseqNo < b.rseqNo) return true;
140 if (a.rseqNo > b.rseqNo) return false;
141
142 // quality
143 return a.editDist < b.editDist;
144 }
145 };
146
147 //////////////////////////////////////////////////////////////////////////////
148 // Determine error distribution
149 template <typename TErrDistr, typename TMatches, typename TReads, typename TGenomes, typename TOptions>
150 inline unsigned
getErrorDistribution(TErrDistr & posError,TMatches & matches,TReads & reads,TGenomes & genomes,TOptions & options)151 getErrorDistribution(
152 TErrDistr &posError,
153 TMatches &matches,
154 TReads &reads,
155 TGenomes &genomes,
156 TOptions &options)
157 {
158 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
159 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
160
161 Dna5String genome;
162 unsigned unique = 0;
163 for (; it != itEnd; ++it)
164 {
165 if ((*it).orientation == '-') continue;
166
167 Dna5String const &read = reads[(*it).rseqNo];
168 genome = infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd);
169 if ((*it).orientation == 'R')
170 reverseComplement(genome);
171 for (unsigned i = 0; i < length(posError) && i < length(read); ++i)
172 if ((options.compMask[ordValue(genome[i])] & options.compMask[ordValue(read[i])]) == 0)
173 ++posError[i];
174 ++unique;
175 }
176 return unique;
177 }
178
179 template <typename TErrDistr, typename TCount1, typename TCount2, typename TMatches, typename TReads, typename TGenomes, typename TSpec>
180 inline unsigned
getErrorDistribution(TErrDistr & posError,TCount1 & insertions,TCount2 & deletions,TMatches & matches,TReads & reads,TGenomes & genomes,RazerSOptions<TSpec> & options)181 getErrorDistribution(
182 TErrDistr &posError,
183 TCount1 &insertions,
184 TCount2 &deletions,
185 TMatches &matches,
186 TReads &reads,
187 TGenomes &genomes,
188 RazerSOptions<TSpec> &options)
189 {
190 typedef Align<String<Dna5>, ArrayGaps> TAlign;
191 typedef typename Row<TAlign>::Type TRow;
192 typedef typename Iterator<TRow>::Type TIter;
193
194 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
195 typedef typename Position<TAlign>::Type TPosition;
196
197 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
198 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
199
200 Align<Dna5String, ArrayGaps> align;
201 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
202 if (options.hammingOnly)
203 scoreType.data_mismatch = -1;
204 resize(rows(align), 2);
205
206 unsigned unique = 0;
207 for (; it != itEnd; ++it)
208 {
209 if ((*it).orientation == '-') continue;
210
211 assignSource(row(align, 0), reads[(*it).rseqNo]);
212 assignSource(row(align, 1), infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd));
213 if ((*it).orientation == 'R')
214 reverseComplement(source(row(align, 1)));
215 globalAlignment(align, scoreType);
216
217 TRow& row0 = row(align, 0);
218 TRow& row1 = row(align, 1);
219
220 TPosition begin = beginPosition(cols(align));
221 TPosition end = endPosition(cols(align));
222
223 TIter it0 = iter(row0, begin);
224 TIter it1 = iter(row1, begin);
225 TIter end0 = iter(row0, end);
226
227 unsigned pos = 0;
228 for (; it0 != end0 && pos < length(posError); ++it0, ++it1)
229 {
230 if (isGap(it0))
231 ++insertions;
232 else
233 {
234 if (isGap(it1))
235 ++deletions;
236 else
237 if ((options.compMask[ordValue(getValue(it0))] & options.compMask[ordValue(getValue(it1))]) == 0)
238 ++posError[pos];
239 ++pos;
240 }
241 }
242 ++unique;
243 }
244 return unique;
245 }
246
247
248 //////////////////////////////////////////////////////////////////////////////
249 template <typename TFile, typename TSource, typename TSpec, typename TPosition>
250 inline void
dumpAlignment(TFile & target,Align<TSource,TSpec> const & source,TPosition begin_,TPosition end_)251 dumpAlignment(TFile & target, Align<TSource, TSpec> const & source, TPosition begin_, TPosition end_)
252 {
253 typedef Align<TSource, TSpec> const TAlign;
254 typedef typename Row<TAlign>::Type TRow;
255 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
256
257 TRowsPosition row_count = length(rows(source));
258
259 // Print sequences
260 for (TRowsPosition i = 0; i < row_count; ++i)
261 {
262 if (i == 0)
263 _streamWrite(target, "#Read: ");
264 else
265 _streamWrite(target, "#Genome: ");
266 TRow& row_ = row(source, i);
267 typedef typename Iterator<typename Row<TAlign>::Type const>::Type TIter;
268 TIter begin1_ = iter(row_, begin_);
269 TIter end1_ = iter(row_, end_);
270 for (; begin1_ != end1_; ++begin1_) {
271 if (isGap(begin1_)) _streamPut(target, gapValue<char>());
272 else _streamPut(target, *begin1_);
273 }
274 _streamPut(target, '\n');
275 }
276 }
277
278 template<typename TMatches, typename TCounts, typename TOptions>
279 void
countCoocurrences(TMatches & matches,TCounts & cooc,TOptions & options)280 countCoocurrences(TMatches & matches, TCounts & cooc, TOptions & options)
281 {
282 clear(cooc);
283 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength) + 1;
284 resize(cooc,maxSeedErrors+1,0);
285 for (int i = 0; i < maxSeedErrors+1; ++i)
286 cooc[i] = 1;
287
288 int count = 0;
289 unsigned readNo = -1;
290 int preEditDist = -1;
291 typename Iterator<TMatches>::Type it = begin(matches,Standard());
292 typename Iterator<TMatches>::Type itEnd = end(matches,Standard());
293
294 for(; it != itEnd; ++it)
295 {
296 if ((*it).rseqNo == readNo)
297 {
298 if(preEditDist > 1) continue;// || dist > options.errorRate * maxReadLength + 1) continue;
299 int dist = (*it).seedEditDist - preEditDist;
300 if(dist > maxSeedErrors) continue;
301 if(dist < 0) ++cooc[0];
302 else ++cooc[dist];
303 }
304 else
305 {
306 readNo = (*it).rseqNo;
307 preEditDist = (*it).seedEditDist;
308 if(preEditDist <= 1) ++count;
309 }
310 }
311 for (unsigned i = 0; i < length(cooc); ++i)
312 {
313 cooc[i] = (int)(-4.343 * log((double)cooc[i]/count) );
314 if (cooc[i] < 0) cooc[i] = 0;
315 }
316 if(options._debugLevel > 1)
317 {
318 ::std::cerr << "[mapping_count] ";
319 for(unsigned j = 0; j < length(cooc); ++j)
320 ::std::cerr << cooc[j] << " ";
321 ::std::cerr << ::std::endl;
322 }
323
324 }
325
326
327 template<typename TAlign, typename TString, typename TIter>
328 void
getCigarLine(TAlign & align,TString & cigar,TString & mutations,TIter ali_it0,TIter ali_it0_stop,TIter ali_it1,TIter ali_it1_stop)329 getCigarLine(TAlign & align, TString & cigar, TString & mutations, TIter ali_it0, TIter ali_it0_stop, TIter ali_it1, TIter ali_it1_stop)
330 {
331
332 typedef typename Source<TAlign>::Type TSource;
333 typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
334
335 // typedef typename Row<TAlign>::Type TRow;
336 // typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
337
338 TStringIterator readBase = begin(source(row(align,0)));
339
340 int readPos = 0;
341 bool first = true;
342 int inserted = 0;
343 int deleted = 0;
344 while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
345 {
346 inserted = 0;
347 deleted = 0;
348 int matched = 0;
349 while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
350 {
351 ++readPos;
352 if(*ali_it1 != *ali_it0)
353 {
354 if(first) first = false;
355 else mutations << ",";
356 mutations << readPos <<*readBase;
357 }
358 ++readBase;
359 ++ali_it0;
360 ++ali_it1;
361 ++matched;
362 }
363 if(matched>0) cigar << matched<< "M" ;
364 while(ali_it0!=ali_it0_stop && isGap(ali_it0))
365 {
366 ++ali_it0;
367 ++ali_it1;
368 ++deleted;
369 }
370 if(deleted>0) cigar << deleted << "D";
371 while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
372 {
373 ++ali_it0;
374 ++ali_it1;
375 ++readPos;
376 if(first) first = false;
377 else mutations << ",";
378 mutations << readPos << *readBase;
379 ++readBase;
380 ++inserted;
381 }
382 if(inserted>0) cigar << inserted << "I";
383 }
384 // end gaps can happen in split mapping
385 while(ali_it0!=ali_it0_stop)
386 {
387 ++ali_it0;
388 ++deleted;
389 }
390 if(deleted>0) cigar << deleted << "D";
391 while(ali_it1 != ali_it1_stop)
392 {
393 ++ali_it1;
394 ++readPos;
395 if(first) first = false;
396 else mutations << ",";
397 mutations << readPos << *readBase;
398 ++inserted;
399 }
400 if(inserted>0) cigar << inserted << "I";
401
402 }
403
404
405 #ifdef RAZERS_DIRECT_MAQ_MAPPING
406 //////////////////////////////////////////////////////////////////////////////
407 // Assign mapping quality and remove suboptimal matches
408 template < typename TMatches, typename TReads, typename TCooc, typename TCounts, typename TSpec >
assignMappingQuality(TMatches & matches,TReads & reads,TCooc & cooc,TCounts & cnts,RazerSOptions<TSpec> & options)409 void assignMappingQuality(TMatches &matches, TReads & reads, TCooc & cooc, TCounts &cnts, RazerSOptions<TSpec> & options)
410 {
411 typedef typename Value<TMatches>::Type TMatch;
412 typedef typename Iterator<TMatches, Standard>::Type TIterator;
413
414 //matches are already sorted
415 //::std::sort(
416 // begin(matches, Standard()),
417 // end(matches, Standard()),
418 // LessRNoMQ<TMatch>());
419
420
421 int maxSeedErrors = (int)(options.errorRate*options.artSeedLength)+1;
422 unsigned readNo = -1;
423
424 TIterator it = begin(matches, Standard());
425 TIterator itEnd = end(matches, Standard());
426 TIterator dit = it;
427
428 int bestQualSum, secondBestQualSum;
429 int secondBestDist = -1 ,secondBestMatches = -1;
430 for (; it != itEnd; ++it)
431 {
432 if ((*it).orientation == '-') continue;
433 bool mappingQualityFound = false;
434 int mappingQuality = 0;
435 int qualTerm1,qualTerm2;
436
437 readNo = (*it).rseqNo;
438 bestQualSum = (*it).mScore;
439
440 if(++it!=itEnd && (*it).rseqNo==readNo)
441 {
442 secondBestQualSum = (*it).mScore;
443 secondBestDist = (*it).editDist;
444 secondBestDist = (*it).editDist;
445 secondBestMatches = cnts[1][readNo] >> 5;
446 //CHECKcnts secondBestMatches = cnts[secondBestDist][readNo];
447 // secondBestMatches = cnts[secondBestDist][readNo];
448 (*it).orientation = '-';
449 // if(secondBestDist<=bestDist) unique=0;
450 }
451 else secondBestQualSum = -1000;
452 --it; //it is now at best match of current rseqNo
453
454 int bestDist = (*it).editDist;
455 int kPrime = (*it).seedEditDist;
456 if((bestQualSum==secondBestQualSum) || (kPrime>maxSeedErrors))
457 mappingQualityFound = true; //mq=0
458 else{
459 if(secondBestQualSum == -1000) qualTerm1 = 99;
460 else
461 {
462 qualTerm1 = (int)(secondBestQualSum - bestQualSum - 4.343 * log((double)secondBestMatches));
463 //if (secondBestKPrime - kPrime <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual; //TODO abchecken was mehr sinn macht
464 if (secondBestDist - bestDist <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual;
465 }
466 float avgSeedQual = 0.0;
467 if(!mappingQualityFound)
468 {
469 //TODO!!! generalize and adapt to razers lossrates
470 // lossrate 0.42 => -10 log10(0.42) = 4
471 int kPrimeLoss = 4; // options.kPrimeLoss; // bezieht sich auf 3 fehler in 28bp
472 qualTerm2 = kPrimeLoss + cooc[maxSeedErrors-kPrime];
473
474 for(unsigned j = 0; j<options.artSeedLength; ++j)
475 {
476 int q = getQualityValue(reads[readNo][j]);//(int)((unsigned char)(reads[readNo][j])>>3);
477 if(q>options.mutationRateQual) q = options.mutationRateQual;
478 avgSeedQual+=q;
479 }
480 avgSeedQual/=options.artSeedLength;
481 //-10 log10(28-2) = 14;
482 //generalize to -10 log10(artSeedLength - maxSeedErrors +1 ) // 14 fits for seedlength 28 to 32 with 2 errors
483 if(avgSeedQual>14) qualTerm2 += (int)((maxSeedErrors-kPrime)*(avgSeedQual-14));
484 }
485 }
486 if (!mappingQualityFound) mappingQuality = (qualTerm1<qualTerm2) ? qualTerm1:qualTerm2;
487 if (mappingQuality < 0) mappingQuality = 0;
488 (*it).mScore = mappingQuality;
489
490 *dit = *it;
491 // if(secondBestQualSum != -1000) ++it;
492 ++dit;
493 }
494 resize(matches, dit - begin(matches, Standard()));
495 }
496 #endif
497
498
499 //////////////////////////////////////////////////////////////////////////////
500 // Output matches
501 template <
502 typename TMatches,
503 typename TGenomeNames,
504 typename TReads,
505 typename TReadNames,
506 typename TCounts,
507 typename TSpec
508 >
dumpMatches(TMatches & matches,TGenomeNames const & genomeIDs,StringSet<CharString> & genomeFileNameList,::std::map<unsigned,::std::pair<::std::string,unsigned>> & gnoToFileMap,TReads const & reads,TCounts & stats,TReadNames const & readIDs,CharString readFName,CharString errorPrbFileName,RazerSOptions<TSpec> & options)509 void dumpMatches(
510 TMatches &matches, // forward/reverse matches
511 TGenomeNames const &genomeIDs, // Genome names (read from Fasta file, currently unused)
512 StringSet<CharString> &genomeFileNameList, // list of genome names (e.g. {"hs_ref_chr1.fa","hs_ref_chr2.fa"})
513 ::std::map<unsigned,::std::pair< ::std::string,unsigned> > &gnoToFileMap, //map to retrieve genome filename and sequence number within that file
514 TReads const &reads,
515 TCounts & stats, // Match statistics (possibly empty)
516 TReadNames const &readIDs, // Read names (read from Fasta file, currently unused)
517 CharString readFName, // read name (e.g. "reads.fa"), used for file/read naming
518 CharString errorPrbFileName,
519 RazerSOptions<TSpec> &options)
520 {
521 typedef typename Value<TMatches>::Type TMatch;
522 typedef typename Value<TReads>::Type TRead;
523 typedef typename Value<TGenomeSet>::Type TGenome;
524 typedef typename TMatch::TGPos TGPos;
525
526 if (options.outputFormat == 2)
527 {
528 options.sortOrder = 0; // ... to count multiple matches
529 }
530
531 if (options.outputFormat == 2)
532 {
533 options.maxHits = 1; // Eland outputs at most one match
534 options.sortOrder = 0; // read numbers are increasing
535 options.positionFormat = 1; // bases in file are numbered starting at 1
536 options.dumpAlignment = options.hammingOnly;
537 }
538 #ifdef RAZERS_DIRECT_MAQ_MAPPING
539 if (options.maqMapping) options.outputFormat = 3;
540 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength); //without + 1 here, used to check whether match should be supported if noBelowIdentity option is switched on
541 #endif
542 if (options.outputFormat == 3)
543 {
544 options.sortOrder = 1; // sort according to gPos
545 options.positionFormat = 1; // bases in file are numbered starting at 1
546 options.dumpAlignment = false;
547 }
548
549 #ifdef RAZERS_SPLICED
550 if(options.minMatchLen > 0)
551 options.sortOrder = 1;
552 #endif
553
554 // error profile
555 unsigned maxReadLength = 0;
556 for (unsigned i = 0; i < length(reads); ++i)
557 if (maxReadLength < length(reads[i]))
558 maxReadLength = length(reads[i]);
559
560 SEQAN_PROTIMESTART(dump_time);
561
562 // load Genome sequences for alignment dumps
563 TGenomeSet genomes;
564 if ((options.outputFormat == 0 && !options.hammingOnly) || options.dumpAlignment || !empty(errorPrbFileName))
565 if (!loadGenomes(genomes, genomeFileNameList)) {
566 ::std::cerr << "Failed to load genomes" << ::std::endl;
567 options.dumpAlignment = false;
568 }
569
570 // how many 0's should be padded?
571 int pzeros = 0;
572 for (unsigned l = length(reads); l > 9; l = l / 10)
573 ++pzeros;
574
575 int gzeros = 0;
576 for (unsigned l = length(genomes); l > 9; l = l / 10)
577 ++gzeros;
578
579 // remove the directory prefix of readFName
580 ::std::string _readName;
581 assign(_readName, readFName);
582 size_t lastPos = _readName.find_last_of('/') + 1;
583 if (lastPos == _readName.npos) lastPos = _readName.find_last_of('\\') + 1;
584 if (lastPos == _readName.npos) lastPos = 0;
585 CharString readName = _readName.substr(lastPos);
586
587
588 typedef Align<String<Dna5>, ArrayGaps> TAlign;
589 TAlign align;
590 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
591 // Score<int> scoreType(0,-1,-1,-1);
592
593 if (options.hammingOnly)
594 scoreType.data_mismatch = -1;
595 resize(rows(align), 2);
596
597 ::std::ofstream file;
598 CharString fileName = options.output;
599 if (empty(fileName))
600 {
601 fileName = readFName;
602 append(fileName, ".result");
603 }
604
605 file.open(toCString(fileName), ::std::ios_base::out | ::std::ios_base::trunc);
606 if (!file.is_open()) {
607 ::std::cerr << "Failed to open output file" << ::std::endl;
608 return;
609 }
610
611
612 #ifdef RAZERS_SPLICED
613 //maskPairDuplicates(matches);
614 #else
615 maskDuplicates(matches);
616 #endif
617 if (options.outputFormat > 0
618 #ifdef RAZERS_DIRECT_MAQ_MAPPING
619 && !options.maqMapping
620 #endif
621 )
622 {
623 // match statistics
624 unsigned maxErrors = (int)(options.errorRate * maxReadLength);
625 //if (maxErrors > 10) maxErrors = 10;
626 resize(stats, maxErrors + 1);
627 for (unsigned i = 0; i <= maxErrors; ++i)
628 resize(stats[i], length(reads), 0);
629 #ifdef RAZERS_SPLICED
630 if(options.minMatchLen > 0) countSplitMatches(matches, stats);
631 else
632 #endif
633 countMatches(matches, stats);
634 }
635
636
637
638 Nothing nothing;
639 unsigned currSeqNo = 0;
640 #ifdef RAZERS_DIRECT_MAQ_MAPPING
641 if(options.maqMapping)
642 {
643 String<int> cooc;
644 compactMatches(matches, stats, options, true, nothing, false); //only best two matches per read are kept
645 countCoocurrences(matches,cooc,options); //coocurrence statistics are filled
646 assignMappingQuality(matches,reads,cooc,stats,options);//mapping qualities are assigned and only one match per read is kept
647 }
648 else
649 #endif
650
651 #ifdef RAZERS_MICRO_RNA
652 if(options.microRNA)purgeAmbiguousRnaMatches(matches,options);
653 else
654 #endif
655 {
656 #ifdef RAZERS_SPLICED
657 if(options.minMatchLen>0)
658 compactSplicedMatches(matches, stats, options, true, nothing,nothing);
659 else
660 #endif
661 compactMatches(matches, stats, options, true, nothing);
662 }
663
664 switch (options.sortOrder) {
665 case 0:
666 ::std::sort(
667 begin(matches, Standard()),
668 end(matches, Standard()),
669 LessRNoGPos<TMatch>());
670 break;
671
672 case 1:
673 ::std::sort(
674 begin(matches, Standard()),
675 end(matches, Standard()),
676 LessGPosRNo<TMatch>());
677 break;
678
679 }
680
681 typename Iterator<TMatches, Standard>::Type it = begin(matches, Standard());
682 typename Iterator<TMatches, Standard>::Type itEnd = end(matches, Standard());
683
684
685 Dna5String gInf;
686 char _sep_ = '\t';
687 unsigned viewPosReadFirst = 0;
688 unsigned viewPosReadLast = 0;
689
690 switch (options.outputFormat)
691 {
692 case 0: // Razer Format
693 // _sep_ = ',';
694 for(; it != itEnd; ++it)
695 {
696 unsigned readLen = length(reads[(*it).rseqNo]);
697 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
698 #ifdef RAZERS_MICRO_RNA
699 percId = 100.0 * (1.0 - (double)(*it).editDist / (double) ((*it).mScore));
700 #endif
701 switch (options.readNaming)
702 {
703 // 0..filename is the read's Fasta id
704 case 0:
705 file << readIDs[(*it).rseqNo];
706 break;
707
708 // 1..filename is the read filename + seqNo
709 case 1:
710 file.fill('0');
711 file << readName << '#' << ::std::setw(pzeros) << (*it).rseqNo + 1;
712 break;
713
714 // 2..filename is the read sequence itself
715 case 2:
716 file << reads[(*it).rseqNo];
717 }
718
719 file << _sep_ << options.positionFormat << _sep_ << readLen << _sep_ << (*it).orientation << _sep_;
720
721 switch (options.genomeNaming)
722 {
723 // 0..filename is the read's Fasta id
724 case 0:
725 file << genomeIDs[(*it).gseqNo];
726 break;
727
728 // 1..filename is the read filename + seqNo
729 case 1:
730 file.fill('0');
731 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1;
732 }
733
734 // get alignment to dump or to fix end coordinates
735 if (options.dumpAlignment || !options.hammingOnly)
736 {
737 #ifdef RAZERS_MICRO_RNA
738 if(options.microRNA)
739 assignSource(row(align, 0), prefix(reads[(*it).rseqNo],(*it).mScore));
740 else
741 #endif
742 assignSource(row(align, 0), reads[(*it).rseqNo]);
743 assignSource(row(align, 1), infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd));
744 #ifdef RAZERS_MATEPAIRS
745 if ((*it).pairId != 0 && ((*it).rseqNo & 1))
746 reverseComplement(source(row(align, 0)));
747 #endif
748 if ((*it).orientation == 'R')
749 reverseComplement(source(row(align, 1)));
750
751 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
752
753 // transform first and last read character to genomic positions
754 viewPosReadFirst = toViewPosition(row(align, 0), 0);
755 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
756 unsigned genomePosReadFirst = toSourcePosition(row(align, 1), viewPosReadFirst);
757 unsigned genomePosReadLast = toSourcePosition(row(align, 1), viewPosReadLast);
758
759 if ((*it).orientation == 'R')
760 {
761 (*it).gBegin = (*it).gEnd - (genomePosReadLast + 1);
762 (*it).gEnd -= genomePosReadFirst;
763 }
764 else
765 {
766 (*it).gEnd = (*it).gBegin + (genomePosReadLast + 1);
767 (*it).gBegin += genomePosReadFirst;
768 }
769 }
770
771 file << _sep_ << ((*it).gBegin + options.positionFormat) << _sep_ << (*it).gEnd << _sep_ << ::std::setprecision(5) << percId;
772 #ifdef RAZERS_MICRO_RNA
773 if(options.microRNA) file << _sep_ << (*it).mScore;
774 #endif
775
776 #ifdef RAZERS_MATEPAIRS
777 if ((*it).pairId != 0)
778 file << _sep_ << (*it).pairId << _sep_ << (*it).pairScore << _sep_ << (*it).mateDelta;
779 #endif
780 file << ::std::endl;
781
782 if (options.dumpAlignment)
783 dumpAlignment(file, align, viewPosReadFirst, viewPosReadLast + 1);
784 }
785 break;
786
787
788 case 1: // Enhanced Fasta Format
789 _sep_ = ',';
790 for(unsigned matchReadNo = -1, matchReadCount = 0; it != itEnd; ++it)
791 {
792 unsigned readLen = length(reads[(*it).rseqNo]);
793 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
794
795 if (matchReadNo != (*it).rseqNo)
796 {
797 matchReadNo = (*it).rseqNo;
798 matchReadCount = 0;
799 } else
800 ++matchReadCount;
801
802 ::std::string fastaID;
803 assign(fastaID, readIDs[(*it).rseqNo]);
804
805 ::std::string id = fastaID;
806 int fragId = (*it).rseqNo;
807 bool appendMatchId = options.maxHits > 1;
808
809 size_t left = fastaID.find_first_of('[');
810 size_t right = fastaID.find_last_of(']');
811 if (left != fastaID.npos && right != fastaID.npos && left < right)
812 {
813 fastaID.erase(right);
814 fastaID.erase(0, left + 1);
815 replace(fastaID.begin(), fastaID.end(), ',', ' ');
816 size_t pos = fastaID.find("id=");
817 if (pos != fastaID.npos) {
818 ::std::istringstream iss(fastaID.substr(pos + 3));
819 iss >> id;
820 // appendMatchId = false;
821 }
822 pos = fastaID.find("fragId=");
823 if (pos != fastaID.npos) {
824 ::std::istringstream iss(fastaID.substr(pos + 7));
825 iss >> fragId;
826 }
827 }
828
829 if ((*it).orientation == 'F')
830 // forward strand
831 file << '>' << ((*it).gBegin + options.positionFormat) << _sep_ << (*it).gEnd;
832 else
833 // reverse strand (switch begin and end)
834 file << '>' << (*it).gEnd << _sep_ << ((*it).gBegin + options.positionFormat);
835
836 unsigned ambig = 0;
837 for (unsigned i = 0; i <= (*it).editDist && i < length(stats); ++i)
838 ambig += stats[i][(*it).rseqNo];
839
840 file << "[id=" << id;
841 if (appendMatchId) file << "_" << matchReadCount;
842 file << ",fragId=" << fragId;
843 file << ",contigId=" << genomeIDs[(*it).gseqNo];
844 file << ",errors=" << (*it).editDist << ",percId=" << ::std::setprecision(5) << percId;
845 file << ",ambiguity=" << ambig << ']' << ::std::endl;
846
847 file << reads[(*it).rseqNo] << ::std::endl;
848 }
849 break;
850
851
852 case 2: // Eland Format
853 _sep_ = '\t';
854 for(unsigned readNo = 0; readNo < length(reads); ++readNo)
855 {
856 switch (options.readNaming)
857 {
858 // 0..filename is the read's Fasta id
859 case 0:
860 file << '>' << readIDs[readNo] << _sep_;
861 break;
862
863 // 1..filename is the read filename + seqNo
864 case 1:
865 file.fill('0');
866 file << readName << '#' << ::std::setw(pzeros) << readNo + 1 << _sep_;
867 break;
868 }
869
870 if (it == itEnd || readNo < (*it).rseqNo)
871 {
872 if (!empty(reads[readNo]))
873 file << reads[readNo] << _sep_ << "NM" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << ::std::endl;
874 else
875 {
876 for (unsigned i = 0; i < maxReadLength; ++i)
877 file << '.';
878 file << _sep_ << "QC" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << ::std::endl;
879 }
880 } else
881 {
882 file << reads[readNo] << _sep_;
883 unsigned bestMatches = 1;
884 if ((unsigned)(*it).editDist < length(stats))
885 bestMatches = stats[(*it).editDist][readNo];
886
887 if (bestMatches == 0) file << '?'; // impossible
888 if (bestMatches == 1) file << 'U'; // unique best match
889 if (bestMatches > 1) file << 'R'; // non-unique best matches
890
891 file << (*it).editDist << _sep_ << stats[0][readNo] << _sep_ << stats[1][readNo] << _sep_ << stats[2][readNo];
892
893 if (bestMatches == 1)
894 {
895 file << _sep_;
896 switch (options.genomeNaming)
897 {
898 // 0..filename is the read's Fasta id
899 case 0:
900 file << genomeIDs[(*it).gseqNo];
901 break;
902
903 // 1..filename is the read filename + seqNo
904 case 1:
905 file.fill('0');
906 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1;
907 }
908
909 if ((*it).orientation == 'F')
910 file << _sep_ << ((*it).gBegin + options.positionFormat) << _sep_ << 'F' << _sep_ << "..";
911 else
912 file << _sep_ << (*it).gEnd << _sep_ << 'R' << _sep_ << "..";
913
914 if ((*it).editDist > 0 && options.dumpAlignment && options.hammingOnly)
915 {
916 gInf = infix(genomes[(*it).gseqNo], (*it).gBegin, (*it).gEnd);
917 if ((*it).orientation == 'R')
918 reverseComplement(gInf);
919 for (unsigned i = 0; i < length(gInf); ++i)
920 if ((options.compMask[ordValue(reads[readNo][i])] &
921 options.compMask[ordValue(gInf[i])]) == 0)
922 file << _sep_ << i + 1 << gInf[i];
923 }
924 }
925 file << ::std::endl;
926 ++it;
927 }
928 }
929 break;
930 case 3: // Gff: printf "$chr $name_$format read $pos %ld . $dir . ID=$col[0]$unique$rest\n",$pos+$len-1;
931 for (unsigned filecount = 0; filecount < length(genomeFileNameList); ++filecount)
932 {
933 // open genome file
934 ::std::ifstream gFile;
935 gFile.open(toCString(genomeFileNameList[filecount]), ::std::ios_base::in | ::std::ios_base::binary);
936 if (!gFile.is_open())
937 {
938 std::cerr << "Couldn't open genome file." << std::endl;
939 break;
940 }
941
942 Dna5String currGenome;
943
944 // iterate over genome sequences
945 for(; !_streamEOF(gFile); ++currSeqNo)
946 {
947 read(gFile, currGenome, Fasta()); // read Fasta sequence
948 while(it != itEnd && (*it).gseqNo == currSeqNo)
949 {
950 #ifdef RAZERS_DIRECT_MAQ_MAPPING
951 if(options.maqMapping && options.noBelowIdentity && (*it).seedEditDist > maxSeedErrors)
952 {
953 ++it;
954 continue;
955 }
956 #endif
957
958 unsigned currReadNo = (*it).rseqNo;
959 int unique = 1;
960 unsigned bestMatches = 0;
961
962 #ifdef RAZERS_DIRECT_MAQ_MAPPING
963 if(options.maqMapping)
964 bestMatches = stats[0][currReadNo] >> 5;
965 else
966 #endif
967 {
968 #ifdef RAZERS_SPLICED
969 if (options.minMatchLen > 0)
970 {
971 if( -(*it).pairScore < (int)length(stats))
972 bestMatches = stats[-(*it).pairScore][currReadNo]/2;
973 }
974 else
975 #endif
976 if (bestMatches == 0 && (unsigned)(*it).editDist < length(stats))
977 bestMatches = stats[(*it).editDist][currReadNo];
978
979 }
980
981 bool suboptimal = false;
982 if (
983 #ifdef RAZERS_DIRECT_MAQ_MAPPING
984 !options.maqMapping &&
985 #endif
986 (unsigned)(*it).editDist > 0
987 #ifdef RAZERS_SPLICED
988 && options.minMatchLen == 0
989 #endif
990 )
991 {
992 for(unsigned d = 0; d < (unsigned)(*it).editDist; ++d)
993 if (stats[d][currReadNo]>0) suboptimal=true;
994 }
995 #ifdef RAZERS_SPLICED
996 if(options.minMatchLen > 0 && -(*it).pairScore > 0)
997 {
998 for(unsigned d = 0; d < -(unsigned)(*it).pairScore; ++d)
999 if (stats[d][currReadNo]>0) suboptimal=true;
1000 }
1001 #endif
1002
1003 if (bestMatches != 1)
1004 {
1005 unique = 0;
1006 if(options.purgeAmbiguous)
1007 {
1008 ++it;
1009 continue;
1010 }
1011
1012 // if((*it).mScore > 0) std::cout << (*it).mScore << "<-non uniq but score > 0\n";
1013 // ++it;
1014 // continue; // TODO: output non-unique matches (concerns maq mapping only)
1015 }
1016 unsigned readLen = length(reads[currReadNo]);
1017 #ifdef RAZERS_SPLICED
1018 if(options.minMatchLen > 0)
1019 readLen = (*it).mScore;
1020 #endif
1021 String<Dna5Q> readInf = infix(reads[currReadNo],0,readLen);
1022 #ifdef RAZERS_SPLICED
1023 if(options.minMatchLen > 0)
1024 {
1025 if(((*it).orientation == 'F' && (*it).mateDelta < 0)
1026 || ((*it).orientation == 'R' && (*it).mateDelta > 0))
1027 {
1028 readInf = infix(reads[currReadNo],
1029 length(reads[currReadNo])-readLen,
1030 length(reads[currReadNo]));
1031 }
1032 }
1033 #endif
1034
1035 switch (options.genomeNaming)
1036 {
1037 // 0..filename is the read's Fasta id
1038 case 0:
1039 file << genomeIDs[(*it).gseqNo] <<'\t';
1040 break;
1041 // 1..filename is the read filename + seqNo
1042 case 1:
1043 file.fill('0');
1044 file << gnoToFileMap[(*it).gseqNo].first << '#' << ::std::setw(gzeros) << gnoToFileMap[(*it).gseqNo].second + 1 << '\t';
1045 break;
1046 }
1047
1048 // get alignment to dump or to fix end coordinates
1049 if (!options.hammingOnly)
1050 {
1051 assignSource(row(align, 0), readInf);
1052 assignSource(row(align, 1), infix(currGenome, (*it).gBegin, (*it).gEnd));
1053 if ((*it).orientation == 'R')
1054 reverseComplement(source(row(align, 1)));
1055
1056 #ifdef RAZERS_SPLICED
1057 if(options.minMatchLen > 0)
1058 globalAlignment(align, scoreType, AlignConfig<false,false,false,false>(), Gotoh());
1059 else
1060 #endif
1061 globalAlignment(align, scoreType, AlignConfig<false,true,true,false>(), Gotoh());
1062
1063 // transform first and last read character to genomic positions
1064 viewPosReadFirst = toViewPosition(row(align, 0), 0);
1065 viewPosReadLast = toViewPosition(row(align, 0), readLen - 1);
1066 unsigned genomePosReadFirst = toSourcePosition(row(align, 1), viewPosReadFirst);
1067 unsigned genomePosReadLast = toSourcePosition(row(align, 1), viewPosReadLast);
1068 #ifdef RAZERS_SPLICED
1069 if(options.minMatchLen == 0)
1070 #endif
1071 {
1072 if ((*it).orientation == 'R')
1073 {
1074 (*it).gBegin = (*it).gEnd - (genomePosReadLast + 1);
1075 (*it).gEnd -= genomePosReadFirst;
1076 }
1077 else
1078 {
1079 (*it).gEnd = (*it).gBegin + (genomePosReadLast + 1);
1080 (*it).gBegin += genomePosReadFirst;
1081 }
1082 }
1083 }
1084
1085 //file << options.runID << "_razers\tread";
1086 file << "razers\tread";
1087 file << '\t' << ((*it).gBegin + options.positionFormat) << '\t' << (*it).gEnd << '\t';
1088 double percId = 100.0 * (1.0 - (double)(*it).editDist / (double)readLen);
1089 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1090 if(options.maqMapping)
1091 file << (*it).mScore << "\t";
1092 else
1093 #endif
1094 file << percId << "\t";
1095
1096 if ((*it).orientation == 'F')
1097 file << '+' << '\t' << '.' <<'\t';
1098 else
1099 file << '-' << '\t' << '.' <<'\t';
1100
1101 switch (options.readNaming)
1102 {
1103 // 0..filename is the read's Fasta id
1104 case 0:
1105 file << "ID=" <<readIDs[currReadNo];
1106 break;
1107
1108 // 1..filename is the read filename + seqNo
1109 case 1:
1110 file.fill('0');
1111 file << "ID=" << readName << '#' << ::std::setw(pzeros) << currReadNo + 1;
1112 break;
1113 }
1114 if(suboptimal) file << ";suboptimal";
1115 else
1116 {
1117 if(unique==1) file << ";unique";
1118 if(unique==0) file << ";multi";
1119 }
1120
1121 #ifdef RAZERS_SPLICED
1122 if(options.minMatchLen > 0)
1123 {
1124 file << ";delta=" << (*it).mateDelta << ";pairId="<<(*it).pairId;
1125 file << ";pairErr=" << -(*it).pairScore;
1126 }
1127 #endif
1128
1129 if ((*it).editDist > 0)
1130 {
1131 if (options.hammingOnly)
1132 {
1133 gInf = infix(currGenome, (*it).gBegin, (*it).gEnd);
1134 if ((*it).orientation == 'R')
1135 reverseComplement(gInf);
1136 bool first = true;
1137 file << ";cigar=" << readLen << "M";
1138 file << ";mutations=";
1139 for (unsigned i = 0; i < length(gInf); ++i)
1140 if ((options.compMask[ordValue(readInf[i])] &
1141 options.compMask[ordValue(gInf[i])]) == 0)
1142 {
1143 if(first){ file << i + 1 << (Dna5)readInf[i]; first = false;}
1144 else file <<','<< i + 1 << (Dna5)readInf[i];
1145 }
1146
1147 }
1148 else
1149 {
1150 std::stringstream cigar, mutations;
1151 typedef typename Row<TAlign>::Type TRow;
1152 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
1153 TAlignIterator ali_it0 = iter(row(align,0),viewPosReadFirst);
1154 TAlignIterator ali_it1 = iter(row(align,1),viewPosReadFirst);
1155 TAlignIterator ali_it0stop = iter(row(align,0),viewPosReadLast + 1);
1156 TAlignIterator ali_it1stop = iter(row(align,1),viewPosReadLast + 1);
1157
1158 #ifdef RAZERS_SPLICED
1159 if(options.minMatchLen > 0)
1160 {
1161 ali_it0 = begin(row(align,0));
1162 ali_it1 = begin(row(align,1));
1163 ali_it0stop = end(row(align,0));
1164 ali_it1stop = end(row(align,1));
1165 }
1166 #endif
1167 getCigarLine(align,cigar,mutations,ali_it0,ali_it0stop,ali_it1,ali_it1stop);
1168 file << ";cigar="<<cigar.str();
1169 if(length(mutations.str())>0)
1170 file << ";mutations=" << mutations.str();
1171 }
1172 }
1173
1174
1175 if(
1176 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1177 options.maqMapping ||
1178 #endif
1179 options.fastaIdQual)
1180 {
1181 // file << ";read=";
1182 // for(unsigned j=0;j<length(reads[currReadNo]);++j)
1183 // {
1184 // file << (Dna5)reads[currReadNo][j];
1185 // }
1186 file << ";quality=";
1187 for(unsigned j=0;j<readLen;++j)
1188 {
1189 file << (char)(getQualityValue(readInf[j])+ 33);
1190 }
1191 }
1192
1193 file << ::std::endl;
1194 ++it;
1195 }
1196 }
1197 gFile.close();
1198 ++filecount;
1199 }
1200 break;
1201 }
1202
1203 file.close();
1204
1205 // get empirical error distribution
1206 if (!empty(errorPrbFileName) && maxReadLength > 0)
1207 {
1208 file.open(toCString(errorPrbFileName), ::std::ios_base::out | ::std::ios_base::trunc);
1209 if (file.is_open())
1210 {
1211 String<long double> posError;
1212 unsigned unique = 0;
1213 unsigned insertions = 0;
1214 unsigned deletions = 0;
1215 resize(posError, maxReadLength, 0);
1216
1217 if (options.hammingOnly)
1218 unique = getErrorDistribution(posError, matches, reads, genomes, options);
1219 else
1220 {
1221 unique = getErrorDistribution(posError, insertions, deletions, matches, reads, genomes, options);
1222 ::std::cerr << "insertProb: " << (double)insertions / ((double)length(posError) * (double)unique) << ::std::endl;
1223 ::std::cerr << "deleteProb: " << (double)deletions / ((double)length(posError) * (double)unique) << ::std::endl;
1224 }
1225
1226 file << (double)posError[0] / (double)unique;
1227 for (unsigned i = 1; i < length(posError); ++i)
1228 file << '\t' << (double)posError[i] / (double)unique;
1229 file << ::std::endl;
1230 file.close();
1231 } else
1232 ::std::cerr << "Failed to open error distribution file" << ::std::endl;
1233 }
1234
1235 options.timeDumpResults = SEQAN_PROTIMEDIFF(dump_time);
1236
1237 if (options._debugLevel >= 1)
1238 ::std::cerr << "Dumping results took \t" << options.timeDumpResults << " seconds" << ::std::endl;
1239 }
1240
1241
1242 }
1243
1244 #endif
1245
1246