1 /*==========================================================================
2 RazerS - Fast Read Mapping with Controlled Loss Rate
3 http://www.seqan.de/projects/razers.html
4
5 ============================================================================
6 Copyright (C) 2008 by David Weese
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your options) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ==========================================================================*/
21
22 #ifndef SEQAN_HEADER_OUTPUT_FORMAT_H
23 #define SEQAN_HEADER_OUTPUT_FORMAT_H
24
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28
29 #include "razers.h"
30 #include <seqan/align.h>
31
32 namespace SEQAN_NAMESPACE_MAIN
33 {
34
35 //////////////////////////////////////////////////////////////////////////////
36 // Quality-based score
37
38 template <typename TQualityString = CharString>
39 struct Quality;
40
41 template <typename TValue, typename TQualityString>
42 class Score<TValue, Quality<TQualityString> >
43 {
44 public:
45 TValue data_match;
46 TValue data_mismatch;
47 TValue data_gap_extend;
48 TValue data_gap_open;
49
50 TQualityString const *data_qual;
51
52 public:
Score()53 Score():
54 data_match(0),
55 data_mismatch(-1),
56 data_gap_extend(-1),
57 data_gap_open(-1),
58 data_qual(NULL)
59 {
60 }
Score(TValue _match,TValue _mismatch,TValue _gap)61 Score(TValue _match, TValue _mismatch, TValue _gap):
62 data_match(_match),
63 data_mismatch(_mismatch),
64 data_gap_extend(_gap),
65 data_gap_open(_gap),
66 data_qual(NULL)
67 {
68 }
Score(TValue _match,TValue _mismatch,TValue _gap_extend,TValue _gap_open,TQualityString const & _qual)69 Score(TValue _match, TValue _mismatch, TValue _gap_extend, TValue _gap_open, TQualityString const &_qual):
70 data_match(_match),
71 data_mismatch(_mismatch),
72 data_gap_extend(_gap_extend),
73 data_gap_open(_gap_open),
74 data_qual(&_qual)
75 {
76 }
77
Score(Score const & other)78 Score(Score const & other):
79 data_match(other.data_match),
80 data_mismatch(other.data_mismatch),
81 data_gap_extend(other.data_gap_extend),
82 data_gap_open(other.data_gap_open),
83 data_qual(other.data_qual)
84 {
85 }
~Score()86 ~Score()
87 {
88 }
89
90 Score & operator = (Score const & other)
91 {
92 data_match = other.data_match;
93 data_mismatch = other.data_mismatch;
94 data_gap_extend = other.data_gap_extend;
95 data_gap_open = other.data_gap_open;
96 data_qual = other.data_qual;
97 return *this;
98 }
99 };
100
101 //////////////////////////////////////////////////////////////////////////////
102
103 template <typename TValue, typename TQualityString, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
104 inline TValue
score(Score<TValue,Quality<TQualityString>> const & me,TPos1 pos1,TPos2 pos2,TSeq1 const & seq1,TSeq2 const & seq2)105 score(Score<TValue, Quality<TQualityString> > const & me,
106 TPos1 pos1,
107 TPos2 pos2,
108 TSeq1 const &seq1,
109 TSeq2 const &seq2)
110 {
111 if (seq1[pos1] != seq2[pos2])
112 if (me.data_qual)
113 return (*me.data_qual)[pos2];
114 else
115 return scoreMismatch(me);
116 else
117 return scoreMatch(me);
118 }
119
120
121 //////////////////////////////////////////////////////////////////////////////
122 // Less-operators ...
123
124 // ... to sort matches and remove duplicates with equal beginPos
125 template <typename TAlignedReadStore, typename TAlignedReadQualityStore>
126 struct LessGPosRNo :
127 public ::std::binary_function < typename Value<TAlignedReadStore>::Type, typename Value<TAlignedReadStore>::Type, bool >
128 {
129 typedef typename Value<TAlignedReadStore>::Type TAlignedRead;
130 TAlignedReadQualityStore &qualStore;
131
LessGPosRNoLessGPosRNo132 LessGPosRNo(TAlignedReadQualityStore &_qualStore):
133 qualStore(_qualStore) {}
134
operatorLessGPosRNo135 inline bool operator() (TAlignedRead const &a, TAlignedRead const &b) const
136 {
137 // contig
138 if (a.contigId < b.contigId) return true;
139 if (a.contigId > b.contigId) return false;
140
141 // beginning position
142 typename TAlignedRead::TPos ba = _min(a.beginPos, a.endPos);
143 typename TAlignedRead::TPos bb = _min(b.beginPos, b.endPos);
144 if (ba < bb) return true;
145 if (ba > bb) return false;
146
147 // orientation
148 bool oa = a.beginPos < a.endPos;
149 bool ob = b.beginPos < b.endPos;
150 if (oa != ob) return oa;
151
152 // read number
153 if (a.readId < b.readId) return true;
154 if (a.readId > b.readId) return false;
155
156 // qualities
157 if (a.id == TAlignedRead::INVALID_ID) return false;
158 if (b.id == TAlignedRead::INVALID_ID) return true;
159 typename GetValue<TAlignedReadQualityStore>::Type qa = getValue(qualStore, a.id);
160 typename GetValue<TAlignedReadQualityStore>::Type qb = getValue(qualStore, b.id);
161 if (qa.pairScore > qb.pairScore) return true;
162 if (qa.pairScore < qb.pairScore) return false;
163 return qa.score > qb.score;
164 }
165 };
166
167 //////////////////////////////////////////////////////////////////////////////
168 // Determine error distribution
169 template <typename TErrDistr, typename TFragmentStore, typename TOptions>
170 inline unsigned
getErrorDistribution(TErrDistr & posError,TFragmentStore & store,TOptions & options)171 getErrorDistribution(
172 TErrDistr &posError,
173 TFragmentStore &store,
174 TOptions &options)
175 {
176 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore;
177 typedef typename Value<TAlignedReadStore>::Type TAlignedRead;
178 typedef typename TFragmentStore::TContigPos TContigPos;
179
180 typename Iterator<TAlignedReadStore, Standard>::Type it = begin(store.alignedReadStore, Standard());
181 typename Iterator<TAlignedReadStore, Standard>::Type itEnd = end(store.alignedReadStore, Standard());
182
183 Dna5String genome;
184 TContigPos left, right;
185 unsigned unique = 0;
186
187 for (; it != itEnd; ++it)
188 {
189 if ((*it).id == TAlignedRead::INVALID_ID) continue;
190
191 Dna5String const &read = store.readSeqStore[(*it).readId];
192 left = (*it).beginPos;
193 right = (*it).endPos;
194
195 if (left < right)
196 genome = infix(store.contigStore[(*it).contigId].seq, left, right);
197 else
198 {
199 genome = infix(store.contigStore[(*it).contigId].seq, right, left);
200 reverseComplement(genome);
201 }
202 for (unsigned i = 0; i < length(posError) && i < length(read); ++i)
203 if ((options.compMask[ordValue(genome[i])] & options.compMask[ordValue(read[i])]) == 0)
204 ++posError[i];
205 ++unique;
206 }
207 return unique;
208 }
209
210 template <typename TErrDistr, typename TCount1, typename TCount2, typename TFragmentStore, typename TSpec>
211 inline unsigned
getErrorDistribution(TErrDistr & posError,TCount1 & insertions,TCount2 & deletions,TFragmentStore & store,RazerSOptions<TSpec> & options)212 getErrorDistribution(
213 TErrDistr &posError,
214 TCount1 &insertions,
215 TCount2 &deletions,
216 TFragmentStore &store,
217 RazerSOptions<TSpec> &options)
218 {
219 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore;
220 typedef typename Value<TAlignedReadStore>::Type TAlignedRead;
221 typedef typename TFragmentStore::TContigPos TContigPos;
222
223 typedef Align<String<Dna5>, ArrayGaps> TAlign;
224 typedef typename Row<TAlign>::Type TRow;
225 typedef typename Iterator<TRow>::Type TIter;
226
227 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
228 typedef typename Position<TAlign>::Type TPosition;
229
230 typename Iterator<TAlignedReadStore, Standard>::Type it = begin(store.alignedReadStore, Standard());
231 typename Iterator<TAlignedReadStore, Standard>::Type itEnd = end(store.alignedReadStore, Standard());
232
233 Align<Dna5String, ArrayGaps> align;
234 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
235 if (options.hammingOnly)
236 scoreType.data_mismatch = -1;
237 resize(rows(align), 2);
238
239 unsigned unique = 0;
240 for (; it != itEnd; ++it)
241 {
242 if ((*it).id == TAlignedRead::INVALID_ID) continue;
243
244 assignSource(row(align, 0), store.readSeqStore[(*it).readId]);
245 TContigPos left = (*it).beginPos;
246 TContigPos right = (*it).endPos;
247
248 if (left < right)
249 assignSource(row(align, 1), infix(store.contigStore[(*it).contigId].seq, left, right));
250 else
251 {
252 assignSource(row(align, 1), infix(store.contigStore[(*it).contigId].seq, right, left));
253 reverseComplement(source(row(align, 1)));
254 }
255 globalAlignment(align, scoreType);
256
257 TRow& row0 = row(align, 0);
258 TRow& row1 = row(align, 1);
259
260 TPosition begin = beginPosition(cols(align));
261 TPosition end = endPosition(cols(align));
262
263 TIter it0 = iter(row0, begin);
264 TIter it1 = iter(row1, begin);
265 TIter end0 = iter(row0, end);
266
267 unsigned pos = 0;
268 for (; it0 != end0 && pos < length(posError); ++it0, ++it1)
269 {
270 if (isGap(it0))
271 ++insertions;
272 else
273 {
274 if (isGap(it1))
275 ++deletions;
276 else
277 if ((options.compMask[ordValue(getValue(it0))] & options.compMask[ordValue(getValue(it1))]) == 0)
278 ++posError[pos];
279 ++pos;
280 }
281 }
282 ++unique;
283 }
284 return unique;
285 }
286
287
288 //////////////////////////////////////////////////////////////////////////////
289 // Dump an alignment
290 template <typename TFile, typename TSource, typename TSpec>
291 inline void
dumpAlignment(TFile & target,Align<TSource,TSpec> const & source)292 dumpAlignment(TFile & target, Align<TSource, TSpec> const & source)
293 {
294 typedef Align<TSource, TSpec> const TAlign;
295 typedef typename Row<TAlign>::Type TRow;
296 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
297 typedef typename Position<TAlign>::Type TPosition;
298
299 TRowsPosition row_count = length(rows(source));
300 TPosition begin_ = beginPosition(cols(source));
301 TPosition end_ = endPosition(cols(source));
302
303 // Print sequences
304 for(TRowsPosition i=0;i<row_count;++i) {
305 if (i == 0)
306 _streamWrite(target, "#Read: ");
307 else
308 _streamWrite(target, "#Genome: ");
309 TRow& row_ = row(source, i);
310 typedef typename Iterator<typename Row<TAlign>::Type const>::Type TIter;
311 TIter begin1_ = iter(row_, begin_);
312 TIter end1_ = iter(row_, end_);
313 for (; begin1_ != end1_; ++begin1_) {
314 if (isGap(begin1_)) _streamPut(target, gapValue<char>());
315 else _streamPut(target, *begin1_);
316 }
317 _streamPut(target, '\n');
318 }
319 }
320
321 template<typename TMatches, typename TCounts, typename TOptions>
322 void
countCoocurrences(TMatches & matches,TCounts & cooc,TOptions & options)323 countCoocurrences(TMatches & matches, TCounts & cooc, TOptions & options)
324 {
325 clear(cooc);
326 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength) + 1;
327 resize(cooc,maxSeedErrors+1,0);
328 for (int i = 0; i < maxSeedErrors+1; ++i)
329 cooc[i] = 1;
330
331 int count = 0;
332 unsigned readNo = -1;
333 int preEditDist = -1;
334 typename Iterator<TMatches>::Type it = begin(matches,Standard());
335 typename Iterator<TMatches>::Type itEnd = end(matches,Standard());
336
337 for(; it != itEnd; ++it)
338 {
339 if ((*it).readId == readNo)
340 {
341 if(preEditDist > 1) continue;// || dist > options.errorRate * maxReadLength + 1) continue;
342 int dist = (*it).seedEditDist - preEditDist;
343 if(dist > maxSeedErrors) continue;
344 if(dist < 0) ++cooc[0];
345 else ++cooc[dist];
346 }
347 else
348 {
349 readNo = (*it).readId;
350 preEditDist = (*it).seedEditDist;
351 if(preEditDist <= 1) ++count;
352 }
353 }
354 for (unsigned i = 0; i < length(cooc); ++i)
355 {
356 cooc[i] = (int)(-4.343 * log((double)cooc[i]/count) );
357 if (cooc[i] < 0) cooc[i] = 0;
358 }
359 if(options._debugLevel > 1)
360 {
361 std::cerr << "[mapping_count] ";
362 for(unsigned j = 0; j < length(cooc); ++j)
363 std::cerr << cooc[j] << " ";
364 std::cerr << std::endl;
365 }
366
367 }
368
369
370 template<typename TAlign, typename TString>
371 void
getCigarLine(TAlign & align,TString & cigar,TString & mutations)372 getCigarLine(TAlign & align, TString & cigar, TString & mutations)
373 {
374
375 typedef typename Source<TAlign>::Type TSource;
376 typedef typename Iterator<TSource, Rooted>::Type TStringIterator;
377
378 typedef typename Row<TAlign>::Type TRow;
379 typedef typename Iterator<TRow, Rooted>::Type TAlignIterator;
380
381 TAlignIterator ali_it0_stop = iter(row(align,0),endPosition(cols(align)));
382 TAlignIterator ali_it1_stop = iter(row(align,1),endPosition(cols(align)));
383 TAlignIterator ali_it0 = iter(row(align,0),beginPosition(cols(align)));
384 TAlignIterator ali_it1 = iter(row(align,1),beginPosition(cols(align)));
385 TStringIterator readBase = begin(source(row(align,0)));
386 //std::cout << "getting cigar line\n";//ali0 len = " <<ali_it0_stop-ali_it0 << " \t ali1 len = "<<ali_it1_stop-ali_it1<<"\n";
387 int readPos = 0;
388 bool first = true;
389 while(ali_it0 != ali_it0_stop && ali_it1 != ali_it1_stop)
390 {
391 int matched = 0;
392 int inserted = 0;
393 int deleted = 0;
394 while(ali_it0!=ali_it0_stop && ali_it1!=ali_it1_stop && !isGap(ali_it0)&& !isGap(ali_it1))
395 {
396 ++readPos;
397 if(*ali_it1 != *ali_it0)
398 {
399 if(first) first = false;
400 else mutations << ",";
401 mutations << readPos <<*readBase;
402 }
403 ++readBase;
404 ++ali_it0;
405 ++ali_it1;
406 ++matched;
407 }
408 if(matched>0) cigar << matched<< "M" ;
409 while(ali_it0!=ali_it0_stop && isGap(ali_it0))
410 {
411 ++ali_it0;
412 ++ali_it1;
413 ++deleted;
414 }
415 if(deleted>0) cigar << deleted << "D";
416 while(isGap(ali_it1)&& ali_it1!=ali_it1_stop)
417 {
418 ++ali_it0;
419 ++ali_it1;
420 ++readPos;
421 if(first) first = false;
422 else mutations << ",";
423 mutations << readPos << *readBase;
424 ++readBase;
425 ++inserted;
426 }
427 if(inserted>0) cigar << inserted << "I";
428 }
429
430 }
431
432
433 #ifdef RAZERS_DIRECT_MAQ_MAPPING
434 //////////////////////////////////////////////////////////////////////////////
435 // Assign mapping quality and remove suboptimal matches
436 template < typename TMatches, typename TReads, typename TCooc, typename TCounts, typename TSpec >
assignMappingQuality(TMatches & matches,TReads & reads,TCooc & cooc,TCounts & cnts,RazerSOptions<TSpec> & options)437 void assignMappingQuality(TMatches &matches, TReads & reads, TCooc & cooc, TCounts &cnts, RazerSOptions<TSpec> & options)
438 {
439 typedef typename Value<TMatches>::Type TMatch;
440 typedef typename Iterator<TMatches, Standard>::Type TIterator;
441
442 //matches are already sorted
443 //std::sort(
444 // begin(matches, Standard()),
445 // end(matches, Standard()),
446 // LessRNoMQ<TMatch>());
447
448
449 int maxSeedErrors = (int)(options.errorRate*options.artSeedLength)+1;
450 unsigned readNo = -1;
451
452 TIterator it = begin(matches, Standard());
453 TIterator itEnd = end(matches, Standard());
454 TIterator dit = it;
455
456 int bestQualSum, secondBestQualSum;
457 int secondBestDist = -1 ,secondBestMatches = -1;
458 for (; it != itEnd; ++it)
459 {
460 if ((*it).orientation == '-') continue;
461 bool mappingQualityFound = false;
462 int mappingQuality = 0;
463 int qualTerm1,qualTerm2;
464
465 readNo = (*it).readId;
466 bestQualSum = (*it).mScore;
467
468 if(++it!=itEnd && (*it).readId==readNo)
469 {
470 secondBestQualSum = (*it).mScore;
471 secondBestDist = (*it).editDist;
472 secondBestDist = (*it).editDist;
473 secondBestMatches = cnts[1][readNo] >> 5;
474 //CHECKcnts secondBestMatches = cnts[secondBestDist][readNo];
475 // secondBestMatches = cnts[secondBestDist][readNo];
476 (*it).orientation = '-';
477 // if(secondBestDist<=bestDist) unique=0;
478 }
479 else secondBestQualSum = -1000;
480 --it; //it is now at best match of current readId
481
482 int bestDist = (*it).editDist;
483 int kPrime = (*it).seedEditDist;
484 if((bestQualSum==secondBestQualSum) || (kPrime>maxSeedErrors))
485 mappingQualityFound = true; //mq=0
486 else{
487 if(secondBestQualSum == -1000) qualTerm1 = 99;
488 else
489 {
490 qualTerm1 = (int)(secondBestQualSum - bestQualSum - 4.343 * log((double)secondBestMatches));
491 //if (secondBestKPrime - kPrime <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual; //TODO abchecken was mehr sinn macht
492 if (secondBestDist - bestDist <= 1 && qualTerm1 > options.mutationRateQual) qualTerm1 = options.mutationRateQual;
493 }
494 float avgSeedQual = 0.0;
495 if(!mappingQualityFound)
496 {
497 //TODO!!! generalize and adapt to razers lossrates
498 // lossrate 0.42 => -10 log10(0.42) = 4
499 int kPrimeLoss = 4; // options.kPrimeLoss; // bezieht sich auf 3 fehler in 28bp
500 qualTerm2 = kPrimeLoss + cooc[maxSeedErrors-kPrime];
501
502 for(unsigned j = 0; j<options.artSeedLength; ++j)
503 {
504 int q = getQualityValue(store.readSeqStore[readNo][j]);//(int)((unsigned char)(store.readSeqStore[readNo][j])>>3);
505 if(q>options.mutationRateQual) q = options.mutationRateQual;
506 avgSeedQual+=q;
507 }
508 avgSeedQual/=options.artSeedLength;
509 //-10 log10(28-2) = 14;
510 //generalize to -10 log10(artSeedLength - maxSeedErrors +1 ) // 14 fits for seedlength 28 to 32 with 2 errors
511 if(avgSeedQual>14) qualTerm2 += (int)((maxSeedErrors-kPrime)*(avgSeedQual-14));
512 }
513 }
514 if (!mappingQualityFound) mappingQuality = (qualTerm1<qualTerm2) ? qualTerm1:qualTerm2;
515 if (mappingQuality < 0) mappingQuality = 0;
516 (*it).mScore = mappingQuality;
517
518 *dit = *it;
519 // if(secondBestQualSum != -1000) ++it;
520 ++dit;
521 }
522 resize(matches, dit - begin(matches, Standard()));
523 }
524 #endif
525
526
527 //////////////////////////////////////////////////////////////////////////////
528 // Output matches
529 template <
530 typename TFSSpec,
531 typename TFSConfig,
532 typename TGNoToFile,
533 typename TCounts,
534 typename TSpec
535 >
dumpMatches(FragmentStore<TFSSpec,TFSConfig> & store,StringSet<CharString> & genomeFileNameList,String<TGNoToFile> & gnoToFileMap,TCounts & stats,CharString readFName,CharString errorPrbFileName,RazerSOptions<TSpec> & options)536 void dumpMatches(
537 FragmentStore<TFSSpec, TFSConfig> &store, // forward/reverse matches
538 StringSet<CharString> &genomeFileNameList, // list of genome names (e.g. {"hs_ref_chr1.fa","hs_ref_chr2.fa"})
539 String<TGNoToFile> &gnoToFileMap, // map to retrieve genome filename and sequence number within that file
540 TCounts & stats, // Match statistics (possibly empty)
541 CharString readFName, // read name (e.g. "reads.fa"), used for file/read naming
542 CharString errorPrbFileName,
543 RazerSOptions<TSpec> &options)
544 {
545 typedef FragmentStore<TFSSpec, TFSConfig> TFragmentStore;
546 typedef typename TFragmentStore::TAlignedReadStore TAlignedReadStore;
547 typedef typename TFragmentStore::TAlignQualityStore TAlignQualityStore;
548 typedef typename TFragmentStore::TContigPos TContigPos;
549 typedef typename Value<TAlignedReadStore>::Type TAlignedRead;
550 typedef typename Iterator<TAlignedReadStore, Standard>::Type TAlignedReadIter;
551 typedef typename Id<TAlignedRead>::Type TId;
552 typedef typename GetValue<TAlignQualityStore>::Type TQuality;
553 typedef typename Value<TGenomeSet>::Type TGenome;
554 typedef typename TFragmentStore::TContigPos TGPos;
555
556 if (options.outputFormat == 2)
557 {
558 options.sortOrder = 0; // ... to count multiple matches
559 }
560
561 if (options.outputFormat == 2)
562 {
563 options.maxHits = 1; // Eland outputs at most one match
564 options.sortOrder = 0; // read numbers are increasing
565 options.positionFormat = 1; // bases in file are numbered starting at 1
566 options.dumpAlignment = options.hammingOnly;
567 }
568 #ifdef RAZERS_DIRECT_MAQ_MAPPING
569 if (options.maqMapping) options.outputFormat = 3;
570 int maxSeedErrors = (int)(options.errorRate * options.artSeedLength); //without + 1 here, used to check whether match should be supported if noBelowIdentity option is switched on
571 #endif
572 if (options.outputFormat == 3)
573 {
574 options.sortOrder = 1; // sort according to gPos
575 options.positionFormat = 1; // bases in file are numbered starting at 1
576 options.dumpAlignment = false;
577 }
578
579
580 // error profile
581 unsigned maxReadLength = 0;
582 for (unsigned i = 0; i < length(store.readSeqStore); ++i)
583 if (maxReadLength < length(store.readSeqStore[i]))
584 maxReadLength = length(store.readSeqStore[i]);
585
586 SEQAN_PROTIMESTART(dump_time);
587
588 // load Genome sequences for alignment dumps
589 TGenomeSet genomes;
590 if (options.dumpAlignment || options.outputFormat == 4 || options.outputFormat == 5 || !empty(errorPrbFileName))
591 if (!loadContigs(store, genomeFileNameList)) {
592 std::cerr << "Failed to load genomes" << std::endl;
593 options.dumpAlignment = false;
594 }
595
596 // how many 0's should be padded?
597 int pzeros = 0;
598 for (unsigned l = length(store.readSeqStore); l > 9; l = l / 10)
599 ++pzeros;
600
601 int gzeros = 0;
602 for (unsigned l = length(store.contigStore); l > 9; l = l / 10)
603 ++gzeros;
604
605 // remove the directory prefix of readFName
606 std::string _readName;
607 assign(_readName, readFName);
608 size_t lastPos = _readName.find_last_of('/') + 1;
609 if (lastPos == _readName.npos) lastPos = _readName.find_last_of('\\') + 1;
610 if (lastPos == _readName.npos) lastPos = 0;
611 CharString readName = _readName.substr(lastPos);
612
613
614 Align<String<Dna5>, ArrayGaps> align;
615 Score<int> scoreType = Score<int>(0, -999, -1001, -1000); // levenshtein-score (match, mismatch, gapOpen, gapExtend)
616
617 if (options.hammingOnly)
618 scoreType.data_mismatch = -1;
619 resize(rows(align), 2);
620
621 std::ofstream file;
622 CharString fileName = options.output;
623 if (empty(fileName))
624 {
625 fileName = readFName;
626 append(fileName, ".result");
627 }
628
629 file.open(toCString(fileName), std::ios_base::out | std::ios_base::trunc);
630 if (!file.is_open()) {
631 std::cerr << "Failed to open output file" << std::endl;
632 return;
633 }
634
635
636 #ifndef RAZERS_DONTMASKDUPLICATES
637 maskDuplicates(store);
638 #endif
639 if (options.outputFormat > 0
640 #ifdef RAZERS_DIRECT_MAQ_MAPPING
641 && !options.maqMapping
642 #endif
643 )
644 {
645 // match statistics
646 unsigned maxErrors = (int)(options.errorRate * maxReadLength);
647 if (maxErrors > 10) maxErrors = 10;
648 resize(stats, maxErrors + 1);
649 for (unsigned i = 0; i <= maxErrors; ++i)
650 resize(stats[i], length(store.readStore), 0);
651 countMatches(store, stats);
652 }
653
654 Nothing nothing;
655 unsigned curreadId = 0;
656 #ifdef RAZERS_DIRECT_MAQ_MAPPING
657 if(options.maqMapping)
658 {
659 String<int> cooc;
660 compactMatches(matches, stats, options, true, nothing, false); //only best two matches per read are kept
661 countCoocurrences(matches,cooc,options); //coocurrence statistics are filled
662 assignMappingQuality(matches,reads,cooc,stats,options);//mapping qualities are assigned and only one match per read is kept
663 }
664 else
665 #endif
666
667 #ifdef RAZERS_MICRO_RNA
668 if(options.microRNA)purgeAmbiguousRnaMatches(store,options);
669 else
670 #endif
671 compactMatches(store, stats, options, true, nothing);
672
673 String<int> libSize; // store outer library size for each pair match (indexed by pairMatchId)
674 calculateInsertSizes(libSize, store);
675
676 switch (options.sortOrder) {
677 case 0:
678 sortAlignedReads(
679 store.alignedReadStore,
680 LessRNoGPos<TAlignedReadStore, TAlignQualityStore>(store.alignQualityStore));
681 break;
682
683 case 1:
684 sortAlignedReads(
685 store.alignedReadStore,
686 LessGPosRNo<TAlignedReadStore, TAlignQualityStore>(store.alignQualityStore));
687 break;
688
689 }
690
691 TAlignedReadIter it = begin(store.alignedReadStore, Standard());
692 TAlignedReadIter itEnd = end(store.alignedReadStore, Standard());
693
694 Dna5String gInf;
695 char _sep_ = '\t';
696
697 switch (options.outputFormat)
698 {
699 case 0: // Razer Format
700 // _sep_ = ',';
701 for(; it != itEnd; ++it)
702 {
703 TQuality qual = getValue(store.alignQualityStore, (*it).id);
704 unsigned readLen = length(store.readSeqStore[(*it).readId]);
705 double percId = 100.0 * (1.0 - (double)qual.errors / (double)readLen);
706 #ifdef RAZERS_MICRO_RNA
707 percId = 100.0 * (1.0 - (double)qual.errors / (double) ((*it).mScore));
708 #endif
709
710 switch (options.readNaming)
711 {
712 // 0..filename is the read's Fasta id
713 case 0:
714 case 3: // same as 0 if non-paired
715 file << store.readNameStore[(*it).readId];
716 break;
717
718 // 1..filename is the read filename + seqNo
719 case 1:
720 file.fill('0');
721 file << readName << '#' << std::setw(pzeros) << (*it).readId + 1;
722 break;
723
724 // 2..filename is the read sequence itself
725 case 2:
726 file << store.readSeqStore[(*it).readId];
727 }
728
729 file << _sep_ << options.positionFormat << _sep_ << readLen << _sep_ << (((*it).beginPos < (*it).endPos)? 'F': 'R') << _sep_;
730
731 switch (options.genomeNaming)
732 {
733 // 0..filename is the read's Fasta id
734 case 0:
735 file << store.contigNameStore[(*it).contigId];
736 break;
737
738 // 1..filename is the read filename + seqNo
739 case 1:
740 file.fill('0');
741 file << gnoToFileMap[(*it).contigId].i1 << '#' << std::setw(gzeros) << gnoToFileMap[(*it).contigId].i2 + 1;
742 }
743
744 if ((*it).beginPos < (*it).endPos)
745 file << _sep_ << ((*it).beginPos + options.positionFormat) << _sep_ << (*it).endPos << _sep_ << std::setprecision(5) << percId;
746 else
747 file << _sep_ << ((*it).endPos + options.positionFormat) << _sep_ << (*it).beginPos << _sep_ << std::setprecision(5) << percId;
748 #ifdef RAZERS_MICRO_RNA
749 if(options.microRNA) file << _sep_ << (*it).mScore;
750 #endif
751
752 if ((*it).pairMatchId != TAlignedRead::INVALID_ID)
753 {
754 file << _sep_ << (*it).pairMatchId << _sep_ << (int)store.alignQualityStore[(*it).id].pairScore << _sep_;
755 unsigned char no = getMateNo(store, (*it).readId);
756 if (no == 0) file << libSize[(*it).pairMatchId];
757 if (no == 1) file << -libSize[(*it).pairMatchId];
758 }
759 file << std::endl;
760
761 if (options.dumpAlignment)
762 {
763 #ifdef RAZERS_MICRO_RNA
764 if(options.microRNA)
765 assignSource(row(align, 0), prefix(store.readSeqStore[(*it).readId],(*it).mScore));
766 else
767 #endif
768 assignSource(row(align, 0), store.readSeqStore[(*it).readId]);
769
770 TContigPos left = (*it).beginPos;
771 TContigPos right = (*it).endPos;
772
773 if (left < right)
774 assignSource(row(align, 1), infix(store.contigStore[(*it).contigId].seq, left, right));
775 else
776 {
777 assignSource(row(align, 1), infix(store.contigStore[(*it).contigId].seq, right, left));
778 reverseComplement(source(row(align, 1)));
779 }
780 globalAlignment(align, scoreType);
781 dumpAlignment(file, align);
782 }
783
784 }
785 break;
786
787
788 case 1: // Enhanced Fasta Format
789 _sep_ = ',';
790 for(unsigned matchReadNo = -1, matchReadCount = 0; it != itEnd; ++it)
791 {
792 TQuality qual = getValue(store.alignQualityStore, (*it).id);
793 unsigned readLen = length(store.readSeqStore[(*it).readId]);
794 double percId = 100.0 * (1.0 - (double)qual.errors / (double)readLen);
795
796 if (matchReadNo != (*it).readId)
797 {
798 matchReadNo = (*it).readId;
799 matchReadCount = 0;
800 } else
801 ++matchReadCount;
802
803 std::string fastaID;
804 assign(fastaID, store.readNameStore[(*it).readId]);
805
806 std::string id = fastaID;
807 int fragId = (*it).readId;
808 bool appendMatchId = options.maxHits > 1;
809
810 size_t left = fastaID.find_first_of('[');
811 size_t right = fastaID.find_last_of(']');
812 if (left != fastaID.npos && right != fastaID.npos && left < right)
813 {
814 fastaID.erase(right);
815 fastaID.erase(0, left + 1);
816 replace(fastaID.begin(), fastaID.end(), ',', ' ');
817 size_t pos = fastaID.find("id=");
818 if (pos != fastaID.npos) {
819 std::istringstream iss(fastaID.substr(pos + 3));
820 iss >> id;
821 // appendMatchId = false;
822 }
823 pos = fastaID.find("fragId=");
824 if (pos != fastaID.npos) {
825 std::istringstream iss(fastaID.substr(pos + 7));
826 iss >> fragId;
827 }
828 }
829
830 if ((*it).beginPos < (*it).endPos)
831 // forward strand
832 file << '>' << ((*it).beginPos + options.positionFormat) << _sep_ << (*it).endPos;
833 else
834 // reverse strand (switch begin and end)
835 file << '>' << (*it).beginPos << _sep_ << ((*it).endPos + options.positionFormat);
836
837 unsigned ambig = 0;
838 for (unsigned i = 0; i <= qual.errors && i < length(stats); ++i)
839 ambig += stats[i][(*it).readId];
840
841 file << "[id=" << id;
842 if (appendMatchId) file << "_" << matchReadCount;
843 file << ",fragId=" << fragId;
844 file << ",contigId=" << store.contigNameStore[(*it).contigId];
845 file << ",errors=" << (unsigned)qual.errors << ",percId=" << std::setprecision(5) << percId;
846 file << ",ambiguity=" << ambig << ']' << std::endl;
847
848 file << store.readSeqStore[(*it).readId] << std::endl;
849 }
850 break;
851
852
853 case 2: // Eland Format
854 _sep_ = '\t';
855 for(unsigned readNo = 0; readNo < length(store.readSeqStore); ++readNo)
856 {
857 TQuality qual = getValue(store.alignQualityStore, (*it).id);
858
859 switch (options.readNaming)
860 {
861 // 0..filename is the read's Fasta id
862 case 0:
863 case 3: // same as 0 if non-paired
864 file << '>' << store.readNameStore[readNo] << _sep_;
865 break;
866
867 // 1..filename is the read filename + seqNo
868 case 1:
869 file.fill('0');
870 file << readName << '#' << std::setw(pzeros) << readNo + 1 << _sep_;
871 break;
872 }
873
874 if (it == itEnd || readNo < (*it).readId)
875 {
876 if (!empty(store.readSeqStore[readNo]))
877 file << store.readSeqStore[readNo] << _sep_ << "NM" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << std::endl;
878 else
879 {
880 for (unsigned i = 0; i < maxReadLength; ++i)
881 file << '.';
882 file << _sep_ << "QC" << _sep_ << '0' << _sep_ << '0' << _sep_ << '0' << std::endl;
883 }
884 } else
885 {
886 file << store.readSeqStore[readNo] << _sep_;
887 unsigned bestMatches = 1;
888 if ((unsigned)qual.errors < length(stats))
889 bestMatches = stats[qual.errors][readNo];
890
891 if (bestMatches == 0) file << '?'; // impossible
892 if (bestMatches == 1) file << 'U'; // unique best match
893 if (bestMatches > 1) file << 'R'; // non-unique best matches
894
895 file << (unsigned)qual.errors << _sep_ << stats[0][readNo] << _sep_ << stats[1][readNo] << _sep_ << stats[2][readNo];
896
897 if (bestMatches == 1)
898 {
899 file << _sep_;
900 switch (options.genomeNaming)
901 {
902 // 0..filename is the read's Fasta id
903 case 0:
904 file << store.contigNameStore[(*it).contigId];
905 break;
906
907 // 1..filename is the read filename + seqNo
908 case 1:
909 file.fill('0');
910 file << gnoToFileMap[(*it).contigId].i1 << '#' << std::setw(gzeros) << gnoToFileMap[(*it).contigId].i2 + 1;
911 }
912
913 if ((*it).beginPos < (*it).endPos)
914 file << _sep_ << ((*it).beginPos + options.positionFormat) << _sep_ << 'F' << _sep_ << "..";
915 else
916 file << _sep_ << (*it).beginPos << _sep_ << 'R' << _sep_ << "..";
917
918 if (qual.errors > 0 && options.dumpAlignment && options.hammingOnly)
919 {
920 TContigPos left = (*it).beginPos;
921 TContigPos right = (*it).endPos;
922
923 if (left < right)
924 gInf = infix(store.contigStore[(*it).contigId].seq, left, right);
925 else
926 {
927 gInf = infix(store.contigStore[(*it).contigId].seq, right, left);
928 reverseComplement(gInf);
929 }
930 for (unsigned i = 0; i < length(gInf); ++i)
931 if ((options.compMask[ordValue(store.readSeqStore[readNo][i])] &
932 options.compMask[ordValue(gInf[i])]) == 0)
933 file << _sep_ << i + 1 << gInf[i];
934 }
935 }
936 file << std::endl;
937 ++it;
938 }
939 }
940 break;
941 case 3: // Gff: printf "$chr $name_$format read $pos %ld . $dir . ID=$col[0]$unique$rest\n",$pos+$len-1;
942 for (unsigned filecount = 0; filecount < length(genomeFileNameList); ++filecount)
943 {
944 TQuality qual = getValue(store.alignQualityStore, (*it).id);
945
946 // open genome file
947 std::ifstream gFile;
948 gFile.open(toCString(genomeFileNameList[filecount]), std::ios_base::in | std::ios_base::binary);
949 if (!gFile.is_open())
950 {
951 std::cerr << "Couldn't open genome file." << std::endl;
952 break;
953 }
954
955 Dna5String currGenome;
956
957 // iterate over genome sequences
958 for(; !_streamEOF(gFile); ++curreadId)
959 {
960 read(gFile, currGenome, Fasta()); // read Fasta sequence
961 while(it != itEnd && (*it).contigId == curreadId)
962 {
963 #ifdef RAZERS_DIRECT_MAQ_MAPPING
964 if(options.maqMapping && options.noBelowIdentity && (*it).seedEditDist > maxSeedErrors)
965 {
966 ++it;
967 continue;
968 }
969 #endif
970
971 unsigned currReadNo = (*it).readId;
972 int unique = 1;
973 unsigned bestMatches = 0;
974 //would seedEditDist make more sense here?
975 //CHECKcnts if ((unsigned)qual.errors < length(stats))
976 // bestMatches = stats[qual.errors][currReadNo];
977 #ifdef RAZERS_DIRECT_MAQ_MAPPING
978 if(options.maqMapping)
979 bestMatches = stats[0][currReadNo] >> 5;
980 else
981 #endif
982 if (bestMatches == 0 && (unsigned)qual.errors < length(stats))
983 bestMatches = stats[qual.errors][currReadNo];
984
985 bool suboptimal = false;
986 if (
987 #ifdef RAZERS_DIRECT_MAQ_MAPPING
988 !options.maqMapping &&
989 #endif
990 (unsigned)qual.errors > 0)
991 {
992 for(unsigned d = 0; d < (unsigned)qual.errors; ++d)
993 if (stats[d][currReadNo]>0) suboptimal=true;
994 }
995 //std::cout << (stats[0][currReadNo] & 31) <<"<-dist "<< (stats[0][currReadNo] >> 5) <<"<-count\n";
996 // std::cout << "hier1\n";
997 if (bestMatches != 1)
998 {
999 unique = 0;
1000 if(options.purgeAmbiguous)
1001 {
1002 ++it;
1003 continue;
1004 }
1005
1006 // if((*it).mScore > 0) std::cout << (*it).mScore << "<-non uniq but score > 0\n";
1007 // ++it;
1008 // continue; // TODO: output non-unique matches
1009 }
1010 // std::cout << "hier2\n";
1011 unsigned readLen = length(store.readSeqStore[currReadNo]);
1012
1013 switch (options.genomeNaming)
1014 {
1015 // 0..filename is the read's Fasta id
1016 case 0:
1017 file << store.contigNameStore[(*it).contigId] <<'\t';
1018 break;
1019 // 1..filename is the read filename + seqNo
1020 case 1:
1021 file.fill('0');
1022 file << gnoToFileMap[(*it).contigId].i1 << '#' << std::setw(gzeros) << gnoToFileMap[(*it).contigId].i2 + 1 << '\t';
1023 break;
1024 }
1025 // std::cout << "hier3\n";
1026 //file << options.runID << "_razers\tread";
1027 file << "razers\tread\t";
1028 if ((*it).beginPos < (*it).endPos)
1029 file << ((*it).beginPos + options.positionFormat) << '\t' << (*it).endPos << '\t';
1030 else
1031 file << ((*it).endPos + options.positionFormat) << '\t' << (*it).beginPos << '\t';
1032 // if ((*it).orientation == 'F')
1033 // file << '\t' << ((*it).beginPos + options.positionFormat) << '\t' << (*it).endPos <<'\t';
1034 // else
1035 // file << '\t' << (*it).endPos << '\t'<<((*it).beginPos + options.positionFormat)<< '\t';
1036 double percId = 100.0 * (1.0 - (double)qual.errors / (double)readLen);
1037 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1038 if(options.maqMapping)
1039 file << (*it).mScore << "\t";
1040 else
1041 #endif
1042 file << percId << "\t";
1043 // std::cout << "hier4\n";
1044
1045 if ((*it).beginPos < (*it).endPos)
1046 file << '+' << '\t' << '.' <<'\t';
1047 else
1048 file << '-' << '\t' << '.' <<'\t';
1049
1050 switch (options.readNaming)
1051 {
1052 // 0..filename is the read's Fasta id
1053 case 0:
1054 case 3: // same as 0 if non-paired
1055 file << "ID=" <<store.readNameStore[currReadNo];
1056 break;
1057
1058 // 1..filename is the read filename + seqNo
1059 case 1:
1060 file.fill('0');
1061 file << "ID=" << readName << '#' << std::setw(pzeros) << currReadNo + 1;
1062 break;
1063 }
1064 // std::cout << "hier5\n";
1065 if(suboptimal) file << ";suboptimal";
1066 else
1067 {
1068 if(unique==1) file << ";unique";
1069 if(unique==0) file << ";multi";
1070 }
1071 if (qual.errors > 0)
1072 {
1073 if (options.hammingOnly)
1074 {
1075 TContigPos left = (*it).beginPos;
1076 TContigPos right = (*it).endPos;
1077
1078 if (left < right)
1079 gInf = infix(store.contigStore[(*it).contigId].seq, left, right);
1080 else
1081 {
1082 gInf = infix(store.contigStore[(*it).contigId].seq, right, left);
1083 reverseComplement(gInf);
1084 }
1085 bool first = true;
1086 file << ";cigar=" << length(store.readSeqStore[currReadNo]) << "M";
1087 file << ";mutations=";
1088 unsigned i = 0;
1089 // while ((*it).beginPos == 0 && i < length(store.readSeqStore[currReadNo])-length(gInf) )
1090 // {
1091 // if(first){ file << i + 1 << (Dna5)store.readSeqStore[currReadNo][i]; first = false;}
1092 // else file <<','<< i + 1 << (Dna5)store.readSeqStore[currReadNo][i];
1093 // ++i;
1094 // }
1095 for (; i < length(gInf); ++i)
1096 if ((options.compMask[ordValue(store.readSeqStore[currReadNo][i])] &
1097 options.compMask[ordValue(gInf[i])]) == 0)
1098 {
1099 // if(first){ file << i + 1 << gInf[i]; first = false;}
1100 // else file <<','<< i + 1 << gInf[i];
1101 if(first){ file << i + 1 << (Dna5)store.readSeqStore[currReadNo][i]; first = false;}
1102 else file <<','<< i + 1 << (Dna5)store.readSeqStore[currReadNo][i];
1103 }
1104 // while ((*it).endPos == length(currGenome) && i < length(store.readSeqStore[currReadNo]) )
1105 // {
1106 // if(first){ file << i + 1 << (Dna5)store.readSeqStore[currReadNo][i]; first = false;}
1107 // else file <<','<< i + 1 << (Dna5)store.readSeqStore[currReadNo][i];
1108 // ++i;
1109 // }
1110
1111 }
1112 else
1113 {
1114 assignSource(row(align, 0), store.readSeqStore[currReadNo]);
1115 TContigPos left = (*it).beginPos;
1116 TContigPos right = (*it).endPos;
1117
1118 if (left < right)
1119 assignSource(row(align, 1), infix(currGenome, left, right));
1120 else
1121 {
1122 assignSource(row(align, 1), infix(currGenome, right, left));
1123 reverseComplement(source(row(align, 1)));
1124 }
1125 globalAlignment(align, scoreType);
1126
1127 std::stringstream cigar, mutations;
1128 getCigarLine(align,cigar,mutations);
1129 file << ";cigar="<<cigar.str();
1130
1131 if(length(mutations.str())>0)
1132 file << ";mutations=" << mutations.str();
1133
1134 }
1135 }
1136 #ifdef RAZERS_DIRECT_MAQ_MAPPING
1137 if(options.maqMapping || options.fastaIdQual)
1138 {
1139 // file << ";read=";
1140 // for(unsigned j=0;j<length(store.readSeqStore[currReadNo]);++j)
1141 // {
1142 // file << (Dna5)store.readSeqStore[currReadNo][j];
1143 // }
1144 file << ";quality=";
1145 for(unsigned j=0;j<readLen;++j)
1146 {
1147 //TODO need to output the original quality here!
1148 //file << (char) ((int)((unsigned char)store.readSeqStore[currReadNo][j] >> 3) + 33);
1149 file << (char)(getQualityValue(store.readSeqStore[currReadNo][j])+ 33);
1150 }
1151 }
1152 #endif
1153 //std::cout << "hier6\n";
1154 file << std::endl;
1155 ++it;
1156 }
1157 }
1158 gFile.close();
1159 ++filecount;
1160 }
1161 break;
1162 case 4: // Sam
1163 convertMatchesToGlobalAlignment(store, scoreType, True());
1164 /*{
1165 String<String<unsigned> > layout;
1166 layoutAlignment(layout, store, 0);
1167 for (unsigned i=0;i<length(store.contigStore);++i)
1168 printAlignment(std::cout, layout, store, i, 0, 2000, 0, 100, -1);
1169 }*/
1170 write(file, store, Sam());
1171 break;
1172 case 5: // AFG
1173 convertMatchesToGlobalAlignment(store, scoreType, True());
1174 write(file, store, Amos());
1175 break;
1176 }
1177
1178 file.close();
1179
1180 // get empirical error distribution
1181 if (!empty(errorPrbFileName) && maxReadLength > 0)
1182 {
1183 file.open(toCString(errorPrbFileName), std::ios_base::out | std::ios_base::trunc);
1184 if (file.is_open())
1185 {
1186 String<long double> posError;
1187 unsigned unique = 0;
1188 unsigned insertions = 0;
1189 unsigned deletions = 0;
1190 resize(posError, maxReadLength, 0);
1191
1192 if (options.hammingOnly)
1193 unique = getErrorDistribution(posError, store, options);
1194 else
1195 {
1196 unique = getErrorDistribution(posError, insertions, deletions, store, options);
1197 std::cerr << "insertProb: " << (double)insertions / ((double)length(posError) * (double)unique) << std::endl;
1198 std::cerr << "deleteProb: " << (double)deletions / ((double)length(posError) * (double)unique) << std::endl;
1199 }
1200
1201 file << (double)posError[0] / (double)unique;
1202 for (unsigned i = 1; i < length(posError); ++i)
1203 file << '\t' << (double)posError[i] / (double)unique;
1204 file << std::endl;
1205 file.close();
1206 } else
1207 std::cerr << "Failed to open error distribution file" << std::endl;
1208 }
1209
1210 options.timeDumpResults = SEQAN_PROTIMEDIFF(dump_time);
1211
1212 if (options._debugLevel >= 1)
1213 std::cerr << "Dumping results took \t" << options.timeDumpResults << " seconds" << std::endl;
1214 }
1215
1216
1217 }
1218
1219 #endif
1220
1221