1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32
33 #ifndef SEQAN_HEADER_GRAPH_ALIGN_TCOFFEE_REFINEMENT_H
34 #define SEQAN_HEADER_GRAPH_ALIGN_TCOFFEE_REFINEMENT_H
35
36 namespace SEQAN_NAMESPACE_MAIN
37 {
38
39 //////////////////////////////////////////////////////////////////////////////////
40
41 template<typename TScore, typename TSc>
42 inline void
_matchScore(TScore &,TSc)43 _matchScore(TScore&,
44 TSc)
45 {
46 // No operation
47 }
48
49 //////////////////////////////////////////////////////////////////////////////////
50
51 template<typename TScore, typename TSc>
52 inline void
_mismatchScore(TScore &,TSc)53 _mismatchScore(TScore&,
54 TSc)
55 {
56 // No operation
57 }
58
59 //////////////////////////////////////////////////////////////////////////////////
60
61 template<typename TValue, typename TSc>
62 inline void
_matchScore(Score<TValue,Simple> & sc,TSc msc)63 _matchScore(Score<TValue, Simple>& sc,
64 TSc msc)
65 {
66 sc.data_match = msc;
67 }
68
69 //////////////////////////////////////////////////////////////////////////////////
70
71 template<typename TValue, typename TSc>
72 inline void
_mismatchScore(Score<TValue,Simple> & sc,TSc mmsc)73 _mismatchScore(Score<TValue, Simple>& sc,
74 TSc mmsc)
75 {
76 sc.data_mismatch = mmsc;
77 }
78
79
80 //////////////////////////////////////////////////////////////////////////////
81
82
83 template<typename T>
84 struct ProfileProfileScore;
85
86 //////////////////////////////////////////////////////////////////////////////
87
88 template <typename TValue, typename TScoreMember>
89 class Score<TValue, ProfileProfileScore<TScoreMember> >
90 {
91 public:
92 TScoreMember sc;
93
Score()94 Score() {}
95
96 template<typename TSpec>
Score(Score<TValue,TSpec> & old_sc)97 Score(Score<TValue, TSpec>& old_sc)
98 {
99 sc.data_gap_extend = scoreGapExtend(old_sc);
100 sc.data_gap_open = scoreGapOpen(old_sc);
101 }
102
Score(Score<TValue,Simple> & old_sc)103 Score(Score<TValue, Simple>& old_sc)
104 {
105 sc.data_gap_extend = scoreGapExtend(old_sc);
106 sc.data_gap_open = scoreGapOpen(old_sc);
107 _matchScore(sc, scoreMatch(old_sc));
108 _mismatchScore(sc, scoreMismatch(old_sc));
109 }
110
Score(TValue gap_extend,TValue gap_open)111 Score(TValue gap_extend, TValue gap_open)
112 {
113 sc.data_gap_extend = gap_extend;
114 sc.data_gap_open = gap_open;
115 }
116
Score(TValue match,TValue mismatch,TValue gap_extend,TValue gap_open)117 Score(TValue match, TValue mismatch, TValue gap_extend, TValue gap_open)
118 {
119 sc.data_gap_extend = gap_extend;
120 sc.data_gap_open = gap_open;
121 _matchScore(sc, match);
122 _mismatchScore(sc, mismatch);
123 }
124 };
125
126
127 template <typename TValue, typename TScoreMember, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
128 inline TValue
scoreGapExtendHorizontal(Score<TValue,ProfileProfileScore<TScoreMember>> const & me,TPos1 pos1,TPos2,TSeq1 const & seq1,TSeq2 const &)129 scoreGapExtendHorizontal(
130 Score<TValue, ProfileProfileScore<TScoreMember> > const & me,
131 TPos1 pos1,
132 TPos2,
133 TSeq1 const& seq1,
134 TSeq2 const&)
135 {
136 typedef typename Size<TSeq1>::Type TSize;
137 typedef typename Value<TSeq1>::Type TAlphabet1;
138 typedef typename Value<TSeq2>::Type TAlphabet2;
139 typedef typename SourceValue<TAlphabet1>::Type TSourceValue;
140
141 // Last character is a gap
142 // std::cout << seq1[pos1] << std::endl;
143
144 TSize alph_size = ValueSize<TSourceValue>::VALUE;
145 TValue n1 = 0;
146 for(TSize i = 0; i < alph_size; ++i) n1 += seq1[pos1].count[i];
147 TValue totalSc = n1 * scoreGapExtend(me.sc);
148 n1 += seq1[pos1].count[alph_size];
149 //std::cout << totalSc / n1 << ',';
150 return totalSc / n1;
151 }
152
153 template <typename TValue, typename TScoreMember, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
154 inline TValue
scoreGapOpenHorizontal(Score<TValue,ProfileProfileScore<TScoreMember>> const & me,TPos1 pos1,TPos2,TSeq1 const & seq1,TSeq2 const &)155 scoreGapOpenHorizontal(
156 Score<TValue, ProfileProfileScore<TScoreMember> > const & me,
157 TPos1 pos1,
158 TPos2,
159 TSeq1 const & seq1,
160 TSeq2 const &)
161 {
162 typedef typename Size<TSeq1>::Type TSize;
163 typedef typename Value<TSeq1>::Type TAlphabet1;
164 typedef typename Value<TSeq2>::Type TAlphabet2;
165 typedef typename SourceValue<TAlphabet1>::Type TSourceValue;
166
167 TSize alph_size = ValueSize<TSourceValue>::VALUE;
168 TValue n1 = 0;
169 for(TSize i = 0; i < alph_size+1; ++i) n1 += seq1[pos1].count[i];
170 TSize newGapOpen = n1;
171 if ((pos1) && (seq1[pos1 - 1].count[alph_size] < seq1[pos1].count[alph_size])) {
172 newGapOpen -= (seq1[pos1].count[alph_size] - seq1[pos1 - 1].count[alph_size]);
173 } else if (!pos1) {
174 newGapOpen -= seq1[pos1].count[alph_size];
175 }
176 TValue totalSc = newGapOpen * scoreGapOpen(me.sc);
177 return totalSc / n1;
178 }
179
180
181 template <typename TValue, typename TScoreMember, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
182 inline TValue
scoreGapExtendVertical(Score<TValue,ProfileProfileScore<TScoreMember>> const & me,TPos1,TPos2 pos2,TSeq1 const &,TSeq2 const & seq2)183 scoreGapExtendVertical(
184 Score<TValue, ProfileProfileScore<TScoreMember> > const & me,
185 TPos1,
186 TPos2 pos2,
187 TSeq1 const &,
188 TSeq2 const & seq2)
189 {
190 typedef typename Size<TSeq1>::Type TSize;
191 typedef typename Value<TSeq1>::Type TAlphabet1;
192 typedef typename Value<TSeq2>::Type TAlphabet2;
193 typedef typename SourceValue<TAlphabet1>::Type TSourceValue;
194
195 // Last character is a gap
196 // std::cout << seq2[pos2] << std::endl;
197
198 TSize alph_size = ValueSize<TSourceValue>::VALUE;
199 TValue n2 = 0;
200 for(TSize j = 0; j < alph_size; ++j) n2 += seq2[pos2].count[j];
201 TValue totalSc = n2 * scoreGapExtend(me.sc);
202 n2 += seq2[pos2].count[alph_size];
203 //std::cout << totalSc / n2 << ',';
204 return totalSc / n2;
205 }
206
207 template <typename TValue, typename TScoreMember, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
208 inline TValue
scoreGapOpenVertical(Score<TValue,ProfileProfileScore<TScoreMember>> const & me,TPos1,TPos2 pos2,TSeq1 const &,TSeq2 const & seq2)209 scoreGapOpenVertical(
210 Score<TValue, ProfileProfileScore<TScoreMember> > const & me,
211 TPos1,
212 TPos2 pos2,
213 TSeq1 const &,
214 TSeq2 const & seq2)
215 {
216 typedef typename Size<TSeq1>::Type TSize;
217 typedef typename Value<TSeq1>::Type TAlphabet1;
218 typedef typename Value<TSeq2>::Type TAlphabet2;
219 typedef typename SourceValue<TAlphabet1>::Type TSourceValue;
220
221 TSize alph_size = ValueSize<TSourceValue>::VALUE;
222 TValue n2 = 0;
223 for(TSize j = 0; j < alph_size+1; ++j) n2 += seq2[pos2].count[j];
224 TSize newGapOpen = n2;
225 if ((pos2) && (seq2[pos2 - 1].count[alph_size] < seq2[pos2].count[alph_size])) {
226 newGapOpen -= (seq2[pos2].count[alph_size] - seq2[pos2 - 1].count[alph_size]);
227 } else if (!pos2) {
228 newGapOpen -= seq2[pos2].count[alph_size];
229 }
230 TValue totalSc = newGapOpen * scoreGapOpen(me.sc);
231 return totalSc / n2;
232 }
233
234
235 template <typename TValue, typename TScoreMember, typename TPos1, typename TPos2, typename TSeq1, typename TSeq2>
236 inline TValue
score(Score<TValue,ProfileProfileScore<TScoreMember>> const & me,TPos1 pos1,TPos2 pos2,TSeq1 const & seq1,TSeq2 const & seq2)237 score(Score<TValue, ProfileProfileScore<TScoreMember> > const & me,
238 TPos1 pos1,
239 TPos2 pos2,
240 TSeq1 const &seq1,
241 TSeq2 const &seq2)
242 {
243 typedef typename Size<TSeq1>::Type TSize;
244 typedef typename Value<TSeq1>::Type TAlphabet1;
245 typedef typename Value<TSeq2>::Type TAlphabet2;
246 typedef typename SourceValue<TAlphabet1>::Type TSourceValue;
247
248 // Last character is a gap
249 //std::cout << seq1[pos1] << std::endl;
250 //std::cout << seq2[pos2] << std::endl;
251
252 TValue totalSc = 0;
253 TSize alph_size = ValueSize<TSourceValue>::VALUE;
254 TValue n1 = 0;
255 TValue n2 = 0;
256 for(TSize j = 0; j < alph_size; ++j) n2 += seq2[pos2].count[j];
257 for(TSize i = 0; i < alph_size; ++i) {
258 if (seq1[pos1].count[i]) {
259 n1 += seq1[pos1].count[i];
260 for(TSize j = 0; j < alph_size; ++j) {
261 if (seq2[pos2].count[j]) {
262 //std::cout << TSourceValue(i) << ',' << TSourceValue(j) << ',' << score(me.sc, TSourceValue(i), TSourceValue(j)) << std::endl;
263 totalSc += seq1[pos1].count[i] * seq2[pos2].count[j] * score(me.sc, TSourceValue(i), TSourceValue(j));
264 }
265 }
266 }
267 }
268 totalSc += n2 * seq1[pos1].count[alph_size] * scoreGapExtend(me.sc);
269 totalSc += n1 * seq2[pos2].count[alph_size] * scoreGapExtend(me.sc);
270 n1 += seq1[pos1].count[alph_size];
271 n2 += seq2[pos2].count[alph_size];
272 //std::cout << totalSc / (n1 * n2) << ',';
273 return totalSc / (n1 * n2);
274 }
275
276
277 //////////////////////////////////////////////////////////////////////////////
278
279 template<typename TValue, typename TSpec, typename TSize, typename TAlphabet, typename TScore>
280 inline void
_msaRefinement(String<TValue,TSpec> & mat,TSize nseq,TSize splitPos,TScore & sc,TAlphabet)281 _msaRefinement(String<TValue, TSpec>& mat,
282 TSize nseq,
283 TSize splitPos,
284 TScore& sc,
285 TAlphabet)
286 {
287 // Is it a valid split?
288 if ((splitPos == 0) || (splitPos > nseq - 1)) return;
289
290 // Initialization
291 typedef String<TValue, TSpec> TAlignMat;
292 typedef typename Iterator<TAlignMat, Standard>::Type TMatIter;
293 TSize alignLen = length(mat) / nseq;
294
295 //// Debug code
296 //for(TSize row = 0; row < nseq; ++row) {
297 // for(TSize col = 0; col < alignLen; ++col) {
298 // std::cout << mat[row * alignLen + col];
299 // }
300 // std::cout << std::endl;
301 //}
302
303 char gapChar = gapValue<char>();
304 TSize nseq1 = splitPos;
305 TSize nseq2 = nseq - nseq1;
306 TSize gapPos = ValueSize<TAlphabet>::VALUE;
307 typedef ProfileType<TAlphabet> TProfile;
308 typedef String<TProfile> TProfileString;
309 typedef typename Iterator<TProfileString, Standard>::Type TProfIter;
310 TProfileString prof1;
311 TProfileString prof2;
312 resize(prof1, alignLen, TProfile());
313 resize(prof2, alignLen, TProfile());
314 TMatIter itMat = begin(mat, Standard());
315 TMatIter itMatEnd = begin(mat, Standard());
316 itMatEnd += nseq1 * alignLen;
317 TProfIter itProf = begin(prof1, Standard());
318 for(TSize i = 0;itMat != itMatEnd;++itMat, ++itProf, ++i) {
319 if (i % alignLen == 0) itProf = begin(prof1, Standard());
320 if (*itMat == gapChar) ++(itProf->count[gapPos]);
321 else ++(itProf->count[ordValue((TAlphabet) *itMat)]);
322 }
323 itProf = begin(prof2, Standard());
324 itMatEnd = end(mat, Standard());
325 for(TSize i = 0;itMat != itMatEnd;++itMat, ++itProf, ++i) {
326 if (i % alignLen == 0) itProf = begin(prof2, Standard());
327 if (*itMat == gapChar) ++(itProf->count[gapPos]);
328 else ++(itProf->count[ordValue((TAlphabet) *itMat)]);
329 }
330
331
332 TProfileString alignProf1;
333 reserve(alignProf1, alignLen);
334 itProf = begin(prof1, Standard());
335 TProfIter itProfEnd = end(prof1, Standard());
336 typedef String<TSize> TDelGap;
337 typedef typename Iterator<TDelGap, Standard>::Type TDelGapIter;
338 TDelGap delGapCol;
339 for(TSize i = 0;itProf != itProfEnd;++itProf, ++i) {
340 if (itProf->count[gapPos] != nseq1) appendValue(alignProf1, *itProf);
341 else appendValue(delGapCol, i, Generous());
342 }
343 TDelGapIter itDelGap = begin(delGapCol, Standard());
344 TDelGapIter itDelGapEnd = end(delGapCol, Standard());
345 TAlignMat mat1;
346 TSize alignLen1 = alignLen - length(delGapCol);
347 resize(mat1, nseq1 * alignLen1);
348 itMat = begin(mat, Standard());
349 itMatEnd = begin(mat, Standard());
350 itMatEnd += nseq1 * alignLen;
351 TMatIter itMat1 = begin(mat1, Standard());
352 for(TSize col = 0;itMat != itMatEnd;++itMat, ++col) {
353 if (col % alignLen == 0) itDelGap = begin(delGapCol, Standard());
354 if ((itDelGap != itDelGapEnd) && (col % alignLen == *itDelGap)) {
355 ++itDelGap;
356 continue;
357 }
358 *itMat1 = *itMat;
359 ++itMat1;
360 }
361
362 //// Debug code
363 //for(TSize row = 0; row < nseq1; ++row) {
364 // for(TSize col = 0; col < alignLen1; ++col) {
365 // std::cout << mat1[row * alignLen1 + col];
366 // }
367 // std::cout << std::endl;
368 //}
369
370
371 TProfileString alignProf2;
372 reserve(alignProf2, alignLen);
373 itProf = begin(prof2, Standard());
374 itProfEnd = end(prof2, Standard());
375 clear(delGapCol);
376 for(TSize i = 0;itProf != itProfEnd;++itProf, ++i) {
377 if (itProf->count[gapPos] != nseq2) appendValue(alignProf2, *itProf);
378 else appendValue(delGapCol, i, Generous());
379 }
380 itDelGap = begin(delGapCol, Standard());
381 itDelGapEnd = end(delGapCol, Standard());
382 TAlignMat mat2;
383 TSize alignLen2 = alignLen - length(delGapCol);
384 resize(mat2, nseq2 * alignLen2);
385 itMatEnd = end(mat, Standard());
386 TMatIter itMat2 = begin(mat2, Standard());
387 for(TSize col = 0;itMat != itMatEnd;++itMat, ++col) {
388 if (col % alignLen == 0) itDelGap = begin(delGapCol, Standard());
389 if ((itDelGap != itDelGapEnd) && (col % alignLen == *itDelGap)) {
390 ++itDelGap;
391 continue;
392 }
393 *itMat2 = *itMat;
394 ++itMat2;
395 }
396
397 //// Debug code
398 //for(TSize row = 0; row < nseq2; ++row) {
399 // for(TSize col = 0; col < alignLen2; ++col) {
400 // std::cout << mat2[row * alignLen2 + col];
401 // }
402 // std::cout << std::endl;
403 //}
404
405
406 // ReAlign the consensus with the sequence
407 typedef StringSet<TProfileString, Dependent<> > TStringSet;
408 TStringSet pairSet;
409 appendValue(pairSet, alignProf1);
410 appendValue(pairSet, alignProf2);
411
412 typedef String<Fragment<> > TFragmentString;
413 TFragmentString matches;
414 globalAlignment(matches, pairSet, sc, Gotoh());
415
416 typedef typename Iterator<TFragmentString, Standard>::Type TFragIter;
417 TFragIter fragIt = end(matches, Standard() );
418 TFragIter fragItEnd = begin(matches, Standard() );
419 typedef String<TSize> TInsertPattern;
420 typedef typename Iterator<TInsertPattern, Standard>::Type TInsertIter;
421 TInsertPattern insertPattern1;
422 TInsertPattern insertPattern2;
423 TSize begin1Pos = 0;
424 TSize begin2Pos = 0;
425 if (fragIt != fragItEnd) {
426 do {
427 --fragIt;
428 appendValue(insertPattern1, fragIt->begin2 - begin2Pos, Generous());
429 appendValue(insertPattern1, fragIt->begin1 - begin1Pos, Generous());
430 appendValue(insertPattern1, fragIt->len, Generous());
431 appendValue(insertPattern2, fragIt->begin2 - begin2Pos, Generous());
432 appendValue(insertPattern2, fragIt->begin1 - begin1Pos, Generous());
433 appendValue(insertPattern2, fragIt->len, Generous());
434 begin1Pos = fragIt->begin1 + fragIt->len;
435 begin2Pos = fragIt->begin2 + fragIt->len;
436 } while (fragIt != fragItEnd);
437 }
438 appendValue(insertPattern1, alignLen2 - begin2Pos, Generous());
439 appendValue(insertPattern1, alignLen1 - begin1Pos, Generous());
440 appendValue(insertPattern2, alignLen2 - begin2Pos, Generous());
441 appendValue(insertPattern2, alignLen1 - begin1Pos, Generous());
442 clear(matches);
443
444 TInsertIter itInsert = begin(insertPattern1, Standard());
445 TInsertIter itInsertEnd = end(insertPattern1, Standard());
446 TSize totalAlignLen = 0;
447 for(;itInsert != itInsertEnd; ++itInsert) totalAlignLen += *itInsert;
448 itMat1 = begin(mat1, Standard());
449 TMatIter itMat1End = end(mat1, Standard());
450 clear(mat);
451 while(itMat1 != itMat1End) {
452 itInsert = begin(insertPattern1, Standard());
453 for(TSize i = 0; i < *itInsert; ++i) appendValue(mat, gapChar, Generous());
454 ++itInsert;
455 for(TSize i = 0; i < *itInsert; ++i, ++itMat1) appendValue(mat, *itMat1, Generous());
456 ++itInsert;
457 while(itInsert!=itInsertEnd) {
458 for(TSize i = 0; i < *itInsert; ++i, ++itMat1) appendValue(mat, *itMat1, Generous());
459 ++itInsert;
460 for(TSize i = 0; i < *itInsert; ++i) appendValue(mat, gapChar, Generous());
461 ++itInsert;
462 for(TSize i = 0; i < *itInsert; ++i, ++itMat1) appendValue(mat, *itMat1, Generous());
463 ++itInsert;
464 }
465 }
466
467 itInsertEnd = end(insertPattern2, Standard());
468 itMat2 = begin(mat2, Standard());
469 TMatIter itMat2End = end(mat2, Standard());
470 while(itMat2 != itMat2End) {
471 itInsert = begin(insertPattern2, Standard());
472 for(TSize i = 0; i < *itInsert; ++i, ++itMat2) appendValue(mat, *itMat2, Generous());
473 ++itInsert;
474 for(TSize i = 0; i < *itInsert; ++i) appendValue(mat, gapChar, Generous());
475 ++itInsert;
476 while(itInsert!=itInsertEnd) {
477 for(TSize i = 0; i < *itInsert; ++i, ++itMat2) appendValue(mat, *itMat2, Generous());
478 ++itInsert;
479 for(TSize i = 0; i < *itInsert; ++i, ++itMat2) appendValue(mat, *itMat2, Generous());
480 ++itInsert;
481 for(TSize i = 0; i < *itInsert; ++i) appendValue(mat, gapChar, Generous());
482 ++itInsert;
483 }
484 }
485
486 //// Debug code
487 //for(TSize row = 0; row < nseq; ++row) {
488 // for(TSize col = 0; col < totalAlignLen; ++col) {
489 // std::cout << mat[row * totalAlignLen + col];
490 // }
491 // std::cout << std::endl;
492 //}
493 }
494
495
496
497 template<typename TStringSet, typename TCargo, typename TSpec, typename TScore>
498 inline void
msaRefinement(Graph<Alignment<TStringSet,TCargo,TSpec>> & gAlign,TScore & sc)499 msaRefinement(Graph<Alignment<TStringSet, TCargo, TSpec> >& gAlign,
500 TScore& sc)
501 {
502 typedef Graph<Alignment<TStringSet, TCargo, TSpec> > TGraph;
503 typedef typename Size<TGraph>::Type TSize;
504 typedef typename Value<TStringSet>::Type TSequence;
505 typedef typename Value<TSequence>::Type TAlphabet;
506 TSize nseq = length(stringSet(gAlign));
507 String<char> mat;
508 convertAlignment(gAlign, mat);
509 for(TSize i = 1; i<nseq; ++i) _msaRefinement(mat, nseq, i, sc, TAlphabet());
510 convertAlignment(mat, gAlign);
511 }
512
513 }// namespace SEQAN_NAMESPACE_MAIN
514
515 #endif //#ifndef SEQAN_HEADER_...
516