1 #include "muscle.h"
2 #include "msa.h"
3 #include "profile.h"
4 #include "objscore.h"
5
6 #define TRACE 0
7 #define TRACE_SEQPAIR 0
8 #define TEST_SPFAST 0
9
10 extern SCOREMATRIX VTML_LA;
11 extern SCOREMATRIX PAM200;
12 extern SCOREMATRIX PAM200NoCenter;
13 extern SCOREMATRIX VTML_SP;
14 extern SCOREMATRIX VTML_SPNoCenter;
15 extern SCOREMATRIX NUC_SP;
16
17 SCORE g_SPScoreLetters;
18 SCORE g_SPScoreGaps;
19
TermGapScore(bool Gap)20 static SCORE TermGapScore(bool Gap)
21 {
22 switch (g_TermGaps)
23 {
24 case TERMGAPS_Full:
25 return 0;
26
27 case TERMGAPS_Half:
28 if (Gap)
29 return g_scoreGapOpen/2;
30 return 0;
31
32 case TERMGAPS_Ext:
33 if (Gap)
34 return g_scoreGapExtend;
35 return 0;
36 }
37 Quit("TermGapScore?!");
38 return 0;
39 }
40
ScoreSeqPairLetters(const MSA & msa1,unsigned uSeqIndex1,const MSA & msa2,unsigned uSeqIndex2)41 SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1,
42 const MSA &msa2, unsigned uSeqIndex2)
43 {
44 const unsigned uColCount = msa1.GetColCount();
45 const unsigned uColCount2 = msa2.GetColCount();
46 if (uColCount != uColCount2)
47 Quit("ScoreSeqPairLetters, different lengths");
48
49 #if TRACE_SEQPAIR
50 {
51 Log("\n");
52 Log("ScoreSeqPairLetters\n");
53 MSA msaTmp;
54 msaTmp.SetSize(2, uColCount);
55 msaTmp.CopySeq(0, msa1, uSeqIndex1);
56 msaTmp.CopySeq(1, msa2, uSeqIndex2);
57 msaTmp.LogMe();
58 }
59 #endif
60
61 SCORE scoreLetters = 0;
62 SCORE scoreGaps = 0;
63 bool bGapping1 = false;
64 bool bGapping2 = false;
65
66 unsigned uColStart = 0;
67 bool bLeftTermGap = false;
68 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
69 {
70 bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
71 bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
72 if (!bGap1 || !bGap2)
73 {
74 if (bGap1 || bGap2)
75 bLeftTermGap = true;
76 uColStart = uColIndex;
77 break;
78 }
79 }
80
81 unsigned uColEnd = uColCount - 1;
82 bool bRightTermGap = false;
83 for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)
84 {
85 bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);
86 bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);
87 if (!bGap1 || !bGap2)
88 {
89 if (bGap1 || bGap2)
90 bRightTermGap = true;
91 uColEnd = (unsigned) iColIndex;
92 break;
93 }
94 }
95
96 #if TRACE_SEQPAIR
97 Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);
98 #endif
99
100 for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)
101 {
102 unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex);
103 if (uLetter1 >= g_AlphaSize)
104 continue;
105 unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex);
106 if (uLetter2 >= g_AlphaSize)
107 continue;
108
109 SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2];
110 scoreLetters += scoreMatch;
111 }
112 return scoreLetters;
113 }
114
ScoreSeqPairGaps(const MSA & msa1,unsigned uSeqIndex1,const MSA & msa2,unsigned uSeqIndex2)115 SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1,
116 const MSA &msa2, unsigned uSeqIndex2)
117 {
118 const unsigned uColCount = msa1.GetColCount();
119 const unsigned uColCount2 = msa2.GetColCount();
120 if (uColCount != uColCount2)
121 Quit("ScoreSeqPairGaps, different lengths");
122
123 #if TRACE_SEQPAIR
124 {
125 Log("\n");
126 Log("ScoreSeqPairGaps\n");
127 MSA msaTmp;
128 msaTmp.SetSize(2, uColCount);
129 msaTmp.CopySeq(0, msa1, uSeqIndex1);
130 msaTmp.CopySeq(1, msa2, uSeqIndex2);
131 msaTmp.LogMe();
132 }
133 #endif
134
135 SCORE scoreGaps = 0;
136 bool bGapping1 = false;
137 bool bGapping2 = false;
138
139 unsigned uColStart = 0;
140 bool bLeftTermGap = false;
141 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
142 {
143 bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
144 bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
145 if (!bGap1 || !bGap2)
146 {
147 if (bGap1 || bGap2)
148 bLeftTermGap = true;
149 uColStart = uColIndex;
150 break;
151 }
152 }
153
154 unsigned uColEnd = uColCount - 1;
155 bool bRightTermGap = false;
156 for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)
157 {
158 bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);
159 bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);
160 if (!bGap1 || !bGap2)
161 {
162 if (bGap1 || bGap2)
163 bRightTermGap = true;
164 uColEnd = (unsigned) iColIndex;
165 break;
166 }
167 }
168
169 #if TRACE_SEQPAIR
170 Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);
171 #endif
172
173 for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)
174 {
175 bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
176 bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
177
178 if (bGap1 && bGap2)
179 continue;
180
181 if (bGap1)
182 {
183 if (!bGapping1)
184 {
185 #if TRACE_SEQPAIR
186 Log("Gap open seq 1 col %d\n", uColIndex);
187 #endif
188 if (uColIndex == uColStart)
189 scoreGaps += TermGapScore(true);
190 else
191 scoreGaps += g_scoreGapOpen;
192 bGapping1 = true;
193 }
194 else
195 scoreGaps += g_scoreGapExtend;
196 continue;
197 }
198
199 else if (bGap2)
200 {
201 if (!bGapping2)
202 {
203 #if TRACE_SEQPAIR
204 Log("Gap open seq 2 col %d\n", uColIndex);
205 #endif
206 if (uColIndex == uColStart)
207 scoreGaps += TermGapScore(true);
208 else
209 scoreGaps += g_scoreGapOpen;
210 bGapping2 = true;
211 }
212 else
213 scoreGaps += g_scoreGapExtend;
214 continue;
215 }
216
217 bGapping1 = false;
218 bGapping2 = false;
219 }
220
221 if (bGapping1 || bGapping2)
222 {
223 scoreGaps -= g_scoreGapOpen;
224 scoreGaps += TermGapScore(true);
225 }
226 return scoreGaps;
227 }
228
229 // The usual sum-of-pairs objective score: sum the score
230 // of the alignment of each pair of sequences.
ObjScoreSP(const MSA & msa,SCORE MatchScore[])231 SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[])
232 {
233 #if TRACE
234 Log("==================ObjScoreSP==============\n");
235 Log("msa=\n");
236 msa.LogMe();
237 #endif
238 g_SPScoreLetters = 0;
239 g_SPScoreGaps = 0;
240
241 if (0 != MatchScore)
242 {
243 const unsigned uColCount = msa.GetColCount();
244 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
245 MatchScore[uColIndex] = 0;
246 }
247
248 const unsigned uSeqCount = msa.GetSeqCount();
249 SCORE scoreTotal = 0;
250 unsigned uPairCount = 0;
251 #if TRACE
252 Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n");
253 Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n");
254 #endif
255 for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
256 {
257 const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1);
258 for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2)
259 {
260 const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2);
261 const WEIGHT w = w1*w2;
262
263 SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2);
264 SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2);
265 SCORE scorePair = scoreLetters + scoreGaps;
266 ++uPairCount;
267
268 scoreTotal += w*scorePair;
269
270 g_SPScoreLetters += w*scoreLetters;
271 g_SPScoreGaps += w*scoreGaps;
272 #if TRACE
273 Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n",
274 uSeqIndex1,
275 uSeqIndex2,
276 w1,
277 w2,
278 scoreLetters,
279 scoreGaps,
280 scorePair,
281 scorePair*w1*w2,
282 scoreTotal,
283 msa.GetSeqName(uSeqIndex1),
284 msa.GetSeqName(uSeqIndex2));
285 #endif
286 }
287 }
288 #if TEST_SPFAST
289 {
290 SCORE f = ObjScoreSPFast(msa);
291 Log("Fast = %.6g\n", f);
292 Log("Brute = %.6g\n", scoreTotal);
293 if (BTEq(f, scoreTotal))
294 Log("Agree\n");
295 else
296 Log("** DISAGREE **\n");
297 }
298 #endif
299 // return scoreTotal / uPairCount;
300 return scoreTotal;
301 }
302
303 // Objective score defined as the dynamic programming score.
304 // Input is two alignments, which must be of the same length.
305 // Result is the same profile-profile score that is optimized
306 // by dynamic programming.
ObjScoreDP(const MSA & msa1,const MSA & msa2,SCORE MatchScore[])307 SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[])
308 {
309 const unsigned uColCount = msa1.GetColCount();
310 if (msa2.GetColCount() != uColCount)
311 Quit("ObjScoreDP, must be same length");
312
313 const unsigned uColCount1 = msa1.GetColCount();
314 const unsigned uColCount2 = msa2.GetColCount();
315
316 const ProfPos *PA = ProfileFromMSA(msa1);
317 const ProfPos *PB = ProfileFromMSA(msa2);
318
319 return ObjScoreDP_Profs(PA, PB, uColCount1, MatchScore);
320 }
321
ObjScoreDP_Profs(const ProfPos * PA,const ProfPos * PB,unsigned uColCount,SCORE MatchScore[])322 SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,
323 SCORE MatchScore[])
324 {
325 //#if TRACE
326 // Log("Profile 1:\n");
327 // ListProfile(PA, uColCount, &msa1);
328 //
329 // Log("Profile 2:\n");
330 // ListProfile(PB, uColCount, &msa2);
331 //#endif
332
333 SCORE scoreTotal = 0;
334
335 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
336 {
337 const ProfPos &PPA = PA[uColIndex];
338 const ProfPos &PPB = PB[uColIndex];
339
340 SCORE scoreGap = 0;
341 SCORE scoreMatch = 0;
342 // If gapped column...
343 if (PPA.m_bAllGaps && PPB.m_bAllGaps)
344 scoreGap = 0;
345 else if (PPA.m_bAllGaps)
346 {
347 if (uColCount - 1 == uColIndex || !PA[uColIndex+1].m_bAllGaps)
348 scoreGap = PPB.m_scoreGapClose;
349 if (0 == uColIndex || !PA[uColIndex-1].m_bAllGaps)
350 scoreGap += PPB.m_scoreGapOpen;
351 //if (0 == scoreGap)
352 // scoreGap = PPB.m_scoreGapExtend;
353 }
354 else if (PPB.m_bAllGaps)
355 {
356 if (uColCount - 1 == uColIndex || !PB[uColIndex+1].m_bAllGaps)
357 scoreGap = PPA.m_scoreGapClose;
358 if (0 == uColIndex || !PB[uColIndex-1].m_bAllGaps)
359 scoreGap += PPA.m_scoreGapOpen;
360 //if (0 == scoreGap)
361 // scoreGap = PPA.m_scoreGapExtend;
362 }
363 else
364 scoreMatch = ScoreProfPos2(PPA, PPB);
365
366 if (0 != MatchScore)
367 MatchScore[uColIndex] = scoreMatch;
368
369 scoreTotal += scoreMatch + scoreGap;
370
371 extern bool g_bTracePPScore;
372 extern MSA *g_ptrPPScoreMSA1;
373 extern MSA *g_ptrPPScoreMSA2;
374 if (g_bTracePPScore)
375 {
376 const MSA &msa1 = *g_ptrPPScoreMSA1;
377 const MSA &msa2 = *g_ptrPPScoreMSA2;
378 const unsigned uSeqCount1 = msa1.GetSeqCount();
379 const unsigned uSeqCount2 = msa2.GetSeqCount();
380
381 for (unsigned n = 0; n < uSeqCount1; ++n)
382 Log("%c", msa1.GetChar(n, uColIndex));
383 Log(" ");
384 for (unsigned n = 0; n < uSeqCount2; ++n)
385 Log("%c", msa2.GetChar(n, uColIndex));
386 Log(" %10.3f", scoreMatch);
387 if (scoreGap != 0)
388 Log(" %10.3f", scoreGap);
389 Log("\n");
390 }
391 }
392
393 delete[] PA;
394 delete[] PB;
395
396 return scoreTotal;
397 }
398
399 // Objective score defined as the sum of profile-sequence
400 // scores for each sequence in the alignment. The profile
401 // is computed from the entire alignment, so this includes
402 // the score of each sequence against itself. This is to
403 // avoid recomputing the profile each time, so we reduce
404 // complexity but introduce a questionable approximation.
405 // The goal is to see if we can exploit the apparent
406 // improvement in performance of log-expectation score
407 // over the usual sum-of-pairs by optimizing this
408 // objective score in the iterative refinement stage.
ObjScorePS(const MSA & msa,SCORE MatchScore[])409 SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[])
410 {
411 if (g_PPScore != PPSCORE_LE)
412 Quit("FastScoreMSA_LASimple: LA");
413
414 const unsigned uSeqCount = msa.GetSeqCount();
415 const unsigned uColCount = msa.GetColCount();
416
417 const ProfPos *Prof = ProfileFromMSA(msa);
418
419 if (0 != MatchScore)
420 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
421 MatchScore[uColIndex] = 0;
422
423 SCORE scoreTotal = 0;
424 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
425 {
426 const WEIGHT weightSeq = msa.GetSeqWeight(uSeqIndex);
427 SCORE scoreSeq = 0;
428 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
429 {
430 const ProfPos &PP = Prof[uColIndex];
431 if (msa.IsGap(uSeqIndex, uColIndex))
432 {
433 bool bOpen = (0 == uColIndex ||
434 !msa.IsGap(uSeqIndex, uColIndex - 1));
435 bool bClose = (uColCount - 1 == uColIndex ||
436 !msa.IsGap(uSeqIndex, uColIndex + 1));
437
438 if (bOpen)
439 scoreSeq += PP.m_scoreGapOpen;
440 if (bClose)
441 scoreSeq += PP.m_scoreGapClose;
442 //if (!bOpen && !bClose)
443 // scoreSeq += PP.m_scoreGapExtend;
444 }
445 else if (msa.IsWildcard(uSeqIndex, uColIndex))
446 continue;
447 else
448 {
449 unsigned uLetter = msa.GetLetter(uSeqIndex, uColIndex);
450 const SCORE scoreMatch = PP.m_AAScores[uLetter];
451 if (0 != MatchScore)
452 MatchScore[uColIndex] += weightSeq*scoreMatch;
453 scoreSeq += scoreMatch;
454 }
455 }
456 scoreTotal += weightSeq*scoreSeq;
457 }
458
459 delete[] Prof;
460 return scoreTotal;
461 }
462
463 // The XP score is the sum of the score of each pair of
464 // sequences between two profiles which are aligned to
465 // each other. Notice that for two given profiles aligned
466 // in different ways, the difference in XP score must be
467 // the same as the difference in SP score because the
468 // score of a pair of sequences in one profile doesn't
469 // depend on the alignment.
ObjScoreXP(const MSA & msa1,const MSA & msa2)470 SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2)
471 {
472 const unsigned uColCount1 = msa1.GetColCount();
473 const unsigned uColCount2 = msa2.GetColCount();
474 if (uColCount1 != uColCount2)
475 Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2);
476
477 const unsigned uSeqCount1 = msa1.GetSeqCount();
478 const unsigned uSeqCount2 = msa2.GetSeqCount();
479
480 #if TRACE
481 Log(" Score Weight Weight Total\n");
482 Log("---------- ------ ------ ----------\n");
483 #endif
484
485 SCORE scoreTotal = 0;
486 unsigned uPairCount = 0;
487 for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
488 {
489 const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1);
490 for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
491 {
492 const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2);
493 const WEIGHT w = w1*w2;
494 SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2);
495 SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2);
496 SCORE scorePair = scoreLetters + scoreGaps;
497 scoreTotal += w1*w2*scorePair;
498 ++uPairCount;
499 #if TRACE
500 Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n",
501 scorePair,
502 w1,
503 w2,
504 scorePair*w1*w2,
505 msa1.GetSeqName(uSeqIndex1),
506 msa2.GetSeqName(uSeqIndex2));
507 #endif
508 }
509 }
510 if (0 == uPairCount)
511 Quit("0 == uPairCount");
512
513 #if TRACE
514 Log("msa1=\n");
515 msa1.LogMe();
516 Log("msa2=\n");
517 msa2.LogMe();
518 Log("XP=%g\n", scoreTotal);
519 #endif
520 // return scoreTotal / uPairCount;
521 return scoreTotal;
522 }
523