1 #include "muscle.h"
2 #include "msa.h"
3 #include "profile.h"
4 #include "objscore.h"
5 
6 #define TRACE			0
7 #define TRACE_SEQPAIR	0
8 #define TEST_SPFAST		0
9 
10 extern SCOREMATRIX VTML_LA;
11 extern SCOREMATRIX PAM200;
12 extern SCOREMATRIX PAM200NoCenter;
13 extern SCOREMATRIX VTML_SP;
14 extern SCOREMATRIX VTML_SPNoCenter;
15 extern SCOREMATRIX NUC_SP;
16 
17 SCORE g_SPScoreLetters;
18 SCORE g_SPScoreGaps;
19 
TermGapScore(bool Gap)20 static SCORE TermGapScore(bool Gap)
21 	{
22 	switch (g_TermGaps)
23 		{
24 	case TERMGAPS_Full:
25 		return 0;
26 
27 	case TERMGAPS_Half:
28 		if (Gap)
29 			return g_scoreGapOpen/2;
30 		return 0;
31 
32 	case TERMGAPS_Ext:
33 		if (Gap)
34 			return g_scoreGapExtend;
35 		return 0;
36 		}
37 	Quit("TermGapScore?!");
38 	return 0;
39 	}
40 
ScoreSeqPairLetters(const MSA & msa1,unsigned uSeqIndex1,const MSA & msa2,unsigned uSeqIndex2)41 SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1,
42   const MSA &msa2, unsigned uSeqIndex2)
43 	{
44 	const unsigned uColCount = msa1.GetColCount();
45 	const unsigned uColCount2 = msa2.GetColCount();
46 	if (uColCount != uColCount2)
47 		Quit("ScoreSeqPairLetters, different lengths");
48 
49 #if	TRACE_SEQPAIR
50 	{
51 	Log("\n");
52 	Log("ScoreSeqPairLetters\n");
53 	MSA msaTmp;
54 	msaTmp.SetSize(2, uColCount);
55 	msaTmp.CopySeq(0, msa1, uSeqIndex1);
56 	msaTmp.CopySeq(1, msa2, uSeqIndex2);
57 	msaTmp.LogMe();
58 	}
59 #endif
60 
61 	SCORE scoreLetters = 0;
62 	SCORE scoreGaps = 0;
63 	bool bGapping1 = false;
64 	bool bGapping2 = false;
65 
66 	unsigned uColStart = 0;
67 	bool bLeftTermGap = false;
68 	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
69 		{
70 		bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
71 		bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
72 		if (!bGap1 || !bGap2)
73 			{
74 			if (bGap1 || bGap2)
75 				bLeftTermGap = true;
76 			uColStart = uColIndex;
77 			break;
78 			}
79 		}
80 
81 	unsigned uColEnd = uColCount - 1;
82 	bool bRightTermGap = false;
83 	for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)
84 		{
85 		bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);
86 		bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);
87 		if (!bGap1 || !bGap2)
88 			{
89 			if (bGap1 || bGap2)
90 				bRightTermGap = true;
91 			uColEnd = (unsigned) iColIndex;
92 			break;
93 			}
94 		}
95 
96 #if	TRACE_SEQPAIR
97 	Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);
98 #endif
99 
100 	for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)
101 		{
102 		unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex);
103 		if (uLetter1 >= g_AlphaSize)
104 			continue;
105 		unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex);
106 		if (uLetter2 >= g_AlphaSize)
107 			continue;
108 
109 		SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2];
110 		scoreLetters += scoreMatch;
111 		}
112 	return scoreLetters;
113 	}
114 
ScoreSeqPairGaps(const MSA & msa1,unsigned uSeqIndex1,const MSA & msa2,unsigned uSeqIndex2)115 SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1,
116   const MSA &msa2, unsigned uSeqIndex2)
117 	{
118 	const unsigned uColCount = msa1.GetColCount();
119 	const unsigned uColCount2 = msa2.GetColCount();
120 	if (uColCount != uColCount2)
121 		Quit("ScoreSeqPairGaps, different lengths");
122 
123 #if	TRACE_SEQPAIR
124 	{
125 	Log("\n");
126 	Log("ScoreSeqPairGaps\n");
127 	MSA msaTmp;
128 	msaTmp.SetSize(2, uColCount);
129 	msaTmp.CopySeq(0, msa1, uSeqIndex1);
130 	msaTmp.CopySeq(1, msa2, uSeqIndex2);
131 	msaTmp.LogMe();
132 	}
133 #endif
134 
135 	SCORE scoreGaps = 0;
136 	bool bGapping1 = false;
137 	bool bGapping2 = false;
138 
139 	unsigned uColStart = 0;
140 	bool bLeftTermGap = false;
141 	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
142 		{
143 		bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
144 		bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
145 		if (!bGap1 || !bGap2)
146 			{
147 			if (bGap1 || bGap2)
148 				bLeftTermGap = true;
149 			uColStart = uColIndex;
150 			break;
151 			}
152 		}
153 
154 	unsigned uColEnd = uColCount - 1;
155 	bool bRightTermGap = false;
156 	for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex)
157 		{
158 		bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex);
159 		bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex);
160 		if (!bGap1 || !bGap2)
161 			{
162 			if (bGap1 || bGap2)
163 				bRightTermGap = true;
164 			uColEnd = (unsigned) iColIndex;
165 			break;
166 			}
167 		}
168 
169 #if	TRACE_SEQPAIR
170 	Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap);
171 #endif
172 
173 	for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex)
174 		{
175 		bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex);
176 		bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex);
177 
178 		if (bGap1 && bGap2)
179 			continue;
180 
181 		if (bGap1)
182 			{
183 			if (!bGapping1)
184 				{
185 #if	TRACE_SEQPAIR
186 				Log("Gap open seq 1 col %d\n", uColIndex);
187 #endif
188 				if (uColIndex == uColStart)
189 					scoreGaps += TermGapScore(true);
190 				else
191 					scoreGaps += g_scoreGapOpen;
192 				bGapping1 = true;
193 				}
194 			else
195 				scoreGaps += g_scoreGapExtend;
196 			continue;
197 			}
198 
199 		else if (bGap2)
200 			{
201 			if (!bGapping2)
202 				{
203 #if	TRACE_SEQPAIR
204 				Log("Gap open seq 2 col %d\n", uColIndex);
205 #endif
206 				if (uColIndex == uColStart)
207 					scoreGaps += TermGapScore(true);
208 				else
209 					scoreGaps += g_scoreGapOpen;
210 				bGapping2 = true;
211 				}
212 			else
213 				scoreGaps += g_scoreGapExtend;
214 			continue;
215 			}
216 
217 		bGapping1 = false;
218 		bGapping2 = false;
219 		}
220 
221 	if (bGapping1 || bGapping2)
222 		{
223 		scoreGaps -= g_scoreGapOpen;
224 		scoreGaps += TermGapScore(true);
225 		}
226 	return scoreGaps;
227 	}
228 
229 // The usual sum-of-pairs objective score: sum the score
230 // of the alignment of each pair of sequences.
ObjScoreSP(const MSA & msa,SCORE MatchScore[])231 SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[])
232 	{
233 #if	TRACE
234 	Log("==================ObjScoreSP==============\n");
235 	Log("msa=\n");
236 	msa.LogMe();
237 #endif
238 	g_SPScoreLetters = 0;
239 	g_SPScoreGaps = 0;
240 
241 	if (0 != MatchScore)
242 		{
243 		const unsigned uColCount = msa.GetColCount();
244 		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
245 			MatchScore[uColIndex] = 0;
246 		}
247 
248 	const unsigned uSeqCount = msa.GetSeqCount();
249 	SCORE scoreTotal = 0;
250 	unsigned uPairCount = 0;
251 #if	TRACE
252 	Log("Seq1  Seq2     wt1     wt2    Letters         Gaps  Unwt.Score    Wt.Score       Total\n");
253 	Log("----  ----  ------  ------  ----------  ----------  ----------  ----------  ----------\n");
254 #endif
255 	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
256 		{
257 		const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1);
258 		for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2)
259 			{
260 			const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2);
261 			const WEIGHT w = w1*w2;
262 
263 			SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2);
264 			SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2);
265 			SCORE scorePair = scoreLetters + scoreGaps;
266 			++uPairCount;
267 
268 			scoreTotal += w*scorePair;
269 
270 			g_SPScoreLetters += w*scoreLetters;
271 			g_SPScoreGaps += w*scoreGaps;
272 #if	TRACE
273 			Log("%4d  %4d  %6.3f  %6.3f  %10.2f  %10.2f  %10.2f  %10.2f  %10.2f >%s >%s\n",
274 			  uSeqIndex1,
275 			  uSeqIndex2,
276 			  w1,
277 			  w2,
278 			  scoreLetters,
279 			  scoreGaps,
280 			  scorePair,
281 			  scorePair*w1*w2,
282 			  scoreTotal,
283 			  msa.GetSeqName(uSeqIndex1),
284 			  msa.GetSeqName(uSeqIndex2));
285 #endif
286 			}
287 		}
288 #if	TEST_SPFAST
289 	{
290 	SCORE f = ObjScoreSPFast(msa);
291 	Log("Fast  = %.6g\n", f);
292 	Log("Brute = %.6g\n", scoreTotal);
293 	if (BTEq(f, scoreTotal))
294 		Log("Agree\n");
295 	else
296 		Log("** DISAGREE **\n");
297 	}
298 #endif
299 //	return scoreTotal / uPairCount;
300 	return scoreTotal;
301 	}
302 
303 // Objective score defined as the dynamic programming score.
304 // Input is two alignments, which must be of the same length.
305 // Result is the same profile-profile score that is optimized
306 // by dynamic programming.
ObjScoreDP(const MSA & msa1,const MSA & msa2,SCORE MatchScore[])307 SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[])
308 	{
309 	const unsigned uColCount = msa1.GetColCount();
310 	if (msa2.GetColCount() != uColCount)
311 		Quit("ObjScoreDP, must be same length");
312 
313 	const unsigned uColCount1 = msa1.GetColCount();
314 	const unsigned uColCount2 = msa2.GetColCount();
315 
316 	const ProfPos *PA = ProfileFromMSA(msa1);
317 	const ProfPos *PB = ProfileFromMSA(msa2);
318 
319 	return ObjScoreDP_Profs(PA, PB, uColCount1, MatchScore);
320 	}
321 
ObjScoreDP_Profs(const ProfPos * PA,const ProfPos * PB,unsigned uColCount,SCORE MatchScore[])322 SCORE ObjScoreDP_Profs(const ProfPos *PA, const ProfPos *PB, unsigned uColCount,
323   SCORE MatchScore[])
324 	{
325 //#if	TRACE
326 //	Log("Profile 1:\n");
327 //	ListProfile(PA, uColCount, &msa1);
328 //
329 //	Log("Profile 2:\n");
330 //	ListProfile(PB, uColCount, &msa2);
331 //#endif
332 
333 	SCORE scoreTotal = 0;
334 
335 	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
336 		{
337 		const ProfPos &PPA = PA[uColIndex];
338 		const ProfPos &PPB = PB[uColIndex];
339 
340 		SCORE scoreGap = 0;
341 		SCORE scoreMatch = 0;
342 	// If gapped column...
343 		if (PPA.m_bAllGaps && PPB.m_bAllGaps)
344 			scoreGap = 0;
345 		else if (PPA.m_bAllGaps)
346 			{
347 			if (uColCount - 1 == uColIndex || !PA[uColIndex+1].m_bAllGaps)
348 				scoreGap = PPB.m_scoreGapClose;
349 			if (0 == uColIndex || !PA[uColIndex-1].m_bAllGaps)
350 				scoreGap += PPB.m_scoreGapOpen;
351 			//if (0 == scoreGap)
352 			//	scoreGap = PPB.m_scoreGapExtend;
353 			}
354 		else if (PPB.m_bAllGaps)
355 			{
356 			if (uColCount - 1 == uColIndex || !PB[uColIndex+1].m_bAllGaps)
357 				scoreGap = PPA.m_scoreGapClose;
358 			if (0 == uColIndex || !PB[uColIndex-1].m_bAllGaps)
359 				scoreGap += PPA.m_scoreGapOpen;
360 			//if (0 == scoreGap)
361 			//	scoreGap = PPA.m_scoreGapExtend;
362 			}
363 		else
364 			scoreMatch = ScoreProfPos2(PPA, PPB);
365 
366 		if (0 != MatchScore)
367 			MatchScore[uColIndex] = scoreMatch;
368 
369 		scoreTotal += scoreMatch + scoreGap;
370 
371 		extern bool g_bTracePPScore;
372 		extern MSA *g_ptrPPScoreMSA1;
373 		extern MSA *g_ptrPPScoreMSA2;
374 		if (g_bTracePPScore)
375 			{
376 			const MSA &msa1 = *g_ptrPPScoreMSA1;
377 			const MSA &msa2 = *g_ptrPPScoreMSA2;
378 			const unsigned uSeqCount1 = msa1.GetSeqCount();
379 			const unsigned uSeqCount2 = msa2.GetSeqCount();
380 
381 			for (unsigned n = 0; n < uSeqCount1; ++n)
382 				Log("%c", msa1.GetChar(n, uColIndex));
383 			Log("  ");
384 			for (unsigned n = 0; n < uSeqCount2; ++n)
385 				Log("%c", msa2.GetChar(n, uColIndex));
386 			Log("  %10.3f", scoreMatch);
387 			if (scoreGap != 0)
388 				Log("  %10.3f", scoreGap);
389 			Log("\n");
390 			}
391 		}
392 
393 	delete[] PA;
394 	delete[] PB;
395 
396 	return scoreTotal;
397 	}
398 
399 // Objective score defined as the sum of profile-sequence
400 // scores for each sequence in the alignment. The profile
401 // is computed from the entire alignment, so this includes
402 // the score of each sequence against itself. This is to
403 // avoid recomputing the profile each time, so we reduce
404 // complexity but introduce a questionable approximation.
405 // The goal is to see if we can exploit the apparent
406 // improvement in performance of log-expectation score
407 // over the usual sum-of-pairs by optimizing this
408 // objective score in the iterative refinement stage.
ObjScorePS(const MSA & msa,SCORE MatchScore[])409 SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[])
410 	{
411 	if (g_PPScore != PPSCORE_LE)
412 		Quit("FastScoreMSA_LASimple: LA");
413 
414 	const unsigned uSeqCount = msa.GetSeqCount();
415 	const unsigned uColCount = msa.GetColCount();
416 
417 	const ProfPos *Prof = ProfileFromMSA(msa);
418 
419 	if (0 != MatchScore)
420 		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
421 			MatchScore[uColIndex] = 0;
422 
423 	SCORE scoreTotal = 0;
424 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
425 		{
426 		const WEIGHT weightSeq = msa.GetSeqWeight(uSeqIndex);
427 		SCORE scoreSeq = 0;
428 		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
429 			{
430 			const ProfPos &PP = Prof[uColIndex];
431 			if (msa.IsGap(uSeqIndex, uColIndex))
432 				{
433 				bool bOpen = (0 == uColIndex ||
434 				  !msa.IsGap(uSeqIndex, uColIndex - 1));
435 				bool bClose = (uColCount - 1 == uColIndex ||
436 				  !msa.IsGap(uSeqIndex, uColIndex + 1));
437 
438 				if (bOpen)
439 					scoreSeq += PP.m_scoreGapOpen;
440 				if (bClose)
441 					scoreSeq += PP.m_scoreGapClose;
442 				//if (!bOpen && !bClose)
443 				//	scoreSeq += PP.m_scoreGapExtend;
444 				}
445 			else if (msa.IsWildcard(uSeqIndex, uColIndex))
446 				continue;
447 			else
448 				{
449 				unsigned uLetter = msa.GetLetter(uSeqIndex, uColIndex);
450 				const SCORE scoreMatch = PP.m_AAScores[uLetter];
451 				if (0 != MatchScore)
452 					MatchScore[uColIndex] += weightSeq*scoreMatch;
453 				scoreSeq += scoreMatch;
454 				}
455 			}
456 		scoreTotal += weightSeq*scoreSeq;
457 		}
458 
459 	delete[] Prof;
460 	return scoreTotal;
461 	}
462 
463 // The XP score is the sum of the score of each pair of
464 // sequences between two profiles which are aligned to
465 // each other. Notice that for two given profiles aligned
466 // in different ways, the difference in XP score must be
467 // the same as the difference in SP score because the
468 // score of a pair of sequences in one profile doesn't
469 // depend on the alignment.
ObjScoreXP(const MSA & msa1,const MSA & msa2)470 SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2)
471 	{
472 	const unsigned uColCount1 = msa1.GetColCount();
473 	const unsigned uColCount2 = msa2.GetColCount();
474 	if (uColCount1 != uColCount2)
475 		Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2);
476 
477 	const unsigned uSeqCount1 = msa1.GetSeqCount();
478 	const unsigned uSeqCount2 = msa2.GetSeqCount();
479 
480 #if	TRACE
481 	Log("     Score  Weight  Weight       Total\n");
482 	Log("----------  ------  ------  ----------\n");
483 #endif
484 
485 	SCORE scoreTotal = 0;
486 	unsigned uPairCount = 0;
487 	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1)
488 		{
489 		const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1);
490 		for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2)
491 			{
492 			const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2);
493 			const WEIGHT w = w1*w2;
494 			SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2);
495 			SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2);
496 			SCORE scorePair = scoreLetters + scoreGaps;
497 			scoreTotal += w1*w2*scorePair;
498 			++uPairCount;
499 #if	TRACE
500 			Log("%10.2f  %6.3f  %6.3f  %10.2f  >%s >%s\n",
501 			  scorePair,
502 			  w1,
503 			  w2,
504 			  scorePair*w1*w2,
505 			  msa1.GetSeqName(uSeqIndex1),
506 			  msa2.GetSeqName(uSeqIndex2));
507 #endif
508 			}
509 		}
510 	if (0 == uPairCount)
511 		Quit("0 == uPairCount");
512 
513 #if	TRACE
514 	Log("msa1=\n");
515 	msa1.LogMe();
516 	Log("msa2=\n");
517 	msa2.LogMe();
518 	Log("XP=%g\n", scoreTotal);
519 #endif
520 //	return scoreTotal / uPairCount;
521 	return scoreTotal;
522 	}
523