1 #include "muscle.h"
2 #include "msa.h"
3 #include "seqvect.h"
4 #include "profile.h"
5 #include "tree.h"
6 
7 // These global variables are a hack to allow the tree
8 // dependent iteration code to communicate the edge
9 // used to divide the tree. The three-way weighting
10 // scheme needs to know this edge in order to compute
11 // sequence weights.
12 static const Tree *g_ptrMuscleTree = 0;
13 unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR;
14 unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR;
15 
GetFractionalWeightedCounts(unsigned uColIndex,bool bNormalize,FCOUNT fcCounts[],FCOUNT * ptrfcGapStart,FCOUNT * ptrfcGapEnd,FCOUNT * ptrfcGapExtend,FCOUNT * ptrfOcc,FCOUNT * ptrfcLL,FCOUNT * ptrfcLG,FCOUNT * ptrfcGL,FCOUNT * ptrfcGG) const16 void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
17   FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
18   FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,
19   FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const
20 	{
21 	const unsigned uSeqCount = GetSeqCount();
22 	const unsigned uColCount = GetColCount();
23 
24 	memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));
25 	WEIGHT wTotal = 0;
26 	FCOUNT fGap = 0;
27 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
28 		{
29 		const WEIGHT w = GetSeqWeight(uSeqIndex);
30 		if (IsGap(uSeqIndex, uColIndex))
31 			{
32 			fGap += w;
33 			continue;
34 			}
35 		else if (IsWildcard(uSeqIndex, uColIndex))
36 			{
37 			const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
38 			switch (g_Alpha)
39 				{
40 			case ALPHA_Amino:
41 				switch (uLetter)
42 					{
43 				case AX_B:		// D or N
44 					fcCounts[AX_D] += w/2;
45 					fcCounts[AX_N] += w/2;
46 					break;
47 				case AX_Z:		// E or Q
48 					fcCounts[AX_E] += w/2;
49 					fcCounts[AX_Q] += w/2;
50 					break;
51 				default:		// any
52 					{
53 					const FCOUNT f = w/20;
54 					for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
55 						fcCounts[uLetter] += f;
56 					break;
57 					}
58 					}
59 				break;
60 
61 			case ALPHA_DNA:
62 			case ALPHA_RNA:
63 				switch (uLetter)
64 					{
65 				case AX_R:	// G or A
66 					fcCounts[NX_G] += w/2;
67 					fcCounts[NX_A] += w/2;
68 					break;
69 				case AX_Y:	// C or T/U
70 					fcCounts[NX_C] += w/2;
71 					fcCounts[NX_T] += w/2;
72 					break;
73 				default:	// any
74 					const FCOUNT f = w/20;
75 					for (unsigned uLetter = 0; uLetter < 4; ++uLetter)
76 						fcCounts[uLetter] += f;
77 					break;
78 					}
79 				break;
80 
81 			default:
82 				Quit("Alphabet %d not supported", g_Alpha);
83 				}
84 			continue;
85 			}
86 		unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
87 		fcCounts[uLetter] += w;
88 		wTotal += w;
89 		}
90 	*ptrfOcc = (float) (1.0 - fGap);
91 
92 	if (bNormalize && wTotal > 0)
93 		{
94 		if (wTotal > 1.001)
95 			Quit("wTotal=%g\n", wTotal);
96 		for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
97 			fcCounts[uLetter] /= wTotal;
98 //		AssertNormalized(fcCounts);
99 		}
100 
101 	FCOUNT fcStartCount = 0;
102 	if (uColIndex == 0)
103 		{
104 		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
105 			if (IsGap(uSeqIndex, uColIndex))
106 				fcStartCount += GetSeqWeight(uSeqIndex);
107 		}
108 	else
109 		{
110 		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
111 			if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1))
112 				fcStartCount += GetSeqWeight(uSeqIndex);
113 		}
114 
115 	FCOUNT fcEndCount = 0;
116 	if (uColCount - 1 == uColIndex)
117 		{
118 		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
119 			if (IsGap(uSeqIndex, uColIndex))
120 				fcEndCount += GetSeqWeight(uSeqIndex);
121 		}
122 	else
123 		{
124 		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
125 			if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1))
126 				fcEndCount += GetSeqWeight(uSeqIndex);
127 		}
128 
129 	FCOUNT LL = 0;
130 	FCOUNT LG = 0;
131 	FCOUNT GL = 0;
132 	FCOUNT GG = 0;
133 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
134 		{
135 		WEIGHT w = GetSeqWeight(uSeqIndex);
136 		bool bLetterHere = !IsGap(uSeqIndex, uColIndex);
137 		bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1));
138 		if (bLetterHere)
139 			{
140 			if (bLetterPrev)
141 				LL += w;
142 			else
143 				GL += w;
144 			}
145 		else
146 			{
147 			if (bLetterPrev)
148 				LG += w;
149 			else
150 				GG += w;
151 			}
152 		}
153 
154 	FCOUNT fcExtendCount = 0;
155 	if (uColIndex > 0 && uColIndex < GetColCount() - 1)
156 		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
157 			if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) &&
158 			  IsGap(uSeqIndex, uColIndex + 1))
159 				fcExtendCount += GetSeqWeight(uSeqIndex);
160 
161 	*ptrfcLL = LL;
162 	*ptrfcLG = LG;
163 	*ptrfcGL = GL;
164 	*ptrfcGG = GG;
165 	*ptrfcGapStart = fcStartCount;
166 	*ptrfcGapEnd = fcEndCount;
167 	*ptrfcGapExtend = fcExtendCount;
168 	}
169 
170 // Return true if the given column has no gaps and all
171 // its residues are in the same biochemical group.
MSAColIsConservative(const MSA & msa,unsigned uColIndex)172 bool MSAColIsConservative(const MSA &msa, unsigned uColIndex)
173 	{
174 	extern unsigned ResidueGroup[];
175 
176 	const unsigned uSeqCount = msa.GetColCount();
177 	if (0 == uSeqCount)
178 		Quit("MSAColIsConservative: empty alignment");
179 
180 	if (msa.IsGap(0, uColIndex))
181 		return false;
182 
183 	unsigned uLetter = msa.GetLetterEx(0, uColIndex);
184 	const unsigned uGroup = ResidueGroup[uLetter];
185 
186 	for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex)
187 		{
188 		if (msa.IsGap(uSeqIndex, uColIndex))
189 			return false;
190 		uLetter = msa.GetLetter(uSeqIndex, uColIndex);
191 		if (ResidueGroup[uLetter] != uGroup)
192 			return false;
193 		}
194 	return true;
195 	}
196 
MSAFromSeqRange(const MSA & msaIn,unsigned uFromSeqIndex,unsigned uSeqCount,MSA & msaOut)197 void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,
198   MSA &msaOut)
199 	{
200 	const unsigned uColCount = msaIn.GetColCount();
201 	msaOut.SetSize(uSeqCount, uColCount);
202 
203 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
204 		{
205 		const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex);
206 		msaOut.SetSeqName(uSeqIndex, ptrName);
207 
208 		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
209 			{
210 			const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex);
211 			msaOut.SetChar(uSeqIndex, uColIndex, c);
212 			}
213 		}
214 	}
215 
MSAFromColRange(const MSA & msaIn,unsigned uFromColIndex,unsigned uColCount,MSA & msaOut)216 void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,
217   MSA &msaOut)
218 	{
219 	const unsigned uSeqCount = msaIn.GetSeqCount();
220 	const unsigned uInColCount = msaIn.GetColCount();
221 
222 	if (uFromColIndex + uColCount - 1 > uInColCount)
223 		Quit("MSAFromColRange, out of bounds");
224 
225 	msaOut.SetSize(uSeqCount, uColCount);
226 
227 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
228 		{
229 		const char *ptrName = msaIn.GetSeqName(uSeqIndex);
230 		unsigned uId = msaIn.GetSeqId(uSeqIndex);
231 		msaOut.SetSeqName(uSeqIndex, ptrName);
232 		msaOut.SetSeqId(uSeqIndex, uId);
233 
234 		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
235 			{
236 			const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex);
237 			msaOut.SetChar(uSeqIndex, uColIndex, c);
238 			}
239 		}
240 	}
241 
SeqVectFromMSA(const MSA & msa,SeqVect & v)242 void SeqVectFromMSA(const MSA &msa, SeqVect &v)
243 	{
244 	v.Clear();
245 	const unsigned uSeqCount = msa.GetSeqCount();
246 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
247 		{
248 		Seq s;
249 		msa.GetSeq(uSeqIndex, s);
250 
251 		s.StripGaps();
252 		//if (0 == s.Length())
253 		//	continue;
254 
255 		const char *ptrName = msa.GetSeqName(uSeqIndex);
256 		s.SetName(ptrName);
257 
258 		v.AppendSeq(s);
259 		}
260 	}
261 
DeleteGappedCols(MSA & msa)262 void DeleteGappedCols(MSA &msa)
263 	{
264 	unsigned uColIndex = 0;
265 	for (;;)
266 		{
267 		if (uColIndex >= msa.GetColCount())
268 			break;
269 		if (msa.IsGapColumn(uColIndex))
270 			msa.DeleteCol(uColIndex);
271 		else
272 			++uColIndex;
273 		}
274 	}
275 
MSAFromSeqSubset(const MSA & msaIn,const unsigned uSeqIndexes[],unsigned uSeqCount,MSA & msaOut)276 void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,
277   MSA &msaOut)
278 	{
279 	const unsigned uColCount = msaIn.GetColCount();
280 	msaOut.SetSize(uSeqCount, uColCount);
281 	for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut)
282 		{
283 		unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut];
284 		const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);
285 		unsigned uId = msaIn.GetSeqId(uSeqIndexIn);
286 		msaOut.SetSeqName(uSeqIndexOut, ptrName);
287 		msaOut.SetSeqId(uSeqIndexOut, uId);
288 		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
289 			{
290 			const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);
291 			msaOut.SetChar(uSeqIndexOut, uColIndex, c);
292 			}
293 		}
294 	}
295 
AssertMSAEqIgnoreCaseAndGaps(const MSA & msa1,const MSA & msa2)296 void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2)
297 	{
298 	const unsigned uSeqCount1 = msa1.GetSeqCount();
299 	const unsigned uSeqCount2 = msa2.GetSeqCount();
300 	if (uSeqCount1 != uSeqCount2)
301 		Quit("Seq count differs");
302 
303 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)
304 		{
305 		Seq seq1;
306 		msa1.GetSeq(uSeqIndex, seq1);
307 
308 		unsigned uId = msa1.GetSeqId(uSeqIndex);
309 		unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
310 
311 		Seq seq2;
312 		msa2.GetSeq(uSeqIndex2, seq2);
313 
314 		if (!seq1.EqIgnoreCaseAndGaps(seq2))
315 			{
316 			Log("Input:\n");
317 			seq1.LogMe();
318 			Log("Output:\n");
319 			seq2.LogMe();
320 			Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));
321 			}
322 		}
323 	}
324 
AssertMSAEq(const MSA & msa1,const MSA & msa2)325 void AssertMSAEq(const MSA &msa1, const MSA &msa2)
326 	{
327 	const unsigned uSeqCount1 = msa1.GetSeqCount();
328 	const unsigned uSeqCount2 = msa2.GetSeqCount();
329 	if (uSeqCount1 != uSeqCount2)
330 		Quit("Seq count differs");
331 
332 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)
333 		{
334 		Seq seq1;
335 		msa1.GetSeq(uSeqIndex, seq1);
336 
337 		unsigned uId = msa1.GetSeqId(uSeqIndex);
338 		unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
339 
340 		Seq seq2;
341 		msa2.GetSeq(uSeqIndex2, seq2);
342 
343 		if (!seq1.Eq(seq2))
344 			{
345 			Log("Input:\n");
346 			seq1.LogMe();
347 			Log("Output:\n");
348 			seq2.LogMe();
349 			Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));
350 			}
351 		}
352 	}
353 
SetMSAWeightsMuscle(MSA & msa)354 void SetMSAWeightsMuscle(MSA &msa)
355 	{
356 	SEQWEIGHT Method = GetSeqWeightMethod();
357 	switch (Method)
358 		{
359 	case SEQWEIGHT_None:
360 		msa.SetUniformWeights();
361 		return;
362 
363 	case SEQWEIGHT_Henikoff:
364 		msa.SetHenikoffWeights();
365 		return;
366 
367 	case SEQWEIGHT_HenikoffPB:
368 		msa.SetHenikoffWeightsPB();
369 		return;
370 
371 	case SEQWEIGHT_GSC:
372 		msa.SetGSCWeights();
373 		return;
374 
375 	case SEQWEIGHT_ClustalW:
376 		SetClustalWWeightsMuscle(msa);
377 		return;
378 
379 	case SEQWEIGHT_ThreeWay:
380 		SetThreeWayWeightsMuscle(msa);
381 		return;
382 		}
383 	Quit("SetMSAWeightsMuscle, Invalid method=%d", Method);
384 	}
385 
386 static WEIGHT *g_MuscleWeights;
387 static unsigned g_uMuscleIdCount;
388 
GetMuscleSeqWeightById(unsigned uId)389 WEIGHT GetMuscleSeqWeightById(unsigned uId)
390 	{
391 	if (0 == g_MuscleWeights)
392 		Quit("g_MuscleWeights = 0");
393 	if (uId >= g_uMuscleIdCount)
394 		Quit("GetMuscleSeqWeightById(%u): count=%u",
395 		  uId, g_uMuscleIdCount);
396 
397 	return g_MuscleWeights[uId];
398 	}
399 
SetMuscleTree(const Tree & tree)400 void SetMuscleTree(const Tree &tree)
401 	{
402 	g_ptrMuscleTree = &tree;
403 
404 	if (SEQWEIGHT_ClustalW != GetSeqWeightMethod())
405 		return;
406 
407 	delete[] g_MuscleWeights;
408 
409 	const unsigned uLeafCount = tree.GetLeafCount();
410 	g_uMuscleIdCount = uLeafCount;
411 	g_MuscleWeights = new WEIGHT[uLeafCount];
412 	CalcClustalWWeights(tree, g_MuscleWeights);
413 	}
414 
SetClustalWWeightsMuscle(MSA & msa)415 void SetClustalWWeightsMuscle(MSA &msa)
416 	{
417 	if (0 == g_MuscleWeights)
418 		Quit("g_MuscleWeights = 0");
419 	const unsigned uSeqCount = msa.GetSeqCount();
420 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
421 		{
422 		const unsigned uId = msa.GetSeqId(uSeqIndex);
423 		if (uId >= g_uMuscleIdCount)
424 			Quit("SetClustalWWeightsMuscle: id out of range");
425 		msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]);
426 		}
427 	msa.NormalizeWeights((WEIGHT) 1.0);
428 	}
429 
430 #define	LOCAL_VERBOSE	0
431 
SetThreeWayWeightsMuscle(MSA & msa)432 void SetThreeWayWeightsMuscle(MSA &msa)
433 	{
434 	if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2)
435 		{
436 		msa.SetHenikoffWeightsPB();
437 		return;
438 		}
439 
440 	const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();
441 	WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];
442 
443 	CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,
444 	  Weights);
445 
446 	const unsigned uMSASeqCount = msa.GetSeqCount();
447 	for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex)
448 		{
449 		const unsigned uId = msa.GetSeqId(uSeqIndex);
450 		if (uId >= uMuscleSeqCount)
451 			Quit("SetThreeWayWeightsMuscle: id out of range");
452 		msa.SetSeqWeight(uSeqIndex, Weights[uId]);
453 		}
454 #if	LOCAL_VERBOSE
455 	{
456 	Log("SetThreeWayWeightsMuscle\n");
457 	for (unsigned n = 0; n < uMSASeqCount; ++n)
458 		{
459 		const unsigned uId = msa.GetSeqId(n);
460 		Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);
461 		}
462 	}
463 #endif
464 	msa.NormalizeWeights((WEIGHT) 1.0);
465 
466 	delete[] Weights;
467 	}
468 
469 // Append msa2 at the end of msa1
MSAAppend(MSA & msa1,const MSA & msa2)470 void MSAAppend(MSA &msa1, const MSA &msa2)
471 	{
472 	const unsigned uSeqCount = msa1.GetSeqCount();
473 
474 	const unsigned uColCount1 = msa1.GetColCount();
475 	const unsigned uColCount2 = msa2.GetColCount();
476 	const unsigned uColCountCat = uColCount1 + uColCount2;
477 
478 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
479 		{
480 		unsigned uId = msa1.GetSeqId(uSeqIndex);
481 		unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
482 		for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
483 			{
484 			const char c = msa2.GetChar(uSeqIndex2, uColIndex);
485 			msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c);
486 			}
487 		}
488 	}
489 
490 // "Catenate" two MSAs (by bad analogy with UNIX cat command).
491 // msa1 and msa2 must have same sequence names, but possibly
492 // in a different order.
493 // msaCat is the combined alignment produce by appending
494 // sequences in msa2 to sequences in msa1.
MSACat(const MSA & msa1,const MSA & msa2,MSA & msaCat)495 void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat)
496 	{
497 	const unsigned uSeqCount = msa1.GetSeqCount();
498 
499 	const unsigned uColCount1 = msa1.GetColCount();
500 	const unsigned uColCount2 = msa2.GetColCount();
501 	const unsigned uColCountCat = uColCount1 + uColCount2;
502 
503 	msaCat.SetSize(uSeqCount, uColCountCat);
504 
505 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
506 		{
507 		for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex)
508 			{
509 			const char c = msa1.GetChar(uSeqIndex, uColIndex);
510 			msaCat.SetChar(uSeqIndex, uColIndex, c);
511 			}
512 
513 		const char *ptrSeqName = msa1.GetSeqName(uSeqIndex);
514 		unsigned uSeqIndex2;
515 		msaCat.SetSeqName(uSeqIndex, ptrSeqName);
516 		bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2);
517 		if (bFound)
518 			{
519 			for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
520 				{
521 				const char c = msa2.GetChar(uSeqIndex2, uColIndex);
522 				msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c);
523 				}
524 			}
525 		else
526 			{
527 			for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
528 				msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-');
529 			}
530 		}
531 	}
532