1 #include "muscle.h"
2 #include "msa.h"
3 #include "seqvect.h"
4 #include "profile.h"
5 #include "tree.h"
6
7 // These global variables are a hack to allow the tree
8 // dependent iteration code to communicate the edge
9 // used to divide the tree. The three-way weighting
10 // scheme needs to know this edge in order to compute
11 // sequence weights.
12 static const Tree *g_ptrMuscleTree = 0;
13 unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR;
14 unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR;
15
GetFractionalWeightedCounts(unsigned uColIndex,bool bNormalize,FCOUNT fcCounts[],FCOUNT * ptrfcGapStart,FCOUNT * ptrfcGapEnd,FCOUNT * ptrfcGapExtend,FCOUNT * ptrfOcc,FCOUNT * ptrfcLL,FCOUNT * ptrfcLG,FCOUNT * ptrfcGL,FCOUNT * ptrfcGG) const16 void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
17 FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
18 FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,
19 FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const
20 {
21 const unsigned uSeqCount = GetSeqCount();
22 const unsigned uColCount = GetColCount();
23
24 memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));
25 WEIGHT wTotal = 0;
26 FCOUNT fGap = 0;
27 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
28 {
29 const WEIGHT w = GetSeqWeight(uSeqIndex);
30 if (IsGap(uSeqIndex, uColIndex))
31 {
32 fGap += w;
33 continue;
34 }
35 else if (IsWildcard(uSeqIndex, uColIndex))
36 {
37 const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
38 switch (g_Alpha)
39 {
40 case ALPHA_Amino:
41 switch (uLetter)
42 {
43 case AX_B: // D or N
44 fcCounts[AX_D] += w/2;
45 fcCounts[AX_N] += w/2;
46 break;
47 case AX_Z: // E or Q
48 fcCounts[AX_E] += w/2;
49 fcCounts[AX_Q] += w/2;
50 break;
51 default: // any
52 {
53 const FCOUNT f = w/20;
54 for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
55 fcCounts[uLetter] += f;
56 break;
57 }
58 }
59 break;
60
61 case ALPHA_DNA:
62 case ALPHA_RNA:
63 switch (uLetter)
64 {
65 case AX_R: // G or A
66 fcCounts[NX_G] += w/2;
67 fcCounts[NX_A] += w/2;
68 break;
69 case AX_Y: // C or T/U
70 fcCounts[NX_C] += w/2;
71 fcCounts[NX_T] += w/2;
72 break;
73 default: // any
74 const FCOUNT f = w/20;
75 for (unsigned uLetter = 0; uLetter < 4; ++uLetter)
76 fcCounts[uLetter] += f;
77 break;
78 }
79 break;
80
81 default:
82 Quit("Alphabet %d not supported", g_Alpha);
83 }
84 continue;
85 }
86 unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
87 fcCounts[uLetter] += w;
88 wTotal += w;
89 }
90 *ptrfOcc = (float) (1.0 - fGap);
91
92 if (bNormalize && wTotal > 0)
93 {
94 if (wTotal > 1.001)
95 Quit("wTotal=%g\n", wTotal);
96 for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
97 fcCounts[uLetter] /= wTotal;
98 // AssertNormalized(fcCounts);
99 }
100
101 FCOUNT fcStartCount = 0;
102 if (uColIndex == 0)
103 {
104 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
105 if (IsGap(uSeqIndex, uColIndex))
106 fcStartCount += GetSeqWeight(uSeqIndex);
107 }
108 else
109 {
110 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
111 if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1))
112 fcStartCount += GetSeqWeight(uSeqIndex);
113 }
114
115 FCOUNT fcEndCount = 0;
116 if (uColCount - 1 == uColIndex)
117 {
118 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
119 if (IsGap(uSeqIndex, uColIndex))
120 fcEndCount += GetSeqWeight(uSeqIndex);
121 }
122 else
123 {
124 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
125 if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1))
126 fcEndCount += GetSeqWeight(uSeqIndex);
127 }
128
129 FCOUNT LL = 0;
130 FCOUNT LG = 0;
131 FCOUNT GL = 0;
132 FCOUNT GG = 0;
133 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
134 {
135 WEIGHT w = GetSeqWeight(uSeqIndex);
136 bool bLetterHere = !IsGap(uSeqIndex, uColIndex);
137 bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1));
138 if (bLetterHere)
139 {
140 if (bLetterPrev)
141 LL += w;
142 else
143 GL += w;
144 }
145 else
146 {
147 if (bLetterPrev)
148 LG += w;
149 else
150 GG += w;
151 }
152 }
153
154 FCOUNT fcExtendCount = 0;
155 if (uColIndex > 0 && uColIndex < GetColCount() - 1)
156 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
157 if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) &&
158 IsGap(uSeqIndex, uColIndex + 1))
159 fcExtendCount += GetSeqWeight(uSeqIndex);
160
161 *ptrfcLL = LL;
162 *ptrfcLG = LG;
163 *ptrfcGL = GL;
164 *ptrfcGG = GG;
165 *ptrfcGapStart = fcStartCount;
166 *ptrfcGapEnd = fcEndCount;
167 *ptrfcGapExtend = fcExtendCount;
168 }
169
170 // Return true if the given column has no gaps and all
171 // its residues are in the same biochemical group.
MSAColIsConservative(const MSA & msa,unsigned uColIndex)172 bool MSAColIsConservative(const MSA &msa, unsigned uColIndex)
173 {
174 extern unsigned ResidueGroup[];
175
176 const unsigned uSeqCount = msa.GetColCount();
177 if (0 == uSeqCount)
178 Quit("MSAColIsConservative: empty alignment");
179
180 if (msa.IsGap(0, uColIndex))
181 return false;
182
183 unsigned uLetter = msa.GetLetterEx(0, uColIndex);
184 const unsigned uGroup = ResidueGroup[uLetter];
185
186 for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex)
187 {
188 if (msa.IsGap(uSeqIndex, uColIndex))
189 return false;
190 uLetter = msa.GetLetter(uSeqIndex, uColIndex);
191 if (ResidueGroup[uLetter] != uGroup)
192 return false;
193 }
194 return true;
195 }
196
MSAFromSeqRange(const MSA & msaIn,unsigned uFromSeqIndex,unsigned uSeqCount,MSA & msaOut)197 void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,
198 MSA &msaOut)
199 {
200 const unsigned uColCount = msaIn.GetColCount();
201 msaOut.SetSize(uSeqCount, uColCount);
202
203 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
204 {
205 const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex);
206 msaOut.SetSeqName(uSeqIndex, ptrName);
207
208 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
209 {
210 const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex);
211 msaOut.SetChar(uSeqIndex, uColIndex, c);
212 }
213 }
214 }
215
MSAFromColRange(const MSA & msaIn,unsigned uFromColIndex,unsigned uColCount,MSA & msaOut)216 void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,
217 MSA &msaOut)
218 {
219 const unsigned uSeqCount = msaIn.GetSeqCount();
220 const unsigned uInColCount = msaIn.GetColCount();
221
222 if (uFromColIndex + uColCount - 1 > uInColCount)
223 Quit("MSAFromColRange, out of bounds");
224
225 msaOut.SetSize(uSeqCount, uColCount);
226
227 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
228 {
229 const char *ptrName = msaIn.GetSeqName(uSeqIndex);
230 unsigned uId = msaIn.GetSeqId(uSeqIndex);
231 msaOut.SetSeqName(uSeqIndex, ptrName);
232 msaOut.SetSeqId(uSeqIndex, uId);
233
234 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
235 {
236 const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex);
237 msaOut.SetChar(uSeqIndex, uColIndex, c);
238 }
239 }
240 }
241
SeqVectFromMSA(const MSA & msa,SeqVect & v)242 void SeqVectFromMSA(const MSA &msa, SeqVect &v)
243 {
244 v.Clear();
245 const unsigned uSeqCount = msa.GetSeqCount();
246 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
247 {
248 Seq s;
249 msa.GetSeq(uSeqIndex, s);
250
251 s.StripGaps();
252 //if (0 == s.Length())
253 // continue;
254
255 const char *ptrName = msa.GetSeqName(uSeqIndex);
256 s.SetName(ptrName);
257
258 v.AppendSeq(s);
259 }
260 }
261
DeleteGappedCols(MSA & msa)262 void DeleteGappedCols(MSA &msa)
263 {
264 unsigned uColIndex = 0;
265 for (;;)
266 {
267 if (uColIndex >= msa.GetColCount())
268 break;
269 if (msa.IsGapColumn(uColIndex))
270 msa.DeleteCol(uColIndex);
271 else
272 ++uColIndex;
273 }
274 }
275
MSAFromSeqSubset(const MSA & msaIn,const unsigned uSeqIndexes[],unsigned uSeqCount,MSA & msaOut)276 void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,
277 MSA &msaOut)
278 {
279 const unsigned uColCount = msaIn.GetColCount();
280 msaOut.SetSize(uSeqCount, uColCount);
281 for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut)
282 {
283 unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut];
284 const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);
285 unsigned uId = msaIn.GetSeqId(uSeqIndexIn);
286 msaOut.SetSeqName(uSeqIndexOut, ptrName);
287 msaOut.SetSeqId(uSeqIndexOut, uId);
288 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
289 {
290 const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);
291 msaOut.SetChar(uSeqIndexOut, uColIndex, c);
292 }
293 }
294 }
295
AssertMSAEqIgnoreCaseAndGaps(const MSA & msa1,const MSA & msa2)296 void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2)
297 {
298 const unsigned uSeqCount1 = msa1.GetSeqCount();
299 const unsigned uSeqCount2 = msa2.GetSeqCount();
300 if (uSeqCount1 != uSeqCount2)
301 Quit("Seq count differs");
302
303 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)
304 {
305 Seq seq1;
306 msa1.GetSeq(uSeqIndex, seq1);
307
308 unsigned uId = msa1.GetSeqId(uSeqIndex);
309 unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
310
311 Seq seq2;
312 msa2.GetSeq(uSeqIndex2, seq2);
313
314 if (!seq1.EqIgnoreCaseAndGaps(seq2))
315 {
316 Log("Input:\n");
317 seq1.LogMe();
318 Log("Output:\n");
319 seq2.LogMe();
320 Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));
321 }
322 }
323 }
324
AssertMSAEq(const MSA & msa1,const MSA & msa2)325 void AssertMSAEq(const MSA &msa1, const MSA &msa2)
326 {
327 const unsigned uSeqCount1 = msa1.GetSeqCount();
328 const unsigned uSeqCount2 = msa2.GetSeqCount();
329 if (uSeqCount1 != uSeqCount2)
330 Quit("Seq count differs");
331
332 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)
333 {
334 Seq seq1;
335 msa1.GetSeq(uSeqIndex, seq1);
336
337 unsigned uId = msa1.GetSeqId(uSeqIndex);
338 unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
339
340 Seq seq2;
341 msa2.GetSeq(uSeqIndex2, seq2);
342
343 if (!seq1.Eq(seq2))
344 {
345 Log("Input:\n");
346 seq1.LogMe();
347 Log("Output:\n");
348 seq2.LogMe();
349 Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));
350 }
351 }
352 }
353
SetMSAWeightsMuscle(MSA & msa)354 void SetMSAWeightsMuscle(MSA &msa)
355 {
356 SEQWEIGHT Method = GetSeqWeightMethod();
357 switch (Method)
358 {
359 case SEQWEIGHT_None:
360 msa.SetUniformWeights();
361 return;
362
363 case SEQWEIGHT_Henikoff:
364 msa.SetHenikoffWeights();
365 return;
366
367 case SEQWEIGHT_HenikoffPB:
368 msa.SetHenikoffWeightsPB();
369 return;
370
371 case SEQWEIGHT_GSC:
372 msa.SetGSCWeights();
373 return;
374
375 case SEQWEIGHT_ClustalW:
376 SetClustalWWeightsMuscle(msa);
377 return;
378
379 case SEQWEIGHT_ThreeWay:
380 SetThreeWayWeightsMuscle(msa);
381 return;
382 }
383 Quit("SetMSAWeightsMuscle, Invalid method=%d", Method);
384 }
385
386 static WEIGHT *g_MuscleWeights;
387 static unsigned g_uMuscleIdCount;
388
GetMuscleSeqWeightById(unsigned uId)389 WEIGHT GetMuscleSeqWeightById(unsigned uId)
390 {
391 if (0 == g_MuscleWeights)
392 Quit("g_MuscleWeights = 0");
393 if (uId >= g_uMuscleIdCount)
394 Quit("GetMuscleSeqWeightById(%u): count=%u",
395 uId, g_uMuscleIdCount);
396
397 return g_MuscleWeights[uId];
398 }
399
SetMuscleTree(const Tree & tree)400 void SetMuscleTree(const Tree &tree)
401 {
402 g_ptrMuscleTree = &tree;
403
404 if (SEQWEIGHT_ClustalW != GetSeqWeightMethod())
405 return;
406
407 delete[] g_MuscleWeights;
408
409 const unsigned uLeafCount = tree.GetLeafCount();
410 g_uMuscleIdCount = uLeafCount;
411 g_MuscleWeights = new WEIGHT[uLeafCount];
412 CalcClustalWWeights(tree, g_MuscleWeights);
413 }
414
SetClustalWWeightsMuscle(MSA & msa)415 void SetClustalWWeightsMuscle(MSA &msa)
416 {
417 if (0 == g_MuscleWeights)
418 Quit("g_MuscleWeights = 0");
419 const unsigned uSeqCount = msa.GetSeqCount();
420 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
421 {
422 const unsigned uId = msa.GetSeqId(uSeqIndex);
423 if (uId >= g_uMuscleIdCount)
424 Quit("SetClustalWWeightsMuscle: id out of range");
425 msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]);
426 }
427 msa.NormalizeWeights((WEIGHT) 1.0);
428 }
429
430 #define LOCAL_VERBOSE 0
431
SetThreeWayWeightsMuscle(MSA & msa)432 void SetThreeWayWeightsMuscle(MSA &msa)
433 {
434 if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2)
435 {
436 msa.SetHenikoffWeightsPB();
437 return;
438 }
439
440 const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();
441 WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];
442
443 CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,
444 Weights);
445
446 const unsigned uMSASeqCount = msa.GetSeqCount();
447 for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex)
448 {
449 const unsigned uId = msa.GetSeqId(uSeqIndex);
450 if (uId >= uMuscleSeqCount)
451 Quit("SetThreeWayWeightsMuscle: id out of range");
452 msa.SetSeqWeight(uSeqIndex, Weights[uId]);
453 }
454 #if LOCAL_VERBOSE
455 {
456 Log("SetThreeWayWeightsMuscle\n");
457 for (unsigned n = 0; n < uMSASeqCount; ++n)
458 {
459 const unsigned uId = msa.GetSeqId(n);
460 Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);
461 }
462 }
463 #endif
464 msa.NormalizeWeights((WEIGHT) 1.0);
465
466 delete[] Weights;
467 }
468
469 // Append msa2 at the end of msa1
MSAAppend(MSA & msa1,const MSA & msa2)470 void MSAAppend(MSA &msa1, const MSA &msa2)
471 {
472 const unsigned uSeqCount = msa1.GetSeqCount();
473
474 const unsigned uColCount1 = msa1.GetColCount();
475 const unsigned uColCount2 = msa2.GetColCount();
476 const unsigned uColCountCat = uColCount1 + uColCount2;
477
478 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
479 {
480 unsigned uId = msa1.GetSeqId(uSeqIndex);
481 unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
482 for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
483 {
484 const char c = msa2.GetChar(uSeqIndex2, uColIndex);
485 msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c);
486 }
487 }
488 }
489
490 // "Catenate" two MSAs (by bad analogy with UNIX cat command).
491 // msa1 and msa2 must have same sequence names, but possibly
492 // in a different order.
493 // msaCat is the combined alignment produce by appending
494 // sequences in msa2 to sequences in msa1.
MSACat(const MSA & msa1,const MSA & msa2,MSA & msaCat)495 void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat)
496 {
497 const unsigned uSeqCount = msa1.GetSeqCount();
498
499 const unsigned uColCount1 = msa1.GetColCount();
500 const unsigned uColCount2 = msa2.GetColCount();
501 const unsigned uColCountCat = uColCount1 + uColCount2;
502
503 msaCat.SetSize(uSeqCount, uColCountCat);
504
505 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
506 {
507 for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex)
508 {
509 const char c = msa1.GetChar(uSeqIndex, uColIndex);
510 msaCat.SetChar(uSeqIndex, uColIndex, c);
511 }
512
513 const char *ptrSeqName = msa1.GetSeqName(uSeqIndex);
514 unsigned uSeqIndex2;
515 msaCat.SetSeqName(uSeqIndex, ptrSeqName);
516 bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2);
517 if (bFound)
518 {
519 for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
520 {
521 const char c = msa2.GetChar(uSeqIndex2, uColIndex);
522 msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c);
523 }
524 }
525 else
526 {
527 for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
528 msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-');
529 }
530 }
531 }
532