1 #include "muscle.h"
2 #include "msa.h"
3 
4 /***
5 Compute Henikoff weights.
6 Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.
7 J. Mol. Biol., 243(4):574-578.
8 
9 Award each different residue an equal share of the weight, and then to divide up
10 that weight equally among the sequences sharing the same residue. So if in a
11 position of a multiple alignment, r different residues are represented, a residue
12 represented in only one sequence contributes a score of 1/r to that sequence, whereas a
13 residue represented in s sequences contributes a score of 1/rs to each of the s
14 sequences. For each sequence, the contributions from each position are summed to give
15 a sequence weight.
16 
17 See also HenikoffWeightPB.
18 ***/
19 
CalcHenikoffWeightsCol(unsigned uColIndex) const20 void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const
21 	{
22 	const unsigned uSeqCount = GetSeqCount();
23 
24 // Compute letter counts in this column
25 	unsigned uLetterCount[MAX_ALPHA];
26 	memset(uLetterCount, 0, sizeof(uLetterCount));
27 	unsigned uDifferentLetterCount = 0;
28 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
29 		{
30 		unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
31 		if (uLetter >= 20)
32 			continue;
33 		unsigned uNewCount = uLetterCount[uLetter] + 1;
34 		uLetterCount[uLetter] = uNewCount;
35 		if (1 == uNewCount)
36 			++uDifferentLetterCount;
37 		}
38 
39 // Compute weight contributions
40 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
41 		{
42 		unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
43 		if (uLetter >= 20)
44 			continue;
45 		const unsigned uCount = uLetterCount[uLetter];
46 		unsigned uDenom = uCount*uDifferentLetterCount;
47 		if (uDenom == 0)
48 			continue;
49 		m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom);
50 		}
51 	}
52 
SetHenikoffWeights() const53 void MSA::SetHenikoffWeights() const
54 	{
55 	const unsigned uColCount = GetColCount();
56 	const unsigned uSeqCount = GetSeqCount();
57 
58 	if (0 == uSeqCount)
59 		return;
60 	else if (1 == uSeqCount)
61 		{
62 		m_Weights[0] = (WEIGHT) 1.0;
63 		return;
64 		}
65 	else if (2 == uSeqCount)
66 		{
67 		m_Weights[0] = (WEIGHT) 0.5;
68 		m_Weights[1] = (WEIGHT) 0.5;
69 		return;
70 		}
71 
72 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
73 		m_Weights[uSeqIndex] = 0.0;
74 
75 	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
76 		CalcHenikoffWeightsCol(uColIndex);
77 
78 // Set all-gap seqs weight to 0
79 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
80 		if (IsGapSeq(uSeqIndex))
81 			m_Weights[uSeqIndex] = 0.0;
82 
83 	Normalize(m_Weights, uSeqCount);
84 	}
85