1 #include "muscle.h"
2 #include "msa.h"
3
4 /***
5 Compute Henikoff weights.
6 Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.
7 J. Mol. Biol., 243(4):574-578.
8
9 Award each different residue an equal share of the weight, and then to divide up
10 that weight equally among the sequences sharing the same residue. So if in a
11 position of a multiple alignment, r different residues are represented, a residue
12 represented in only one sequence contributes a score of 1/r to that sequence, whereas a
13 residue represented in s sequences contributes a score of 1/rs to each of the s
14 sequences. For each sequence, the contributions from each position are summed to give
15 a sequence weight.
16
17 See also HenikoffWeightPB.
18 ***/
19
CalcHenikoffWeightsCol(unsigned uColIndex) const20 void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const
21 {
22 const unsigned uSeqCount = GetSeqCount();
23
24 // Compute letter counts in this column
25 unsigned uLetterCount[MAX_ALPHA];
26 memset(uLetterCount, 0, sizeof(uLetterCount));
27 unsigned uDifferentLetterCount = 0;
28 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
29 {
30 unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
31 if (uLetter >= 20)
32 continue;
33 unsigned uNewCount = uLetterCount[uLetter] + 1;
34 uLetterCount[uLetter] = uNewCount;
35 if (1 == uNewCount)
36 ++uDifferentLetterCount;
37 }
38
39 // Compute weight contributions
40 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
41 {
42 unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
43 if (uLetter >= 20)
44 continue;
45 const unsigned uCount = uLetterCount[uLetter];
46 unsigned uDenom = uCount*uDifferentLetterCount;
47 if (uDenom == 0)
48 continue;
49 m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom);
50 }
51 }
52
SetHenikoffWeights() const53 void MSA::SetHenikoffWeights() const
54 {
55 const unsigned uColCount = GetColCount();
56 const unsigned uSeqCount = GetSeqCount();
57
58 if (0 == uSeqCount)
59 return;
60 else if (1 == uSeqCount)
61 {
62 m_Weights[0] = (WEIGHT) 1.0;
63 return;
64 }
65 else if (2 == uSeqCount)
66 {
67 m_Weights[0] = (WEIGHT) 0.5;
68 m_Weights[1] = (WEIGHT) 0.5;
69 return;
70 }
71
72 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
73 m_Weights[uSeqIndex] = 0.0;
74
75 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
76 CalcHenikoffWeightsCol(uColIndex);
77
78 // Set all-gap seqs weight to 0
79 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
80 if (IsGapSeq(uSeqIndex))
81 m_Weights[uSeqIndex] = 0.0;
82
83 Normalize(m_Weights, uSeqCount);
84 }
85