1 /***
2 Conservation value for a column in an MSA is defined as the number
3 of times the most common letter appears divided by the number of
4 sequences.
5 ***/
6 
7 #include "muscle.h"
8 #include "msa.h"
9 #include <math.h>
10 
GetAvgCons() const11 double MSA::GetAvgCons() const
12 	{
13 	assert(GetSeqCount() > 0);
14 	double dSum = 0;
15 	unsigned uNonGapColCount = 0;
16 	for (unsigned uColIndex = 0; uColIndex < GetColCount(); ++uColIndex)
17 		{
18 		if (!IsGapColumn(uColIndex))
19 			{
20 			dSum += GetCons(uColIndex);
21 			++uNonGapColCount;
22 			}
23 		}
24 	assert(uNonGapColCount > 0);
25 	double dAvg = dSum / uNonGapColCount;
26 	assert(dAvg > 0 && dAvg <= 1);
27 	return dAvg;
28 	}
29 
GetCons(unsigned uColIndex) const30 double MSA::GetCons(unsigned uColIndex) const
31 	{
32 	unsigned Counts[MAX_ALPHA];
33 	for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
34 		Counts[uLetter] = 0;
35 
36 	unsigned uMaxCount = 0;
37 	for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
38 		{
39 		if (IsGap(uSeqIndex, uColIndex))
40 			continue;
41 		char c = GetChar(uSeqIndex, uColIndex);
42 		c = toupper(c);
43 		if ('X' == c || 'B' == c || 'Z' == c)
44 			continue;
45 		unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
46 		unsigned uCount = Counts[uLetter] + 1;
47 		if (uCount > uMaxCount)
48 			uMaxCount = uCount;
49 		Counts[uLetter] = uCount;
50 		}
51 
52 // Cons is undefined for all-gap column
53 	if (0 == uMaxCount)
54 		{
55 //		assert(false);
56 		return 1;
57 		}
58 
59 	double dCons = (double) uMaxCount / (double) GetSeqCount();
60 	assert(dCons > 0 && dCons <= 1);
61 	return dCons;
62 	}
63 
64 // Perecent identity of a pair of sequences.
65 // Positions with one or both gapped are ignored.
GetPctIdentityPair(unsigned uSeqIndex1,unsigned uSeqIndex2) const66 double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const
67 	{
68 	const unsigned uColCount = GetColCount();
69 	unsigned uPosCount = 0;
70 	unsigned uSameCount = 0;
71 	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
72 		{
73 		const char c1 = GetChar(uSeqIndex1, uColIndex);
74 		const char c2 = GetChar(uSeqIndex2, uColIndex);
75 		if (IsGapChar(c1) || IsGapChar(c2))
76 			continue;
77 		if (c1 == c2)
78 			++uSameCount;
79 		++uPosCount;
80 		}
81 	if (0 == uPosCount)
82 		return 0;
83 	return (double) uSameCount / (double) uPosCount;
84 	}
85 
86 // Perecent group identity of a pair of sequences.
87 // Positions with one or both gapped are ignored.
GetPctGroupIdentityPair(unsigned uSeqIndex1,unsigned uSeqIndex2) const88 double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1,
89   unsigned uSeqIndex2) const
90 	{
91 	extern unsigned ResidueGroup[];
92 
93 	const unsigned uColCount = GetColCount();
94 	unsigned uPosCount = 0;
95 	unsigned uSameCount = 0;
96 	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
97 		{
98 		if (IsGap(uSeqIndex1, uColIndex))
99 			continue;
100 		if (IsGap(uSeqIndex2, uColIndex))
101 			continue;
102 		if (IsWildcard(uSeqIndex1, uColIndex))
103 			continue;
104 		if (IsWildcard(uSeqIndex2, uColIndex))
105 			continue;
106 
107 		const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex);
108 		const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex);
109 		const unsigned uGroup1 = ResidueGroup[uLetter1];
110 		const unsigned uGroup2 = ResidueGroup[uLetter2];
111 		if (uGroup1 == uGroup2)
112 			++uSameCount;
113 		++uPosCount;
114 		}
115 	if (0 == uPosCount)
116 		return 0;
117 	return (double) uSameCount / (double) uPosCount;
118 	}
119