1 #include "muscle.h"
2 #include "msa.h"
3 #include "seqvect.h"
4 #include "seq.h"
5 #include "distfunc.h"
6 #include <math.h>
7 #include "muscle_context.h"
8 
9 #define TRACE	0
10 
11 /***
12 Some candidate alphabets considered because they
13 have high correlations and small table sizes.
14 Correlation coefficent is between k-mer distance
15 and %id D measured from a CLUSTALW alignment.
16 Table size is N^k where N is size of alphabet.
17 A is standard (uncompressed) amino alphabet.
18 
19                            Correlation
20 Alpha   N  k  Table Size   all   25-50%
21 -----  --  -  ----------   ----  ------
22 A      20  3       8,000  0.943   0.575
23 A      20  4     160,000  0.962   0.685 <<
24 LiA    14  4      38,416  0.966   0.645
25 SEB    14  4      38,416  0.964   0.634
26 LiA    13  4      28,561  0.965   0.640
27 LiA    12  4      20,736  0.963   0.620
28 LiA    10  5     100,000  0.964   0.652
29 
30 We select A with k=4 because it has the best
31 correlations. The only drawback is a large table
32 size, but space is readily available and the only
33 additional time cost is in resetting the table to
34 zero, which can be done quickly with memset or by
35 keeping a list of the k-mers that were found (should
36 test to see which is faster, and may vary by compiler
37 and processor type). It also has the minor advantage
38 that we don't need to convert the alphabet.
39 
40 Fractional identity d is estimated as follows.
41 
42 	F = fractional k-mer count
43 	if F is 0: F = 0.01
44 	Y = log(0.02 + F)
45 	d = -4.1 + 4.12*Y
46 
47 The constant 0.02 was chosen to make the relationship
48 between Y and D linear. The constants -4.1 and 4.12
49 were chosen to fit a straight line to the scatterplot
50 of Y vs D.
51 ***/
52 
53 #define MIN(x, y)	(((x) < (y)) ? (x) : (y))
54 
55 const unsigned K = 4;
56 const unsigned N = 20;
57 const unsigned N_2 = 20*20;
58 const unsigned N_3 = 20*20*20;
59 const unsigned N_4 = 20*20*20*20;
60 
61 const unsigned TABLE_SIZE = N_4;
62 
63 // For debug output
64 // warning: unsafe
65 //const char *KmerToStr(unsigned Kmer)
66 //	{
67 //    MuscleContext *ctx = getMuscleContext();
68 //    char* g_LetterToChar = ctx->alpha.g_LetterToChar;
69 //	static char s[5];
70 //
71 //	unsigned c3 = (Kmer/N_3)%N;
72 //	unsigned c2 = (Kmer/N_2)%N;
73 //	unsigned c1 = (Kmer/N)%N;
74 //	unsigned c0 = Kmer%N;
75 //
76 //	s[0] = LetterToChar(c3);
77 //	s[1] = LetterToChar(c2);
78 //	s[2] = LetterToChar(c1);
79 //	s[3] = LetterToChar(c0);
80 //	return s;
81 //	}
82 
CountKmers(const byte s[],unsigned uSeqLength,byte KmerCounts[])83 void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])
84 	{
85 #if	TRACE
86 	Log("CountKmers\n");
87 #endif
88 	memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));
89 
90 	const byte *ptrKmerStart = s;
91 	const byte *ptrKmerEnd = s + 4;
92 	const byte *ptrSeqEnd = s + uSeqLength;
93 
94 	unsigned c3 = s[0]*N_3;
95 	unsigned c2 = s[1]*N_2;
96 	unsigned c1 = s[2]*N;
97 	unsigned c0 = s[3];
98 
99 	unsigned Kmer = c3 + c2 + c1 + c0;
100 
101 	for (;;)
102 		{
103 		assert(Kmer < TABLE_SIZE);
104 
105 #if	TRACE
106 		//Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));
107 #endif
108 		++(KmerCounts[Kmer]);
109 
110 		if (ptrKmerEnd == ptrSeqEnd)
111 			break;
112 
113 	// Compute k-mer as function of previous k-mer:
114 	// 1. Subtract first letter from previous k-mer.
115 	// 2. Multiply by N.
116 	// 3. Add next letter.
117 		c3 = (*ptrKmerStart++) * N_3;
118 		Kmer = (Kmer - c3)*N;
119 		Kmer += *ptrKmerEnd++;
120 		}
121 	}
122 
CommonKmerCount(const byte Seq[],unsigned uSeqLength,const byte KmerCounts1[],const byte Seq2[],unsigned uSeqLength2)123 unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,
124   const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)
125 	{
126 	byte KmerCounts2[TABLE_SIZE];
127 	CountKmers(Seq2, uSeqLength2, KmerCounts2);
128 
129 	const byte *ptrKmerStart = Seq;
130 	const byte *ptrKmerEnd = Seq + 4;
131 	const byte *ptrSeqEnd = Seq + uSeqLength;
132 
133 	unsigned c3 = Seq[0]*N_3;
134 	unsigned c2 = Seq[1]*N_2;
135 	unsigned c1 = Seq[2]*N;
136 	unsigned c0 = Seq[3];
137 
138 	unsigned Kmer = c3 + c2 + c1 + c0;
139 
140 	unsigned uCommonCount = 0;
141 	for (;;)
142 		{
143 		assert(Kmer < TABLE_SIZE);
144 
145 		const byte Count1 = KmerCounts1[Kmer];
146 		const byte Count2 = KmerCounts2[Kmer];
147 
148 		uCommonCount += MIN(Count1, Count2);
149 
150 	// Hack so we don't double-count
151 		KmerCounts2[Kmer] = 0;
152 
153 		if (ptrKmerEnd == ptrSeqEnd)
154 			break;
155 
156 	// Compute k-mer as function of previous k-mer:
157 	// 1. Subtract first letter from previous k-mer.
158 	// 2. Multiply by N.
159 	// 3. Add next letter.
160 		c3 = (*ptrKmerStart++) * N_3;
161 		Kmer = (Kmer - c3)*N;
162 		Kmer += *ptrKmerEnd++;
163 		}
164 	return uCommonCount;
165 	}
166 
SeqToLetters(const Seq & s,byte Letters[])167 static void SeqToLetters(const Seq &s, byte Letters[])
168 	{
169     MuscleContext *ctx = getMuscleContext();
170     bool* g_IsWildcardChar = ctx->alpha.g_IsWildcardChar;
171     unsigned* g_CharToLetter = ctx->alpha.g_CharToLetter;
172 
173 	const unsigned uSeqLength = s.Length();
174 	for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)
175 		{
176 		char c = s.GetChar(uCol);
177 	// Ugly hack. My k-mer counting code isn't wild-card
178 	// aware. Arbitrarily replace wildcards by a specific
179 	// amino acid.
180 		if (IsWildcardChar(c))
181 			c = 'A';
182 		*Letters++ = CharToLetter(c);
183 		}
184 	}
185 
FastDistKmer(const SeqVect & v,DistFunc & DF)186 void FastDistKmer(const SeqVect &v, DistFunc &DF)
187 	{
188 	byte KmerCounts[TABLE_SIZE];
189 
190 	const unsigned uSeqCount = v.GetSeqCount();
191 
192 	DF.SetCount(uSeqCount);
193 	if (0 == uSeqCount)
194 		return;
195 
196 // Initialize distance matrix to zero
197 	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
198 		{
199 		DF.SetDist(uSeq1, uSeq1, 0);
200 		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
201 			DF.SetDist(uSeq1, uSeq2, 0);
202 		}
203 
204 	unsigned uMaxLength = 0;
205 	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
206 		{
207 		const Seq &s = v.GetSeq(uSeqIndex);
208 		unsigned uSeqLength = s.Length();
209 		if (uSeqLength > uMaxLength)
210 			uMaxLength = uSeqLength;
211 		}
212 	if (0 == uMaxLength)
213 		return;
214 
215 	byte *Seq1Letters = new byte[uMaxLength];
216 	byte *Seq2Letters = new byte[uMaxLength];
217 
218 	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)
219 		{
220 		const Seq &s1 = v.GetSeq(uSeqIndex1);
221 		const unsigned uSeqLength1 = s1.Length();
222 
223 		SeqToLetters(s1, Seq1Letters);
224 		CountKmers(Seq1Letters, uSeqLength1, KmerCounts);
225 
226 		for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;
227 		  ++uSeqIndex2)
228 			{
229 			const Seq &s2 = v.GetSeq(uSeqIndex2);
230 			const unsigned uSeqLength2 = s2.Length();
231 
232 			SeqToLetters(s2, Seq2Letters);
233 
234 			unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,
235 			  KmerCounts, Seq2Letters, uSeqLength2);
236 
237 			unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);
238 			double F = (double) uCommonKmerCount / (uMinLength - K + 1);
239 			if (0.0 == F)
240 				F = 0.01;
241 //			double Y = log(0.02 + F);
242 //			double EstimatedPctId = Y/4.12 + 0.995;
243 //			DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);
244 			DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));
245 #if	TRACE
246             double KD = KimuraDist(EstimatedPctId);
247 			Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",
248 			  uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);
249 #endif
250 			}
251 		}
252 
253 	delete[] Seq1Letters;
254 	delete[] Seq2Letters;
255 	}
256