1 #include "muscle.h"
2 #include "msa.h"
3 #include "seqvect.h"
4 #include "seq.h"
5 #include "distfunc.h"
6 #include <math.h>
7 #include "muscle_context.h"
8
9 #define TRACE 0
10
11 /***
12 Some candidate alphabets considered because they
13 have high correlations and small table sizes.
14 Correlation coefficent is between k-mer distance
15 and %id D measured from a CLUSTALW alignment.
16 Table size is N^k where N is size of alphabet.
17 A is standard (uncompressed) amino alphabet.
18
19 Correlation
20 Alpha N k Table Size all 25-50%
21 ----- -- - ---------- ---- ------
22 A 20 3 8,000 0.943 0.575
23 A 20 4 160,000 0.962 0.685 <<
24 LiA 14 4 38,416 0.966 0.645
25 SEB 14 4 38,416 0.964 0.634
26 LiA 13 4 28,561 0.965 0.640
27 LiA 12 4 20,736 0.963 0.620
28 LiA 10 5 100,000 0.964 0.652
29
30 We select A with k=4 because it has the best
31 correlations. The only drawback is a large table
32 size, but space is readily available and the only
33 additional time cost is in resetting the table to
34 zero, which can be done quickly with memset or by
35 keeping a list of the k-mers that were found (should
36 test to see which is faster, and may vary by compiler
37 and processor type). It also has the minor advantage
38 that we don't need to convert the alphabet.
39
40 Fractional identity d is estimated as follows.
41
42 F = fractional k-mer count
43 if F is 0: F = 0.01
44 Y = log(0.02 + F)
45 d = -4.1 + 4.12*Y
46
47 The constant 0.02 was chosen to make the relationship
48 between Y and D linear. The constants -4.1 and 4.12
49 were chosen to fit a straight line to the scatterplot
50 of Y vs D.
51 ***/
52
53 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
54
55 const unsigned K = 4;
56 const unsigned N = 20;
57 const unsigned N_2 = 20*20;
58 const unsigned N_3 = 20*20*20;
59 const unsigned N_4 = 20*20*20*20;
60
61 const unsigned TABLE_SIZE = N_4;
62
63 // For debug output
64 // warning: unsafe
65 //const char *KmerToStr(unsigned Kmer)
66 // {
67 // MuscleContext *ctx = getMuscleContext();
68 // char* g_LetterToChar = ctx->alpha.g_LetterToChar;
69 // static char s[5];
70 //
71 // unsigned c3 = (Kmer/N_3)%N;
72 // unsigned c2 = (Kmer/N_2)%N;
73 // unsigned c1 = (Kmer/N)%N;
74 // unsigned c0 = Kmer%N;
75 //
76 // s[0] = LetterToChar(c3);
77 // s[1] = LetterToChar(c2);
78 // s[2] = LetterToChar(c1);
79 // s[3] = LetterToChar(c0);
80 // return s;
81 // }
82
CountKmers(const byte s[],unsigned uSeqLength,byte KmerCounts[])83 void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])
84 {
85 #if TRACE
86 Log("CountKmers\n");
87 #endif
88 memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));
89
90 const byte *ptrKmerStart = s;
91 const byte *ptrKmerEnd = s + 4;
92 const byte *ptrSeqEnd = s + uSeqLength;
93
94 unsigned c3 = s[0]*N_3;
95 unsigned c2 = s[1]*N_2;
96 unsigned c1 = s[2]*N;
97 unsigned c0 = s[3];
98
99 unsigned Kmer = c3 + c2 + c1 + c0;
100
101 for (;;)
102 {
103 assert(Kmer < TABLE_SIZE);
104
105 #if TRACE
106 //Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));
107 #endif
108 ++(KmerCounts[Kmer]);
109
110 if (ptrKmerEnd == ptrSeqEnd)
111 break;
112
113 // Compute k-mer as function of previous k-mer:
114 // 1. Subtract first letter from previous k-mer.
115 // 2. Multiply by N.
116 // 3. Add next letter.
117 c3 = (*ptrKmerStart++) * N_3;
118 Kmer = (Kmer - c3)*N;
119 Kmer += *ptrKmerEnd++;
120 }
121 }
122
CommonKmerCount(const byte Seq[],unsigned uSeqLength,const byte KmerCounts1[],const byte Seq2[],unsigned uSeqLength2)123 unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,
124 const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)
125 {
126 byte KmerCounts2[TABLE_SIZE];
127 CountKmers(Seq2, uSeqLength2, KmerCounts2);
128
129 const byte *ptrKmerStart = Seq;
130 const byte *ptrKmerEnd = Seq + 4;
131 const byte *ptrSeqEnd = Seq + uSeqLength;
132
133 unsigned c3 = Seq[0]*N_3;
134 unsigned c2 = Seq[1]*N_2;
135 unsigned c1 = Seq[2]*N;
136 unsigned c0 = Seq[3];
137
138 unsigned Kmer = c3 + c2 + c1 + c0;
139
140 unsigned uCommonCount = 0;
141 for (;;)
142 {
143 assert(Kmer < TABLE_SIZE);
144
145 const byte Count1 = KmerCounts1[Kmer];
146 const byte Count2 = KmerCounts2[Kmer];
147
148 uCommonCount += MIN(Count1, Count2);
149
150 // Hack so we don't double-count
151 KmerCounts2[Kmer] = 0;
152
153 if (ptrKmerEnd == ptrSeqEnd)
154 break;
155
156 // Compute k-mer as function of previous k-mer:
157 // 1. Subtract first letter from previous k-mer.
158 // 2. Multiply by N.
159 // 3. Add next letter.
160 c3 = (*ptrKmerStart++) * N_3;
161 Kmer = (Kmer - c3)*N;
162 Kmer += *ptrKmerEnd++;
163 }
164 return uCommonCount;
165 }
166
SeqToLetters(const Seq & s,byte Letters[])167 static void SeqToLetters(const Seq &s, byte Letters[])
168 {
169 MuscleContext *ctx = getMuscleContext();
170 bool* g_IsWildcardChar = ctx->alpha.g_IsWildcardChar;
171 unsigned* g_CharToLetter = ctx->alpha.g_CharToLetter;
172
173 const unsigned uSeqLength = s.Length();
174 for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)
175 {
176 char c = s.GetChar(uCol);
177 // Ugly hack. My k-mer counting code isn't wild-card
178 // aware. Arbitrarily replace wildcards by a specific
179 // amino acid.
180 if (IsWildcardChar(c))
181 c = 'A';
182 *Letters++ = CharToLetter(c);
183 }
184 }
185
FastDistKmer(const SeqVect & v,DistFunc & DF)186 void FastDistKmer(const SeqVect &v, DistFunc &DF)
187 {
188 byte KmerCounts[TABLE_SIZE];
189
190 const unsigned uSeqCount = v.GetSeqCount();
191
192 DF.SetCount(uSeqCount);
193 if (0 == uSeqCount)
194 return;
195
196 // Initialize distance matrix to zero
197 for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
198 {
199 DF.SetDist(uSeq1, uSeq1, 0);
200 for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
201 DF.SetDist(uSeq1, uSeq2, 0);
202 }
203
204 unsigned uMaxLength = 0;
205 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
206 {
207 const Seq &s = v.GetSeq(uSeqIndex);
208 unsigned uSeqLength = s.Length();
209 if (uSeqLength > uMaxLength)
210 uMaxLength = uSeqLength;
211 }
212 if (0 == uMaxLength)
213 return;
214
215 byte *Seq1Letters = new byte[uMaxLength];
216 byte *Seq2Letters = new byte[uMaxLength];
217
218 for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)
219 {
220 const Seq &s1 = v.GetSeq(uSeqIndex1);
221 const unsigned uSeqLength1 = s1.Length();
222
223 SeqToLetters(s1, Seq1Letters);
224 CountKmers(Seq1Letters, uSeqLength1, KmerCounts);
225
226 for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;
227 ++uSeqIndex2)
228 {
229 const Seq &s2 = v.GetSeq(uSeqIndex2);
230 const unsigned uSeqLength2 = s2.Length();
231
232 SeqToLetters(s2, Seq2Letters);
233
234 unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,
235 KmerCounts, Seq2Letters, uSeqLength2);
236
237 unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);
238 double F = (double) uCommonKmerCount / (uMinLength - K + 1);
239 if (0.0 == F)
240 F = 0.01;
241 // double Y = log(0.02 + F);
242 // double EstimatedPctId = Y/4.12 + 0.995;
243 // DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);
244 DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));
245 #if TRACE
246 double KD = KimuraDist(EstimatedPctId);
247 Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",
248 uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);
249 #endif
250 }
251 }
252
253 delete[] Seq1Letters;
254 delete[] Seq2Letters;
255 }
256