1 /* Routines for manipulating sequence alignment score matrices,
2 * such as the BLOSUM and PAM matrices.
3 *
4 * Contents:
5 * 1. The ESL_SCOREMATRIX object.
6 * 2. Some classic score matrices.
7 * 3. Deriving a score matrix probabilistically.
8 * 4. Reading/writing matrices from/to files.
9 * 5. Implicit probabilistic basis, I: given bg.
10 * 6. Implicit probabilistic basis, II: bg unknown. [Yu/Altschul03,05]
11 * 7. Experiment driver.
12 * 8 Utility programs.
13 * 9. Unit tests.
14 * 10. Test driver.
15 * 11. Example program.
16 */
17 #include "esl_config.h"
18
19 #include <string.h>
20 #include <math.h>
21
22 #include "easel.h"
23 #include "esl_alphabet.h"
24 #include "esl_composition.h"
25 #include "esl_dmatrix.h"
26 #include "esl_fileparser.h"
27 #include "esl_rootfinder.h"
28 #include "esl_ratematrix.h"
29 #include "esl_vectorops.h"
30
31 #include "esl_scorematrix.h"
32
33 /*****************************************************************
34 *# 1. The ESL_SCOREMATRIX object
35 *****************************************************************/
36
37 /* Function: esl_scorematrix_Create()
38 * Synopsis: Allocate and initialize an <ESL_SCOREMATRIX> object.
39 *
40 * Purpose: Allocates a score matrix for alphabet <abc>, initializes
41 * all scores to zero.
42 *
43 * Args: abc - pointer to digital alphabet
44 *
45 * Returns: a pointer to the new object.
46 *
47 * Throws: <NULL> on allocation failure.
48 */
49 ESL_SCOREMATRIX *
esl_scorematrix_Create(const ESL_ALPHABET * abc)50 esl_scorematrix_Create(const ESL_ALPHABET *abc)
51 {
52 ESL_SCOREMATRIX *S = NULL;
53 int status;
54 int i;
55
56 ESL_ALLOC(S, sizeof(ESL_SCOREMATRIX));
57 S->s = NULL;
58 S->K = abc->K;
59 S->Kp = abc->Kp;
60 S->isval = NULL;
61 S->abc_r = abc;
62 S->nc = 0;
63 S->outorder = NULL;
64 S->name = NULL;
65 S->path = NULL;
66
67 ESL_ALLOC(S->s, sizeof(int *) * abc->Kp);
68 S->s[0] = NULL;
69 ESL_ALLOC(S->isval, sizeof(char) * abc->Kp);
70 for (i = 0; i < abc->Kp; i++) S->isval[i] = FALSE;
71 ESL_ALLOC(S->outorder, sizeof(char) * (abc->Kp+1));
72 S->outorder[0] = '\0'; /* init to empty string. */
73
74 ESL_ALLOC(S->s[0], sizeof(int) * abc->Kp * abc->Kp);
75 for (i = 1; i < abc->Kp; i++) S->s[i] = S->s[0] + abc->Kp * i;
76
77 for (i = 0; i < abc->Kp*abc->Kp; i++) S->s[0][i] = 0;
78 return S;
79
80 ERROR:
81 esl_scorematrix_Destroy(S);
82 return NULL;
83 }
84
85
86
87 /* Function: esl_scorematrix_Copy()
88 * Synopsis: Copy <src> matrix to <dest>.
89 *
90 * Purpose: Copy <src> score matrix into <dest>. Caller
91 * has allocated <dest> for the same alphabet as
92 * <src>.
93 *
94 * Returns: <eslOK> on success.
95 *
96 * Throws: <eslEINCOMPAT> if <dest> isn't allocated for
97 * the same alphabet as <src>.
98 * <eslEMEM> on allocation error.
99 */
100 int
esl_scorematrix_Copy(const ESL_SCOREMATRIX * src,ESL_SCOREMATRIX * dest)101 esl_scorematrix_Copy(const ESL_SCOREMATRIX *src, ESL_SCOREMATRIX *dest)
102 {
103 int i,j;
104 int status;
105
106 if (src->abc_r->type != dest->abc_r->type || src->K != dest->K || src->Kp != dest->Kp)
107 ESL_EXCEPTION(eslEINCOMPAT, "source and dest score matrix types don't match");
108
109 for (i = 0; i < src->Kp; i++)
110 for (j = 0; j < src->Kp; j++)
111 dest->s[i][j] = src->s[i][j];
112 for (i = 0; i < src->Kp; i++)
113 dest->isval[i] = src->isval[i];
114 dest->nc = src->nc;
115 for (i = 0; i < src->nc; i++)
116 dest->outorder[i] = src->outorder[i];
117 dest->outorder[dest->nc] = '\0';
118
119 if ((status = esl_strdup(src->name, -1, &(dest->name))) != eslOK) return status;
120 if ((status = esl_strdup(src->path, -1, &(dest->path))) != eslOK) return status;
121 return eslOK;
122 }
123
124 /* Function: esl_scorematrix_Clone()
125 * Synopsis: Allocate a duplicate of a matrix.
126 *
127 * Purpose: Allocates a new matrix and makes it a duplicate
128 * of <S>. Return a pointer to the new matrix.
129 *
130 * Throws: <NULL> on allocation failure.
131 */
132 ESL_SCOREMATRIX *
esl_scorematrix_Clone(const ESL_SCOREMATRIX * S)133 esl_scorematrix_Clone(const ESL_SCOREMATRIX *S)
134 {
135 ESL_SCOREMATRIX *dup = NULL;
136
137 if ((dup = esl_scorematrix_Create(S->abc_r)) == NULL) return NULL;
138 if (esl_scorematrix_Copy(S, dup) != eslOK) { esl_scorematrix_Destroy(dup); return NULL; }
139 return dup;
140 }
141
142
143 /* Function: esl_scorematrix_Compare()
144 * Synopsis: Compare two matrices for equality.
145 *
146 * Purpose: Compares two score matrices. Returns <eslOK> if they
147 * are identical, <eslFAIL> if they differ. Every aspect
148 * of the two matrices is compared.
149 *
150 * The annotation (name, filename path) are not
151 * compared; we may want to compare an internally
152 * generated scorematrix to one read from a file.
153 */
154 int
esl_scorematrix_Compare(const ESL_SCOREMATRIX * S1,const ESL_SCOREMATRIX * S2)155 esl_scorematrix_Compare(const ESL_SCOREMATRIX *S1, const ESL_SCOREMATRIX *S2)
156 {
157 int a,b;
158
159 if (strcmp(S1->outorder, S2->outorder) != 0) return eslFAIL;
160 if (S1->nc != S2->nc) return eslFAIL;
161
162 for (a = 0; a < S1->nc; a++)
163 if (S1->isval[a] != S2->isval[a]) return eslFAIL;
164
165 for (a = 0; a < S1->Kp; a++)
166 for (b = 0; b < S1->Kp; b++)
167 if (S1->s[a][b] != S2->s[a][b]) return eslFAIL;
168
169 return eslOK;
170 }
171
172 /* Function: esl_scorematrix_CompareCanon()
173 * Synopsis: Compares scores of canonical residues for equality.
174 *
175 * Purpose: Compares the scores of canonical residues in
176 * two score matrices <S1> and <S2> for equality.
177 * Returns <eslOK> if they are identical, <eslFAIL>
178 * if they differ. Peripheral aspects of the scoring matrices
179 * having to do with noncanonical residues, output
180 * order, and suchlike are ignored.
181 */
182 int
esl_scorematrix_CompareCanon(const ESL_SCOREMATRIX * S1,const ESL_SCOREMATRIX * S2)183 esl_scorematrix_CompareCanon(const ESL_SCOREMATRIX *S1, const ESL_SCOREMATRIX *S2)
184 {
185 int a,b;
186
187 for (a = 0; a < S1->K; a++)
188 for (b = 0; b < S1->K; b++)
189 if (S1->s[a][b] != S2->s[a][b]) return eslFAIL;
190 return eslOK;
191 }
192
193
194
195 /* Function: esl_scorematrix_Max()
196 * Synopsis: Returns maximum value in score matrix.
197 *
198 * Purpose: Returns the maximum value in score matrix <S>.
199 */
200 int
esl_scorematrix_Max(const ESL_SCOREMATRIX * S)201 esl_scorematrix_Max(const ESL_SCOREMATRIX *S)
202 {
203 int i,j;
204 int max = S->s[0][0];
205
206 for (i = 0; i < S->K; i++)
207 for (j = 0; j < S->K; j++)
208 if (S->s[i][j] > max) max = S->s[i][j];
209 return max;
210 }
211
212 /* Function: esl_scorematrix_Min()
213 * Synopsis: Returns minimum value in score matrix.
214 *
215 * Purpose: Returns the minimum value in score matrix <S>.
216 */
217 int
esl_scorematrix_Min(const ESL_SCOREMATRIX * S)218 esl_scorematrix_Min(const ESL_SCOREMATRIX *S)
219 {
220 int i,j;
221 int min = S->s[0][0];
222
223 for (i = 0; i < S->K; i++)
224 for (j = 0; j < S->K; j++)
225 if (S->s[i][j] < min) min = S->s[i][j];
226 return min;
227 }
228
229
230 /* Function: esl_scorematrix_IsSymmetric()
231 * Synopsis: Returns <TRUE> for symmetric matrix.
232 *
233 * Purpose: Returns <TRUE> if matrix <S> is symmetric,
234 * or <FALSE> if it's not.
235 */
236 int
esl_scorematrix_IsSymmetric(const ESL_SCOREMATRIX * S)237 esl_scorematrix_IsSymmetric(const ESL_SCOREMATRIX *S)
238 {
239 int i,j;
240
241 for (i = 0; i < S->K; i++)
242 for (j = i; j < S->K; j++)
243 if (S->s[i][j] != S->s[j][i]) return FALSE;
244 return TRUE;
245 }
246
247 /* Function: esl_scorematrix_ExpectedScore()
248 * Synopsis: Calculates the expected score of a matrix.
249 *
250 * Purpose: Calculates the expected score of a matrix <S>,
251 * given background frequencies <fi> and <fj>;
252 * return it in <*ret_E>.
253 *
254 * The expected score is defined as
255 * $\sum_{ab} f_a f_b \sigma_{ab}$.
256 *
257 * The expected score is in whatever units the score matrix
258 * <S> is in. If you know $\lambda$, you can convert it to
259 * units of bits ($\log 2$) by multiplying it by $\lambda /
260 * \log 2$.
261 *
262 * Args: S - score matrix
263 * fi - background frequencies $f_i$ (0..K-1)
264 * fj - background frequencies $f_j$ (0..K-1)
265 * ret_E - RETURN: expected score
266 *
267 * Returns: <eslOK> on success.
268 */
269 int
esl_scorematrix_ExpectedScore(ESL_SCOREMATRIX * S,double * fi,double * fj,double * ret_E)270 esl_scorematrix_ExpectedScore(ESL_SCOREMATRIX *S, double *fi, double *fj, double *ret_E)
271 {
272 double E = 0.;
273 int a,b;
274
275 for (a = 0; a < S->K; a++)
276 for (b = 0; b < S->K; b++)
277 E += fi[a] * fj[b] * (double) S->s[a][b];
278
279 *ret_E = E;
280 return eslOK;
281 }
282
283
284 /* Function: esl_scorematrix_RelEntropy()
285 * Synopsis: Calculates relative entropy of a matrix.
286 *
287 * Purpose: Calculates the relative entropy of score matrix <S> in
288 * bits, given its background distributions <fi> and <fj> and
289 * its scale <lambda>.
290 *
291 * The relative entropy is defined as $\sum_{ab} p_{ab}
292 * \log_2 \frac{p_{ab}} {f_a f_b}$, the average score (in
293 * bits) of homologous aligned sequences. In general it is
294 * $\geq 0$ (and certainly so in the case when background
295 * frequencies $f_a$ and $f_b$ are the marginals of the
296 * $p_{ab}$ joint ptobabilities).
297 *
298 * Args: S - score matrix
299 * fi - background freqs for sequence i
300 * fj - background freqs for sequence j
301 * lambda - scale factor $\lambda$ for <S>
302 * ret_D - RETURN: relative entropy.
303 *
304 * Returns: <eslOK> on success, and <ret_D> contains the relative
305 * entropy.
306 *
307 * Throws: <eslEMEM> on allocation error.
308 * <eslEINVAL> if the implied $p_{ij}$'s don't sum to one,
309 * probably indicating that <lambda> was not the correct
310 * <lambda> for <S>, <fi>, and <fj>.
311 * In either exception, <ret_D> is returned as 0.0.
312 */
313 int
esl_scorematrix_RelEntropy(const ESL_SCOREMATRIX * S,const double * fi,const double * fj,double lambda,double * ret_D)314 esl_scorematrix_RelEntropy(const ESL_SCOREMATRIX *S, const double *fi, const double *fj, double lambda, double *ret_D)
315 {
316 int status;
317 double pij;
318 double sum = 0.;
319 int i,j;
320 double D = 0;
321
322 for (i = 0; i < S->K; i++)
323 for (j = 0; j < S->K; j++)
324 {
325 pij = fi[i] * fj[j] * exp(lambda * (double) S->s[i][j]);
326 sum += pij;
327 if (pij > 0.) D += pij * log(pij / (fi[i] * fj[j]));
328
329 }
330 if (esl_DCompare(sum, 1.0, 1e-3) != eslOK)
331 ESL_XEXCEPTION(eslEINVAL, "pij's don't sum to one (%.4f): bad lambda or bad bg?", sum);
332
333 D /= eslCONST_LOG2;
334 *ret_D = D;
335 return eslOK;
336
337 ERROR:
338 *ret_D = 0.;
339 return status;
340 }
341
342
343 /* Function: esl_scorematrix_JointToConditionalOnQuery()
344 * Synopsis: Convert a joint probability matrix to conditional probs P(b|a)
345 *
346 * Purpose: Given a joint probability matrix <P> that has been calculated
347 * by <esl_scorematrix_ProbifyGivenBG()> or <esl_scorematrix_Probify()>
348 * (or one that obeys the same conditions; see below),
349 * convert the joint probabilities <P(a,b)> to conditional
350 * probabilities <P(b | a)>, where <b> is a residue in the target,
351 * and <a> is a residue in the query.
352 *
353 * $P(b \mid a) = P(ab) / P(a)$, where $P(a) = \sum_b P(ab)$.
354 *
355 * The value stored in <P->mx[a][b]> is $P(b \mid a)$.
356 *
357 * All values in <P> involving the codes for gap,
358 * nonresidue, and missing data (codes <K>,<Kp-2>, and
359 * <Kp-1>) are 0.0, not probabilities. Only rows/columns
360 * <i=0..K,K+1..Kp-3> are valid probability vectors.
361 *
362 * Returns: <eslOK> on success.
363 *
364 * Throws: (no abnormal error conditions)
365 *
366 * Xref: J9/87.
367 */
368 int
esl_scorematrix_JointToConditionalOnQuery(const ESL_ALPHABET * abc,ESL_DMATRIX * P)369 esl_scorematrix_JointToConditionalOnQuery(const ESL_ALPHABET *abc, ESL_DMATRIX *P)
370 {
371 int a,b;
372
373 /* P(b|a) = P(ab) / P(a)
374 * and P(a) = P(a,X), the value at [a][Kp-3]
375 */
376 for (a = 0; a < abc->Kp-2; a++)
377 for (b = 0; b < abc->Kp-2; b++)
378 P->mx[a][b] = (P->mx[a][abc->Kp-3] == 0.0 ? 0.0 : P->mx[a][b] / P->mx[a][abc->Kp-3]);
379 return eslOK;
380 }
381
382
383
384 /* Function: esl_scorematrix_Destroy()
385 * Synopsis: Frees a matrix.
386 *
387 * Purpose: Frees a score matrix.
388 */
389 void
esl_scorematrix_Destroy(ESL_SCOREMATRIX * S)390 esl_scorematrix_Destroy(ESL_SCOREMATRIX *S)
391 {
392 if (S == NULL) return;
393 if (S->s != NULL) {
394 if (S->s[0] != NULL) free(S->s[0]);
395 free(S->s);
396 }
397 if (S->isval != NULL) free(S->isval);
398 if (S->outorder != NULL) free(S->outorder);
399 if (S->name != NULL) free(S->name);
400 if (S->path != NULL) free(S->path);
401 free(S);
402 return;
403 }
404
405
406 /*------------------- end, scorematrix object -------------------*/
407
408
409
410
411 /*****************************************************************
412 *# 2. Some classic score matrices.
413 *****************************************************************/
414 /* PAM30, PAM70, PAM120, PAM240, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90 */
415 /* Standard matrices are reformatted to Easel static data by the UTILITY1 program; see below */
416
417 /* TODO: Instead of storing the classical low-precision versions of
418 * these, we should recalculate each one from its original
419 * probabilistic basis, and store it at higher integer precision,
420 * allowing the Yu/Altschul procedure to work. If we do that, we might also store
421 * lambda and background probabilities.
422 */
423
424 #define eslAADIM 29
425
426 struct esl_scorematrix_aa_preload_s {
427 char *name;
428 int matrix[eslAADIM][eslAADIM];
429 };
430
431 static const struct esl_scorematrix_aa_preload_s ESL_SCOREMATRIX_AA_PRELOADS[] = {
432 { "PAM30", {
433 /* A C D E F G H I K L M N P Q R S T V W Y - B J Z O U X * ~ */
434 { 6, -6, -3, -2, -8, -2, -7, -5, -7, -6, -5, -4, -2, -4, -7, 0, -1, -2, -13, -8, 0, -3, 0, -3, 0, 0, -3, -17, 0, }, /* A */
435 { -6, 10, -14, -14, -13, -9, -7, -6, -14, -15, -13, -11, -8, -14, -8, -3, -8, -6, -15, -4, 0, -12, 0, -14, 0, 0, -9, -17, 0, }, /* C */
436 { -3, -14, 8, 2, -15, -3, -4, -7, -4, -12, -11, 2, -8, -2, -10, -4, -5, -8, -15, -11, 0, 6, 0, 1, 0, 0, -5, -17, 0, }, /* D */
437 { -2, -14, 2, 8, -14, -4, -5, -5, -4, -9, -7, -2, -5, 1, -9, -4, -6, -6, -17, -8, 0, 1, 0, 6, 0, 0, -5, -17, 0, }, /* E */
438 { -8, -13, -15, -14, 9, -9, -6, -2, -14, -3, -4, -9, -10, -13, -9, -6, -9, -8, -4, 2, 0, -10, 0, -13, 0, 0, -8, -17, 0, }, /* F */
439 { -2, -9, -3, -4, -9, 6, -9, -11, -7, -10, -8, -3, -6, -7, -9, -2, -6, -5, -15, -14, 0, -3, 0, -5, 0, 0, -5, -17, 0, }, /* G */
440 { -7, -7, -4, -5, -6, -9, 9, -9, -6, -6, -10, 0, -4, 1, -2, -6, -7, -6, -7, -3, 0, -1, 0, -1, 0, 0, -5, -17, 0, }, /* H */
441 { -5, -6, -7, -5, -2, -11, -9, 8, -6, -1, -1, -5, -8, -8, -5, -7, -2, 2, -14, -6, 0, -6, 0, -6, 0, 0, -5, -17, 0, }, /* I */
442 { -7, -14, -4, -4, -14, -7, -6, -6, 7, -8, -2, -1, -6, -3, 0, -4, -3, -9, -12, -9, 0, -2, 0, -4, 0, 0, -5, -17, 0, }, /* K */
443 { -6, -15, -12, -9, -3, -10, -6, -1, -8, 7, 1, -7, -7, -5, -8, -8, -7, -2, -6, -7, 0, -9, 0, -7, 0, 0, -6, -17, 0, }, /* L */
444 { -5, -13, -11, -7, -4, -8, -10, -1, -2, 1, 11, -9, -8, -4, -4, -5, -4, -1, -13, -11, 0, -10, 0, -5, 0, 0, -5, -17, 0, }, /* M */
445 { -4, -11, 2, -2, -9, -3, 0, -5, -1, -7, -9, 8, -6, -3, -6, 0, -2, -8, -8, -4, 0, 6, 0, -3, 0, 0, -3, -17, 0, }, /* N */
446 { -2, -8, -8, -5, -10, -6, -4, -8, -6, -7, -8, -6, 8, -3, -4, -2, -4, -6, -14, -13, 0, -7, 0, -4, 0, 0, -5, -17, 0, }, /* P */
447 { -4, -14, -2, 1, -13, -7, 1, -8, -3, -5, -4, -3, -3, 8, -2, -5, -5, -7, -13, -12, 0, -3, 0, 6, 0, 0, -5, -17, 0, }, /* Q */
448 { -7, -8, -10, -9, -9, -9, -2, -5, 0, -8, -4, -6, -4, -2, 8, -3, -6, -8, -2, -10, 0, -7, 0, -4, 0, 0, -6, -17, 0, }, /* R */
449 { 0, -3, -4, -4, -6, -2, -6, -7, -4, -8, -5, 0, -2, -5, -3, 6, 0, -6, -5, -7, 0, -1, 0, -5, 0, 0, -3, -17, 0, }, /* S */
450 { -1, -8, -5, -6, -9, -6, -7, -2, -3, -7, -4, -2, -4, -5, -6, 0, 7, -3, -13, -6, 0, -3, 0, -6, 0, 0, -4, -17, 0, }, /* T */
451 { -2, -6, -8, -6, -8, -5, -6, 2, -9, -2, -1, -8, -6, -7, -8, -6, -3, 7, -15, -7, 0, -8, 0, -6, 0, 0, -5, -17, 0, }, /* V */
452 { -13, -15, -15, -17, -4, -15, -7, -14, -12, -6, -13, -8, -14, -13, -2, -5, -13, -15, 13, -5, 0, -10, 0, -14, 0, 0, -11, -17, 0, }, /* W */
453 { -8, -4, -11, -8, 2, -14, -3, -6, -9, -7, -11, -4, -13, -12, -10, -7, -6, -7, -5, 10, 0, -6, 0, -9, 0, 0, -7, -17, 0, }, /* Y */
454 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* - */
455 { -3, -12, 6, 1, -10, -3, -1, -6, -2, -9, -10, 6, -7, -3, -7, -1, -3, -8, -10, -6, 0, 6, 0, 0, 0, 0, -5, -17, 0, }, /* B */
456 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* J */
457 { -3, -14, 1, 6, -13, -5, -1, -6, -4, -7, -5, -3, -4, 6, -4, -5, -6, -6, -14, -9, 0, 0, 0, 6, 0, 0, -5, -17, 0, }, /* Z */
458 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* O */
459 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* U */
460 { -3, -9, -5, -5, -8, -5, -5, -5, -5, -6, -5, -3, -5, -5, -6, -3, -4, -5, -11, -7, 0, -5, 0, -5, 0, 0, -5, -17, 0, }, /* X */
461 { -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 0, -17, 0, -17, 0, 0, -17, 1, 0, }, /* * */
462 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* ~ */
463 }},
464
465 { "PAM70", {
466 /* A C D E F G H I K L M N P Q R S T V W Y - B J Z O U X * ~ */
467 { 5, -4, -1, -1, -6, 0, -4, -2, -4, -4, -3, -2, 0, -2, -4, 1, 1, -1, -9, -5, 0, -1, 0, -1, 0, 0, -2, -11, 0, }, /* A */
468 { -4, 9, -9, -9, -8, -6, -5, -4, -9, -10, -9, -7, -5, -9, -5, -1, -5, -4, -11, -2, 0, -8, 0, -9, 0, 0, -6, -11, 0, }, /* C */
469 { -1, -9, 6, 3, -10, -1, -1, -5, -2, -8, -7, 3, -4, 0, -6, -1, -2, -5, -10, -7, 0, 5, 0, 2, 0, 0, -3, -11, 0, }, /* D */
470 { -1, -9, 3, 6, -9, -2, -2, -4, -2, -6, -4, 0, -3, 2, -5, -2, -3, -4, -11, -6, 0, 2, 0, 5, 0, 0, -3, -11, 0, }, /* E */
471 { -6, -8, -10, -9, 8, -7, -4, 0, -9, -1, -2, -6, -7, -9, -7, -4, -6, -5, -2, 4, 0, -7, 0, -9, 0, 0, -5, -11, 0, }, /* F */
472 { 0, -6, -1, -2, -7, 6, -6, -6, -5, -7, -6, -1, -3, -4, -6, 0, -3, -3, -10, -9, 0, -1, 0, -3, 0, 0, -3, -11, 0, }, /* G */
473 { -4, -5, -1, -2, -4, -6, 8, -6, -3, -4, -6, 1, -2, 2, 0, -3, -4, -4, -5, -1, 0, 0, 0, 1, 0, 0, -3, -11, 0, }, /* H */
474 { -2, -4, -5, -4, 0, -6, -6, 7, -4, 1, 1, -3, -5, -5, -3, -4, -1, 3, -9, -4, 0, -4, 0, -4, 0, 0, -3, -11, 0, }, /* I */
475 { -4, -9, -2, -2, -9, -5, -3, -4, 6, -5, 0, 0, -4, -1, 2, -2, -1, -6, -7, -7, 0, -1, 0, -2, 0, 0, -3, -11, 0, }, /* K */
476 { -4, -10, -8, -6, -1, -7, -4, 1, -5, 6, 2, -5, -5, -3, -6, -6, -4, 0, -4, -4, 0, -6, 0, -4, 0, 0, -4, -11, 0, }, /* L */
477 { -3, -9, -7, -4, -2, -6, -6, 1, 0, 2, 10, -5, -5, -2, -2, -3, -2, 0, -8, -7, 0, -6, 0, -3, 0, 0, -3, -11, 0, }, /* M */
478 { -2, -7, 3, 0, -6, -1, 1, -3, 0, -5, -5, 6, -3, -1, -3, 1, 0, -5, -6, -3, 0, 5, 0, -1, 0, 0, -2, -11, 0, }, /* N */
479 { 0, -5, -4, -3, -7, -3, -2, -5, -4, -5, -5, -3, 7, -1, -2, 0, -2, -3, -9, -9, 0, -4, 0, -2, 0, 0, -3, -11, 0, }, /* P */
480 { -2, -9, 0, 2, -9, -4, 2, -5, -1, -3, -2, -1, -1, 7, 0, -3, -3, -4, -8, -8, 0, -1, 0, 5, 0, 0, -2, -11, 0, }, /* Q */
481 { -4, -5, -6, -5, -7, -6, 0, -3, 2, -6, -2, -3, -2, 0, 8, -1, -4, -5, 0, -7, 0, -4, 0, -2, 0, 0, -3, -11, 0, }, /* R */
482 { 1, -1, -1, -2, -4, 0, -3, -4, -2, -6, -3, 1, 0, -3, -1, 5, 2, -3, -3, -5, 0, 0, 0, -2, 0, 0, -1, -11, 0, }, /* S */
483 { 1, -5, -2, -3, -6, -3, -4, -1, -1, -4, -2, 0, -2, -3, -4, 2, 6, -1, -8, -4, 0, -1, 0, -3, 0, 0, -2, -11, 0, }, /* T */
484 { -1, -4, -5, -4, -5, -3, -4, 3, -6, 0, 0, -5, -3, -4, -5, -3, -1, 6, -10, -5, 0, -5, 0, -4, 0, 0, -2, -11, 0, }, /* V */
485 { -9, -11, -10, -11, -2, -10, -5, -9, -7, -4, -8, -6, -9, -8, 0, -3, -8, -10, 13, -3, 0, -7, 0, -10, 0, 0, -7, -11, 0, }, /* W */
486 { -5, -2, -7, -6, 4, -9, -1, -4, -7, -4, -7, -3, -9, -8, -7, -5, -4, -5, -3, 9, 0, -4, 0, -7, 0, 0, -5, -11, 0, }, /* Y */
487 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* - */
488 { -1, -8, 5, 2, -7, -1, 0, -4, -1, -6, -6, 5, -4, -1, -4, 0, -1, -5, -7, -4, 0, 5, 0, 1, 0, 0, -2, -11, 0, }, /* B */
489 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* J */
490 { -1, -9, 2, 5, -9, -3, 1, -4, -2, -4, -3, -1, -2, 5, -2, -2, -3, -4, -10, -7, 0, 1, 0, 5, 0, 0, -3, -11, 0, }, /* Z */
491 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* O */
492 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* U */
493 { -2, -6, -3, -3, -5, -3, -3, -3, -3, -4, -3, -2, -3, -2, -3, -1, -2, -2, -7, -5, 0, -2, 0, -3, 0, 0, -3, -11, 0, }, /* X */
494 { -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, 0, -11, 0, -11, 0, 0, -11, 1, 0, }, /* * */
495 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* ~ */
496 }},
497
498 { "PAM120", {
499 /* A C D E F G H I K L M N P Q R S T V W Y - B J Z O U X * ~ */
500 { 3, -3, 0, 0, -4, 1, -3, -1, -2, -3, -2, -1, 1, -1, -3, 1, 1, 0, -7, -4, 0, 0, 0, -1, 0, 0, -1, -8, 0, }, /* A */
501 { -3, 9, -7, -7, -6, -4, -4, -3, -7, -7, -6, -5, -4, -7, -4, 0, -3, -3, -8, -1, 0, -6, 0, -7, 0, 0, -4, -8, 0, }, /* C */
502 { 0, -7, 5, 3, -7, 0, 0, -3, -1, -5, -4, 2, -3, 1, -3, 0, -1, -3, -8, -5, 0, 4, 0, 3, 0, 0, -2, -8, 0, }, /* D */
503 { 0, -7, 3, 5, -7, -1, -1, -3, -1, -4, -3, 1, -2, 2, -3, -1, -2, -3, -8, -5, 0, 3, 0, 4, 0, 0, -1, -8, 0, }, /* E */
504 { -4, -6, -7, -7, 8, -5, -3, 0, -7, 0, -1, -4, -5, -6, -5, -3, -4, -3, -1, 4, 0, -5, 0, -6, 0, 0, -3, -8, 0, }, /* F */
505 { 1, -4, 0, -1, -5, 5, -4, -4, -3, -5, -4, 0, -2, -3, -4, 1, -1, -2, -8, -6, 0, 0, 0, -2, 0, 0, -2, -8, 0, }, /* G */
506 { -3, -4, 0, -1, -3, -4, 7, -4, -2, -3, -4, 2, -1, 3, 1, -2, -3, -3, -3, -1, 0, 1, 0, 1, 0, 0, -2, -8, 0, }, /* H */
507 { -1, -3, -3, -3, 0, -4, -4, 6, -3, 1, 1, -2, -3, -3, -2, -2, 0, 3, -6, -2, 0, -3, 0, -3, 0, 0, -1, -8, 0, }, /* I */
508 { -2, -7, -1, -1, -7, -3, -2, -3, 5, -4, 0, 1, -2, 0, 2, -1, -1, -4, -5, -5, 0, 0, 0, -1, 0, 0, -2, -8, 0, }, /* K */
509 { -3, -7, -5, -4, 0, -5, -3, 1, -4, 5, 3, -4, -3, -2, -4, -4, -3, 1, -3, -2, 0, -4, 0, -3, 0, 0, -2, -8, 0, }, /* L */
510 { -2, -6, -4, -3, -1, -4, -4, 1, 0, 3, 8, -3, -3, -1, -1, -2, -1, 1, -6, -4, 0, -4, 0, -2, 0, 0, -2, -8, 0, }, /* M */
511 { -1, -5, 2, 1, -4, 0, 2, -2, 1, -4, -3, 4, -2, 0, -1, 1, 0, -3, -4, -2, 0, 3, 0, 0, 0, 0, -1, -8, 0, }, /* N */
512 { 1, -4, -3, -2, -5, -2, -1, -3, -2, -3, -3, -2, 6, 0, -1, 1, -1, -2, -7, -6, 0, -2, 0, -1, 0, 0, -2, -8, 0, }, /* P */
513 { -1, -7, 1, 2, -6, -3, 3, -3, 0, -2, -1, 0, 0, 6, 1, -2, -2, -3, -6, -5, 0, 0, 0, 4, 0, 0, -1, -8, 0, }, /* Q */
514 { -3, -4, -3, -3, -5, -4, 1, -2, 2, -4, -1, -1, -1, 1, 6, -1, -2, -3, 1, -5, 0, -2, 0, -1, 0, 0, -2, -8, 0, }, /* R */
515 { 1, 0, 0, -1, -3, 1, -2, -2, -1, -4, -2, 1, 1, -2, -1, 3, 2, -2, -2, -3, 0, 0, 0, -1, 0, 0, -1, -8, 0, }, /* S */
516 { 1, -3, -1, -2, -4, -1, -3, 0, -1, -3, -1, 0, -1, -2, -2, 2, 4, 0, -6, -3, 0, 0, 0, -2, 0, 0, -1, -8, 0, }, /* T */
517 { 0, -3, -3, -3, -3, -2, -3, 3, -4, 1, 1, -3, -2, -3, -3, -2, 0, 5, -8, -3, 0, -3, 0, -3, 0, 0, -1, -8, 0, }, /* V */
518 { -7, -8, -8, -8, -1, -8, -3, -6, -5, -3, -6, -4, -7, -6, 1, -2, -6, -8, 12, -2, 0, -6, 0, -7, 0, 0, -5, -8, 0, }, /* W */
519 { -4, -1, -5, -5, 4, -6, -1, -2, -5, -2, -4, -2, -6, -5, -5, -3, -3, -3, -2, 8, 0, -3, 0, -5, 0, 0, -3, -8, 0, }, /* Y */
520 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* - */
521 { 0, -6, 4, 3, -5, 0, 1, -3, 0, -4, -4, 3, -2, 0, -2, 0, 0, -3, -6, -3, 0, 4, 0, 2, 0, 0, -1, -8, 0, }, /* B */
522 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* J */
523 { -1, -7, 3, 4, -6, -2, 1, -3, -1, -3, -2, 0, -1, 4, -1, -1, -2, -3, -7, -5, 0, 2, 0, 4, 0, 0, -1, -8, 0, }, /* Z */
524 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* O */
525 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* U */
526 { -1, -4, -2, -1, -3, -2, -2, -1, -2, -2, -2, -1, -2, -1, -2, -1, -1, -1, -5, -3, 0, -1, 0, -1, 0, 0, -2, -8, 0, }, /* X */
527 { -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, 0, -8, 0, -8, 0, 0, -8, 1, 0, }, /* * */
528 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* ~ */
529 }},
530
531 { "PAM240", {
532 /* A C D E F G H I K L M N P Q R S T V W Y - B J Z O U X * ~ */
533 { 2, -2, 0, 0, -4, 1, -1, -1, -1, -2, -1, 0, 1, 0, -2, 1, 1, 0, -6, -4, 0, 0, 0, 0, 0, 0, 0, -8, 0, }, /* A */
534 { -2, 12, -5, -6, -5, -4, -4, -2, -6, -6, -5, -4, -3, -6, -4, 0, -2, -2, -8, 0, 0, -5, 0, -6, 0, 0, -3, -8, 0, }, /* C */
535 { 0, -5, 4, 4, -6, 1, 1, -2, 0, -4, -3, 2, -1, 2, -1, 0, 0, -2, -7, -4, 0, 3, 0, 3, 0, 0, -1, -8, 0, }, /* D */
536 { 0, -6, 4, 4, -6, 0, 1, -2, 0, -3, -2, 1, -1, 3, -1, 0, 0, -2, -7, -4, 0, 3, 0, 3, 0, 0, -1, -8, 0, }, /* E */
537 { -4, -5, -6, -6, 9, -5, -2, 1, -5, 2, 0, -4, -5, -5, -5, -3, -3, -1, 0, 7, 0, -5, 0, -5, 0, 0, -2, -8, 0, }, /* F */
538 { 1, -4, 1, 0, -5, 5, -2, -3, -2, -4, -3, 0, -1, -1, -3, 1, 0, -1, -7, -5, 0, 0, 0, 0, 0, 0, -1, -8, 0, }, /* G */
539 { -1, -4, 1, 1, -2, -2, 7, -3, 0, -2, -2, 2, 0, 3, 2, -1, -1, -2, -3, 0, 0, 1, 0, 2, 0, 0, -1, -8, 0, }, /* H */
540 { -1, -2, -2, -2, 1, -3, -3, 5, -2, 2, 2, -2, -2, -2, -2, -1, 0, 4, -5, -1, 0, -2, 0, -2, 0, 0, -1, -8, 0, }, /* I */
541 { -1, -6, 0, 0, -5, -2, 0, -2, 5, -3, 0, 1, -1, 1, 3, 0, 0, -3, -4, -5, 0, 1, 0, 0, 0, 0, -1, -8, 0, }, /* K */
542 { -2, -6, -4, -3, 2, -4, -2, 2, -3, 6, 4, -3, -3, -2, -3, -3, -2, 2, -2, -1, 0, -4, 0, -3, 0, 0, -1, -8, 0, }, /* L */
543 { -1, -5, -3, -2, 0, -3, -2, 2, 0, 4, 7, -2, -2, -1, 0, -2, -1, 2, -4, -3, 0, -2, 0, -2, 0, 0, -1, -8, 0, }, /* M */
544 { 0, -4, 2, 1, -4, 0, 2, -2, 1, -3, -2, 2, -1, 1, 0, 1, 0, -2, -4, -2, 0, 2, 0, 1, 0, 0, 0, -8, 0, }, /* N */
545 { 1, -3, -1, -1, -5, -1, 0, -2, -1, -3, -2, -1, 6, 0, 0, 1, 0, -1, -6, -5, 0, -1, 0, 0, 0, 0, -1, -8, 0, }, /* P */
546 { 0, -6, 2, 3, -5, -1, 3, -2, 1, -2, -1, 1, 0, 4, 1, -1, -1, -2, -5, -4, 0, 1, 0, 3, 0, 0, -1, -8, 0, }, /* Q */
547 { -2, -4, -1, -1, -5, -3, 2, -2, 3, -3, 0, 0, 0, 1, 6, 0, -1, -3, 2, -4, 0, -1, 0, 0, 0, 0, -1, -8, 0, }, /* R */
548 { 1, 0, 0, 0, -3, 1, -1, -1, 0, -3, -2, 1, 1, -1, 0, 2, 1, -1, -3, -3, 0, 0, 0, 0, 0, 0, 0, -8, 0, }, /* S */
549 { 1, -2, 0, 0, -3, 0, -1, 0, 0, -2, -1, 0, 0, -1, -1, 1, 3, 0, -5, -3, 0, 0, 0, -1, 0, 0, 0, -8, 0, }, /* T */
550 { 0, -2, -2, -2, -1, -1, -2, 4, -3, 2, 2, -2, -1, -2, -3, -1, 0, 4, -6, -3, 0, -2, 0, -2, 0, 0, -1, -8, 0, }, /* V */
551 { -6, -8, -7, -7, 0, -7, -3, -5, -4, -2, -4, -4, -6, -5, 2, -3, -5, -6, 17, 0, 0, -5, 0, -6, 0, 0, -4, -8, 0, }, /* W */
552 { -4, 0, -4, -4, 7, -5, 0, -1, -5, -1, -3, -2, -5, -4, -4, -3, -3, -3, 0, 10, 0, -3, 0, -4, 0, 0, -2, -8, 0, }, /* Y */
553 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* - */
554 { 0, -5, 3, 3, -5, 0, 1, -2, 1, -4, -2, 2, -1, 1, -1, 0, 0, -2, -5, -3, 0, 3, 0, 2, 0, 0, -1, -8, 0, }, /* B */
555 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* J */
556 { 0, -6, 3, 3, -5, 0, 2, -2, 0, -3, -2, 1, 0, 3, 0, 0, -1, -2, -6, -4, 0, 2, 0, 3, 0, 0, -1, -8, 0, }, /* Z */
557 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* O */
558 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* U */
559 { 0, -3, -1, -1, -2, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, 0, 0, -1, -4, -2, 0, -1, 0, -1, 0, 0, -1, -8, 0, }, /* X */
560 { -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, 0, -8, 0, -8, 0, 0, -8, 1, 0, }, /* * */
561 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* ~ */
562 }},
563
564 { "BLOSUM45", {
565 /* A C D E F G H I K L M N P Q R S T V W Y - B J Z O U X * ~ */
566 { 5, -1, -2, -1, -2, 0, -2, -1, -1, -1, -1, -1, -1, -1, -2, 1, 0, 0, -2, -2, 0, -1, 0, -1, 0, 0, 0, -5, 0, }, /* A */
567 { -1, 12, -3, -3, -2, -3, -3, -3, -3, -2, -2, -2, -4, -3, -3, -1, -1, -1, -5, -3, 0, -2, 0, -3, 0, 0, -2, -5, 0, }, /* C */
568 { -2, -3, 7, 2, -4, -1, 0, -4, 0, -3, -3, 2, -1, 0, -1, 0, -1, -3, -4, -2, 0, 5, 0, 1, 0, 0, -1, -5, 0, }, /* D */
569 { -1, -3, 2, 6, -3, -2, 0, -3, 1, -2, -2, 0, 0, 2, 0, 0, -1, -3, -3, -2, 0, 1, 0, 4, 0, 0, -1, -5, 0, }, /* E */
570 { -2, -2, -4, -3, 8, -3, -2, 0, -3, 1, 0, -2, -3, -4, -2, -2, -1, 0, 1, 3, 0, -3, 0, -3, 0, 0, -1, -5, 0, }, /* F */
571 { 0, -3, -1, -2, -3, 7, -2, -4, -2, -3, -2, 0, -2, -2, -2, 0, -2, -3, -2, -3, 0, -1, 0, -2, 0, 0, -1, -5, 0, }, /* G */
572 { -2, -3, 0, 0, -2, -2, 10, -3, -1, -2, 0, 1, -2, 1, 0, -1, -2, -3, -3, 2, 0, 0, 0, 0, 0, 0, -1, -5, 0, }, /* H */
573 { -1, -3, -4, -3, 0, -4, -3, 5, -3, 2, 2, -2, -2, -2, -3, -2, -1, 3, -2, 0, 0, -3, 0, -3, 0, 0, -1, -5, 0, }, /* I */
574 { -1, -3, 0, 1, -3, -2, -1, -3, 5, -3, -1, 0, -1, 1, 3, -1, -1, -2, -2, -1, 0, 0, 0, 1, 0, 0, -1, -5, 0, }, /* K */
575 { -1, -2, -3, -2, 1, -3, -2, 2, -3, 5, 2, -3, -3, -2, -2, -3, -1, 1, -2, 0, 0, -3, 0, -2, 0, 0, -1, -5, 0, }, /* L */
576 { -1, -2, -3, -2, 0, -2, 0, 2, -1, 2, 6, -2, -2, 0, -1, -2, -1, 1, -2, 0, 0, -2, 0, -1, 0, 0, -1, -5, 0, }, /* M */
577 { -1, -2, 2, 0, -2, 0, 1, -2, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -2, 0, 4, 0, 0, 0, 0, -1, -5, 0, }, /* N */
578 { -1, -4, -1, 0, -3, -2, -2, -2, -1, -3, -2, -2, 9, -1, -2, -1, -1, -3, -3, -3, 0, -2, 0, -1, 0, 0, -1, -5, 0, }, /* P */
579 { -1, -3, 0, 2, -4, -2, 1, -2, 1, -2, 0, 0, -1, 6, 1, 0, -1, -3, -2, -1, 0, 0, 0, 4, 0, 0, -1, -5, 0, }, /* Q */
580 { -2, -3, -1, 0, -2, -2, 0, -3, 3, -2, -1, 0, -2, 1, 7, -1, -1, -2, -2, -1, 0, -1, 0, 0, 0, 0, -1, -5, 0, }, /* R */
581 { 1, -1, 0, 0, -2, 0, -1, -2, -1, -3, -2, 1, -1, 0, -1, 4, 2, -1, -4, -2, 0, 0, 0, 0, 0, 0, 0, -5, 0, }, /* S */
582 { 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 2, 5, 0, -3, -1, 0, 0, 0, -1, 0, 0, 0, -5, 0, }, /* T */
583 { 0, -1, -3, -3, 0, -3, -3, 3, -2, 1, 1, -3, -3, -3, -2, -1, 0, 5, -3, -1, 0, -3, 0, -3, 0, 0, -1, -5, 0, }, /* V */
584 { -2, -5, -4, -3, 1, -2, -3, -2, -2, -2, -2, -4, -3, -2, -2, -4, -3, -3, 15, 3, 0, -4, 0, -2, 0, 0, -2, -5, 0, }, /* W */
585 { -2, -3, -2, -2, 3, -3, 2, 0, -1, 0, 0, -2, -3, -1, -1, -2, -1, -1, 3, 8, 0, -2, 0, -2, 0, 0, -1, -5, 0, }, /* Y */
586 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* - */
587 { -1, -2, 5, 1, -3, -1, 0, -3, 0, -3, -2, 4, -2, 0, -1, 0, 0, -3, -4, -2, 0, 4, 0, 2, 0, 0, -1, -5, 0, }, /* B */
588 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* J */
589 { -1, -3, 1, 4, -3, -2, 0, -3, 1, -2, -1, 0, -1, 4, 0, 0, -1, -3, -2, -2, 0, 2, 0, 4, 0, 0, -1, -5, 0, }, /* Z */
590 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* O */
591 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* U */
592 { 0, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, -2, -1, 0, -1, 0, -1, 0, 0, -1, -5, 0, }, /* X */
593 { -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 0, -5, 0, -5, 0, 0, -5, 1, 0, }, /* * */
594 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* ~ */
595 }},
596
597 { "BLOSUM50", {
598 /* A C D E F G H I K L M N P Q R S T V W Y - B J Z O U X * ~ */
599 { 5, -1, -2, -1, -3, 0, -2, -1, -1, -2, -1, -1, -1, -1, -2, 1, 0, 0, -3, -2, 0, -2, 0, -1, 0, 0, -1, -5, 0, }, /* A */
600 { -1, 13, -4, -3, -2, -3, -3, -2, -3, -2, -2, -2, -4, -3, -4, -1, -1, -1, -5, -3, 0, -3, 0, -3, 0, 0, -2, -5, 0, }, /* C */
601 { -2, -4, 8, 2, -5, -1, -1, -4, -1, -4, -4, 2, -1, 0, -2, 0, -1, -4, -5, -3, 0, 5, 0, 1, 0, 0, -1, -5, 0, }, /* D */
602 { -1, -3, 2, 6, -3, -3, 0, -4, 1, -3, -2, 0, -1, 2, 0, -1, -1, -3, -3, -2, 0, 1, 0, 5, 0, 0, -1, -5, 0, }, /* E */
603 { -3, -2, -5, -3, 8, -4, -1, 0, -4, 1, 0, -4, -4, -4, -3, -3, -2, -1, 1, 4, 0, -4, 0, -4, 0, 0, -2, -5, 0, }, /* F */
604 { 0, -3, -1, -3, -4, 8, -2, -4, -2, -4, -3, 0, -2, -2, -3, 0, -2, -4, -3, -3, 0, -1, 0, -2, 0, 0, -2, -5, 0, }, /* G */
605 { -2, -3, -1, 0, -1, -2, 10, -4, 0, -3, -1, 1, -2, 1, 0, -1, -2, -4, -3, 2, 0, 0, 0, 0, 0, 0, -1, -5, 0, }, /* H */
606 { -1, -2, -4, -4, 0, -4, -4, 5, -3, 2, 2, -3, -3, -3, -4, -3, -1, 4, -3, -1, 0, -4, 0, -3, 0, 0, -1, -5, 0, }, /* I */
607 { -1, -3, -1, 1, -4, -2, 0, -3, 6, -3, -2, 0, -1, 2, 3, 0, -1, -3, -3, -2, 0, 0, 0, 1, 0, 0, -1, -5, 0, }, /* K */
608 { -2, -2, -4, -3, 1, -4, -3, 2, -3, 5, 3, -4, -4, -2, -3, -3, -1, 1, -2, -1, 0, -4, 0, -3, 0, 0, -1, -5, 0, }, /* L */
609 { -1, -2, -4, -2, 0, -3, -1, 2, -2, 3, 7, -2, -3, 0, -2, -2, -1, 1, -1, 0, 0, -3, 0, -1, 0, 0, -1, -5, 0, }, /* M */
610 { -1, -2, 2, 0, -4, 0, 1, -3, 0, -4, -2, 7, -2, 0, -1, 1, 0, -3, -4, -2, 0, 4, 0, 0, 0, 0, -1, -5, 0, }, /* N */
611 { -1, -4, -1, -1, -4, -2, -2, -3, -1, -4, -3, -2, 10, -1, -3, -1, -1, -3, -4, -3, 0, -2, 0, -1, 0, 0, -2, -5, 0, }, /* P */
612 { -1, -3, 0, 2, -4, -2, 1, -3, 2, -2, 0, 0, -1, 7, 1, 0, -1, -3, -1, -1, 0, 0, 0, 4, 0, 0, -1, -5, 0, }, /* Q */
613 { -2, -4, -2, 0, -3, -3, 0, -4, 3, -3, -2, -1, -3, 1, 7, -1, -1, -3, -3, -1, 0, -1, 0, 0, 0, 0, -1, -5, 0, }, /* R */
614 { 1, -1, 0, -1, -3, 0, -1, -3, 0, -3, -2, 1, -1, 0, -1, 5, 2, -2, -4, -2, 0, 0, 0, 0, 0, 0, -1, -5, 0, }, /* S */
615 { 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 2, 5, 0, -3, -2, 0, 0, 0, -1, 0, 0, 0, -5, 0, }, /* T */
616 { 0, -1, -4, -3, -1, -4, -4, 4, -3, 1, 1, -3, -3, -3, -3, -2, 0, 5, -3, -1, 0, -4, 0, -3, 0, 0, -1, -5, 0, }, /* V */
617 { -3, -5, -5, -3, 1, -3, -3, -3, -3, -2, -1, -4, -4, -1, -3, -4, -3, -3, 15, 2, 0, -5, 0, -2, 0, 0, -3, -5, 0, }, /* W */
618 { -2, -3, -3, -2, 4, -3, 2, -1, -2, -1, 0, -2, -3, -1, -1, -2, -2, -1, 2, 8, 0, -3, 0, -2, 0, 0, -1, -5, 0, }, /* Y */
619 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* - */
620 { -2, -3, 5, 1, -4, -1, 0, -4, 0, -4, -3, 4, -2, 0, -1, 0, 0, -4, -5, -3, 0, 5, 0, 2, 0, 0, -1, -5, 0, }, /* B */
621 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* J */
622 { -1, -3, 1, 5, -4, -2, 0, -3, 1, -3, -1, 0, -1, 4, 0, 0, -1, -3, -2, -2, 0, 2, 0, 5, 0, 0, -1, -5, 0, }, /* Z */
623 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* O */
624 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* U */
625 { -1, -2, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, 0, -1, -3, -1, 0, -1, 0, -1, 0, 0, -1, -5, 0, }, /* X */
626 { -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 0, -5, 0, -5, 0, 0, -5, 1, 0, }, /* * */
627 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* ~ */
628 }},
629
630 { "BLOSUM62", {
631 /* A C D E F G H I K L M N P Q R S T V W Y - B J Z O U X * ~ */
632 { 4, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, -2, 0, -2, 0, -1, 0, 0, 0, -4, 0, }, /* A */
633 { 0, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2, 0, -3, 0, -3, 0, 0, -2, -4, 0, }, /* C */
634 { -2, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -3, 0, 4, 0, 1, 0, 0, -1, -4, 0, }, /* D */
635 { -1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -2, 0, 1, 0, 4, 0, 0, -1, -4, 0, }, /* E */
636 { -2, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, 3, 0, -3, 0, -3, 0, 0, -1, -4, 0, }, /* F */
637 { 0, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -3, 0, -1, 0, -2, 0, 0, -1, -4, 0, }, /* G */
638 { -2, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, 2, 0, 0, 0, 0, 0, 0, -1, -4, 0, }, /* H */
639 { -1, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1, 0, -3, 0, -3, 0, 0, -1, -4, 0, }, /* I */
640 { -1, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -2, 0, 0, 0, 1, 0, 0, -1, -4, 0, }, /* K */
641 { -1, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1, 0, -4, 0, -3, 0, 0, -1, -4, 0, }, /* L */
642 { -1, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1, 0, -3, 0, -1, 0, 0, -1, -4, 0, }, /* M */
643 { -2, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -2, 0, 3, 0, 0, 0, 0, -1, -4, 0, }, /* N */
644 { -1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -3, 0, -2, 0, -1, 0, 0, -2, -4, 0, }, /* P */
645 { -1, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1, 0, 0, 0, 3, 0, 0, -1, -4, 0, }, /* Q */
646 { -1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -2, 0, -1, 0, 0, 0, 0, -1, -4, 0, }, /* R */
647 { 1, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, -2, 0, 0, 0, 0, 0, 0, 0, -4, 0, }, /* S */
648 { 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, -2, 0, -1, 0, -1, 0, 0, 0, -4, 0, }, /* T */
649 { 0, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1, 0, -3, 0, -2, 0, 0, -1, -4, 0, }, /* V */
650 { -3, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, 2, 0, -4, 0, -3, 0, 0, -2, -4, 0, }, /* W */
651 { -2, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, 7, 0, -3, 0, -2, 0, 0, -1, -4, 0, }, /* Y */
652 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* - */
653 { -2, -3, 4, 1, -3, -1, 0, -3, 0, -4, -3, 3, -2, 0, -1, 0, -1, -3, -4, -3, 0, 4, 0, 1, 0, 0, -1, -4, 0, }, /* B */
654 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* J */
655 { -1, -3, 1, 4, -3, -2, 0, -3, 1, -3, -1, 0, -1, 3, 0, 0, -1, -2, -3, -2, 0, 1, 0, 4, 0, 0, -1, -4, 0, }, /* Z */
656 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* O */
657 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* U */
658 { 0, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, 0, 0, -1, -2, -1, 0, -1, 0, -1, 0, 0, -1, -4, 0, }, /* X */
659 { -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 0, -4, 0, -4, 0, 0, -4, 1, 0, }, /* * */
660 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* ~ */
661 }},
662
663 { "BLOSUM80", {
664 /* A C D E F G H I K L M N P Q R S T V W Y - B J Z O U X * ~ */
665 { 7, -1, -3, -2, -4, 0, -3, -3, -1, -3, -2, -3, -1, -2, -3, 2, 0, -1, -5, -4, 0, -3, 0, -2, 0, 0, -1, -8, 0, }, /* A */
666 { -1, 13, -7, -7, -4, -6, -7, -2, -6, -3, -3, -5, -6, -5, -6, -2, -2, -2, -5, -5, 0, -6, 0, -7, 0, 0, -4, -8, 0, }, /* C */
667 { -3, -7, 10, 2, -6, -3, -2, -7, -2, -7, -6, 2, -3, -1, -3, -1, -2, -6, -8, -6, 0, 6, 0, 1, 0, 0, -3, -8, 0, }, /* D */
668 { -2, -7, 2, 8, -6, -4, 0, -6, 1, -6, -4, -1, -2, 3, -1, -1, -2, -4, -6, -5, 0, 1, 0, 6, 0, 0, -2, -8, 0, }, /* E */
669 { -4, -4, -6, -6, 10, -6, -2, -1, -5, 0, 0, -6, -6, -5, -5, -4, -4, -2, 0, 4, 0, -6, 0, -6, 0, 0, -3, -8, 0, }, /* F */
670 { 0, -6, -3, -4, -6, 9, -4, -7, -3, -7, -5, -1, -5, -4, -4, -1, -3, -6, -6, -6, 0, -2, 0, -4, 0, 0, -3, -8, 0, }, /* G */
671 { -3, -7, -2, 0, -2, -4, 12, -6, -1, -5, -4, 1, -4, 1, 0, -2, -3, -5, -4, 3, 0, -1, 0, 0, 0, 0, -2, -8, 0, }, /* H */
672 { -3, -2, -7, -6, -1, -7, -6, 7, -5, 2, 2, -6, -5, -5, -5, -4, -2, 4, -5, -3, 0, -6, 0, -6, 0, 0, -2, -8, 0, }, /* I */
673 { -1, -6, -2, 1, -5, -3, -1, -5, 8, -4, -3, 0, -2, 2, 3, -1, -1, -4, -6, -4, 0, -1, 0, 1, 0, 0, -2, -8, 0, }, /* K */
674 { -3, -3, -7, -6, 0, -7, -5, 2, -4, 6, 3, -6, -5, -4, -4, -4, -3, 1, -4, -2, 0, -7, 0, -5, 0, 0, -2, -8, 0, }, /* L */
675 { -2, -3, -6, -4, 0, -5, -4, 2, -3, 3, 9, -4, -4, -1, -3, -3, -1, 1, -3, -3, 0, -5, 0, -3, 0, 0, -2, -8, 0, }, /* M */
676 { -3, -5, 2, -1, -6, -1, 1, -6, 0, -6, -4, 9, -4, 0, -1, 1, 0, -5, -7, -4, 0, 5, 0, -1, 0, 0, -2, -8, 0, }, /* N */
677 { -1, -6, -3, -2, -6, -5, -4, -5, -2, -5, -4, -4, 12, -3, -3, -2, -3, -4, -7, -6, 0, -4, 0, -2, 0, 0, -3, -8, 0, }, /* P */
678 { -2, -5, -1, 3, -5, -4, 1, -5, 2, -4, -1, 0, -3, 9, 1, -1, -1, -4, -4, -3, 0, -1, 0, 5, 0, 0, -2, -8, 0, }, /* Q */
679 { -3, -6, -3, -1, -5, -4, 0, -5, 3, -4, -3, -1, -3, 1, 9, -2, -2, -4, -5, -4, 0, -2, 0, 0, 0, 0, -2, -8, 0, }, /* R */
680 { 2, -2, -1, -1, -4, -1, -2, -4, -1, -4, -3, 1, -2, -1, -2, 7, 2, -3, -6, -3, 0, 0, 0, -1, 0, 0, -1, -8, 0, }, /* S */
681 { 0, -2, -2, -2, -4, -3, -3, -2, -1, -3, -1, 0, -3, -1, -2, 2, 8, 0, -5, -3, 0, -1, 0, -2, 0, 0, -1, -8, 0, }, /* T */
682 { -1, -2, -6, -4, -2, -6, -5, 4, -4, 1, 1, -5, -4, -4, -4, -3, 0, 7, -5, -3, 0, -6, 0, -4, 0, 0, -2, -8, 0, }, /* V */
683 { -5, -5, -8, -6, 0, -6, -4, -5, -6, -4, -3, -7, -7, -4, -5, -6, -5, -5, 16, 3, 0, -8, 0, -5, 0, 0, -5, -8, 0, }, /* W */
684 { -4, -5, -6, -5, 4, -6, 3, -3, -4, -2, -3, -4, -6, -3, -4, -3, -3, -3, 3, 11, 0, -5, 0, -4, 0, 0, -3, -8, 0, }, /* Y */
685 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* - */
686 { -3, -6, 6, 1, -6, -2, -1, -6, -1, -7, -5, 5, -4, -1, -2, 0, -1, -6, -8, -5, 0, 6, 0, 0, 0, 0, -3, -8, 0, }, /* B */
687 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* J */
688 { -2, -7, 1, 6, -6, -4, 0, -6, 1, -5, -3, -1, -2, 5, 0, -1, -2, -4, -5, -4, 0, 0, 0, 6, 0, 0, -1, -8, 0, }, /* Z */
689 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* O */
690 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* U */
691 { -1, -4, -3, -2, -3, -3, -2, -2, -2, -2, -2, -2, -3, -2, -2, -1, -1, -2, -5, -3, 0, -3, 0, -1, 0, 0, -2, -8, 0, }, /* X */
692 { -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, -8, 0, -8, 0, -8, 0, 0, -8, 1, 0, }, /* * */
693 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* ~ */
694 }},
695
696 { "BLOSUM90", {
697 /* A C D E F G H I K L M N P Q R S T V W Y - B J Z O U X * ~ */
698 { 5, -1, -3, -1, -3, 0, -2, -2, -1, -2, -2, -2, -1, -1, -2, 1, 0, -1, -4, -3, 0, -2, 0, -1, 0, 0, -1, -6, 0, }, /* A */
699 { -1, 9, -5, -6, -3, -4, -5, -2, -4, -2, -2, -4, -4, -4, -5, -2, -2, -2, -4, -4, 0, -4, 0, -5, 0, 0, -3, -6, 0, }, /* C */
700 { -3, -5, 7, 1, -5, -2, -2, -5, -1, -5, -4, 1, -3, -1, -3, -1, -2, -5, -6, -4, 0, 4, 0, 0, 0, 0, -2, -6, 0, }, /* D */
701 { -1, -6, 1, 6, -5, -3, -1, -4, 0, -4, -3, -1, -2, 2, -1, -1, -1, -3, -5, -4, 0, 0, 0, 4, 0, 0, -2, -6, 0, }, /* E */
702 { -3, -3, -5, -5, 7, -5, -2, -1, -4, 0, -1, -4, -4, -4, -4, -3, -3, -2, 0, 3, 0, -4, 0, -4, 0, 0, -2, -6, 0, }, /* F */
703 { 0, -4, -2, -3, -5, 6, -3, -5, -2, -5, -4, -1, -3, -3, -3, -1, -3, -5, -4, -5, 0, -2, 0, -3, 0, 0, -2, -6, 0, }, /* G */
704 { -2, -5, -2, -1, -2, -3, 8, -4, -1, -4, -3, 0, -3, 1, 0, -2, -2, -4, -3, 1, 0, -1, 0, 0, 0, 0, -2, -6, 0, }, /* H */
705 { -2, -2, -5, -4, -1, -5, -4, 5, -4, 1, 1, -4, -4, -4, -4, -3, -1, 3, -4, -2, 0, -5, 0, -4, 0, 0, -2, -6, 0, }, /* I */
706 { -1, -4, -1, 0, -4, -2, -1, -4, 6, -3, -2, 0, -2, 1, 2, -1, -1, -3, -5, -3, 0, -1, 0, 1, 0, 0, -1, -6, 0, }, /* K */
707 { -2, -2, -5, -4, 0, -5, -4, 1, -3, 5, 2, -4, -4, -3, -3, -3, -2, 0, -3, -2, 0, -5, 0, -4, 0, 0, -2, -6, 0, }, /* L */
708 { -2, -2, -4, -3, -1, -4, -3, 1, -2, 2, 7, -3, -3, 0, -2, -2, -1, 0, -2, -2, 0, -4, 0, -2, 0, 0, -1, -6, 0, }, /* M */
709 { -2, -4, 1, -1, -4, -1, 0, -4, 0, -4, -3, 7, -3, 0, -1, 0, 0, -4, -5, -3, 0, 4, 0, -1, 0, 0, -2, -6, 0, }, /* N */
710 { -1, -4, -3, -2, -4, -3, -3, -4, -2, -4, -3, -3, 8, -2, -3, -2, -2, -3, -5, -4, 0, -3, 0, -2, 0, 0, -2, -6, 0, }, /* P */
711 { -1, -4, -1, 2, -4, -3, 1, -4, 1, -3, 0, 0, -2, 7, 1, -1, -1, -3, -3, -3, 0, -1, 0, 4, 0, 0, -1, -6, 0, }, /* Q */
712 { -2, -5, -3, -1, -4, -3, 0, -4, 2, -3, -2, -1, -3, 1, 6, -1, -2, -3, -4, -3, 0, -2, 0, 0, 0, 0, -2, -6, 0, }, /* R */
713 { 1, -2, -1, -1, -3, -1, -2, -3, -1, -3, -2, 0, -2, -1, -1, 5, 1, -2, -4, -3, 0, 0, 0, -1, 0, 0, -1, -6, 0, }, /* S */
714 { 0, -2, -2, -1, -3, -3, -2, -1, -1, -2, -1, 0, -2, -1, -2, 1, 6, -1, -4, -2, 0, -1, 0, -1, 0, 0, -1, -6, 0, }, /* T */
715 { -1, -2, -5, -3, -2, -5, -4, 3, -3, 0, 0, -4, -3, -3, -3, -2, -1, 5, -3, -3, 0, -4, 0, -3, 0, 0, -2, -6, 0, }, /* V */
716 { -4, -4, -6, -5, 0, -4, -3, -4, -5, -3, -2, -5, -5, -3, -4, -4, -4, -3, 11, 2, 0, -6, 0, -4, 0, 0, -3, -6, 0, }, /* W */
717 { -3, -4, -4, -4, 3, -5, 1, -2, -3, -2, -2, -3, -4, -3, -3, -3, -2, -3, 2, 8, 0, -4, 0, -3, 0, 0, -2, -6, 0, }, /* Y */
718 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* - */
719 { -2, -4, 4, 0, -4, -2, -1, -5, -1, -5, -4, 4, -3, -1, -2, 0, -1, -4, -6, -4, 0, 4, 0, 0, 0, 0, -2, -6, 0, }, /* B */
720 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* J */
721 { -1, -5, 0, 4, -4, -3, 0, -4, 1, -4, -2, -1, -2, 4, 0, -1, -1, -3, -4, -3, 0, 0, 0, 4, 0, 0, -1, -6, 0, }, /* Z */
722 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* O */
723 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* U */
724 { -1, -3, -2, -2, -2, -2, -2, -2, -1, -2, -1, -2, -2, -1, -2, -1, -1, -2, -3, -2, 0, -2, 0, -1, 0, 0, -2, -6, 0, }, /* X */
725 { -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, 0, -6, 0, -6, 0, 0, -6, 1, 0, }, /* * */
726 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* ~ */
727 }},
728 };
729
730
731 #define eslNTDIM 18
732
733 struct esl_scorematrix_nt_preload_s {
734 char *name;
735 int matrix[eslNTDIM][eslNTDIM];
736 };
737
738 /* "DNA1" matrix
739 *
740 * Travis Wheeler created the "DNA1" custom matrix for nhmmer. It's
741 * derived from the DNA prior (see <p7_prior_CreateNucleic()>), by
742 * computing mean posterior joint probabilities p_ij for a single
743 * observed count of each residue, assuming uniform background, and
744 * symmetricizing the result by taking means; then calling
745 * <esl_scorematrix_SetFromProbs()> with lambda = 0.02.
746 *
747 * The p_ij matrix was:
748 * A C G T
749 * 0.143 0.033 0.037 0.037 A
750 * 0.033 0.136 0.029 0.044 C
751 * 0.037 0.029 0.157 0.034 G
752 * 0.037 0.044 0.034 0.136 T
753 *
754 * Travis estimated the DNA prior from a subset of Rfam 10.0 seed
755 * alignments, based on a procedure from Eric Nawrocki: remove
756 * columns with >50% gaps, collect weighted counts, and estimate
757 * a four-component Dirichlet mixture.
758 *
759 * [xref email from Travis 8/21/2017]
760 *
761 */
762 static const struct esl_scorematrix_nt_preload_s ESL_SCOREMATRIX_NT_PRELOADS[] = {
763 { "DNA1", {
764 /* A C G T - R Y M K S W H B V D N * ~ */
765 { 41, -32, -26, -26, 0, 18, -29, 17, -26, -29, 18, 6, -28, 6, 7, 0, -38, 0, }, /*A*/
766 { -32, 39, -38, -17, 0, -35, 18, 15, -26, 14, -24, 6, 6, 3, -28, -1, -38, 0, }, /*C*/
767 { -26, -38, 46, -31, 0, 22, -34, -32, 21, 20, -29, -32, 8, 9, 10, 1, -38, 0, }, /*G*/
768 { -26, -17, -31, 39, 0, -28, 18, -21, 15, -23, 16, 7, 7, -24, 5, 0, -38, 0, }, /*T*/
769 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /*-*/
770 { 18, -35, 22, -28, 0, 20, -32, -2, 3, 1, 0, -9, -7, 7, 8, 1, -38, 0, }, /*R*/
771 { -29, 18, -34, 18, 0, -32, 18, 0, -1, -1, 0, 7, 6, -9, -9, -1, -38, 0, }, /*Y*/
772 { 17, 15, -32, -21, 0, -2, 0, 16, -26, -3, 1, 6, -8, 4, -7, -1, -38, 0, }, /*M*/
773 { -26, -26, 21, 15, 0, 3, -1, -26, 18, 3, -1, -8, 7, -5, 7, 1, -38, 0, }, /*K*/
774 { -29, 14, 20, -23, 0, 1, -1, -3, 3, 17, -26, -9, 7, 6, -6, 0, -38, 0, }, /*S*/
775 { 18, -24, -29, 16, 0, 0, 0, 1, -1, -26, 17, 7, -8, -7, 6, 0, -38, 0, }, /*W*/
776 { 6, 6, -32, 7, 0, -9, 7, 6, -8, -9, 7, 7, -3, -3, -3, 0, -38, 0, }, /*H*/
777 { -28, 6, 8, 7, 0, -7, 6, -8, 7, 7, -8, -3, 7, -2, -2, 0, -38, 0, }, /*B*/
778 { 6, 3, 9, -24, 0, 7, -9, 4, -5, 6, -7, -3, -2, 6, -1, 0, -38, 0, }, /*V*/
779 { 7, -28, 10, 5, 0, 8, -9, -7, 7, -6, 6, -3, -2, -1, 7, 0, -38, 0, }, /*D*/
780 { 0, -1, 1, 0, 0, 1, -1, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /*N*/
781 { -38, -38, -38, -38, 0, -38, -38, -38, -38, -38, -38, -38, -38, -38, -38, 0, -38, 0, }, /***/
782 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /*~*/
783 }},
784
785 };
786
787
788
789
790
791
792 /* Function: esl_scorematrix_Set()
793 * Synopsis: Set one of several standard matrices.
794 *
795 * Purpose: Set the allocated score matrix <S> to standard score
796 * matrix <name>, where <name> is the name of one of
797 * several matrices built-in to Easel. For example,
798 * <esl_scorematrix_Set("BLOSUM62", S)>.
799 *
800 * The alphabet for <S> (<S->abc_r>) must be set already.
801 *
802 * Built-in amino acid score matrices in Easel include
803 * BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, PAM30,
804 * PAM70, PAM120, and PAM240.
805 *
806 * Returns: <eslOK> on success, and the scores in <S> are set.
807 *
808 * <eslENOTFOUND> if <name> is not available as a built-in matrix
809 * for the alphabet that's set in <S>.
810 *
811 * Throws: <eslEMEM> on allocation error.
812 */
813 int
esl_scorematrix_Set(const char * name,ESL_SCOREMATRIX * S)814 esl_scorematrix_Set(const char *name, ESL_SCOREMATRIX *S)
815 {
816 int which;
817 int x, y;
818
819 if (S->abc_r->type == eslAMINO)
820 {
821 int nmat = sizeof(ESL_SCOREMATRIX_AA_PRELOADS) / sizeof(struct esl_scorematrix_aa_preload_s);
822 for (which = 0; which < nmat; which++)
823 if (strcmp(ESL_SCOREMATRIX_AA_PRELOADS[which].name, name) == 0) break;
824 if (which >= nmat) return eslENOTFOUND;
825
826 ESL_DASSERT1(( S->Kp >= 24 )); // strcpy below is safe. The assertion tries to convince static analyzer of that.
827 strcpy(S->outorder, "ARNDCQEGHILKMFPSTWYVBZX*");
828 /* All standard PAM, BLOSUM matrices have same list of valid
829 * residues. If that ever changes, make <outorder> a data elem in the
830 * structures above.
831 */
832
833 /* Transfer scores from static built-in storage */
834 for (x = 0; x < S->Kp; x++)
835 for (y = 0; y < S->Kp; y++)
836 S->s[x][y] = ESL_SCOREMATRIX_AA_PRELOADS[which].matrix[x][y];
837
838 }
839 else if (S->abc_r->type == eslDNA || S->abc_r->type == eslRNA)
840 {
841 int nmat = sizeof(ESL_SCOREMATRIX_NT_PRELOADS) / sizeof(struct esl_scorematrix_nt_preload_s);
842 for (which = 0; which < nmat; which++)
843 if (strcmp(ESL_SCOREMATRIX_NT_PRELOADS[which].name, name) == 0) break;
844 if (which >= nmat) return eslENOTFOUND;
845
846 ESL_DASSERT1(( S->Kp >= 15 )); // strcpy below is safe. The assertion tries to convince static analyzer of that.
847 strcpy(S->outorder, "ACGTRYMKSWHBVDN");
848
849 /* Transfer scores from static built-in storage */
850 for (x = 0; x < S->Kp; x++)
851 for (y = 0; y < S->Kp; y++)
852 S->s[x][y] = ESL_SCOREMATRIX_NT_PRELOADS[which].matrix[x][y];
853
854 }
855 else return eslENOTFOUND; /* no DNA matrices are built in yet! */
856
857
858 /* Use <outorder> list to set <isval[x]> */
859 S->nc = strlen(S->outorder);
860 for (y = 0; y < S->nc; y++) {
861 x = esl_abc_DigitizeSymbol(S->abc_r, S->outorder[y]);
862 S->isval[x] = TRUE;
863 }
864
865 /* Copy the name */
866 if (esl_strdup(name, -1, &(S->name)) != eslOK) return eslEMEM;
867 return eslOK;
868 }
869
870
871 /* Function: esl_scorematrix_SetIdentity()
872 * Synopsis: Set matrix to +1 match, 0 mismatch.
873 *
874 * Purpose: Sets score matrix <S> to be +1 for a match,
875 * 0 for a mismatch. <S> may be for any alphabet.
876 *
877 * Rarely useful in real use, but may be useful to create
878 * simple examples (including debugging).
879 *
880 * Returns: <eslOK> on success, and the scores in <S> are set.
881 */
882 int
esl_scorematrix_SetIdentity(ESL_SCOREMATRIX * S)883 esl_scorematrix_SetIdentity(ESL_SCOREMATRIX *S)
884 {
885 int a;
886 int x;
887
888 for (a = 0; a < S->abc_r->Kp*S->abc_r->Kp; a++) S->s[0][a] = 0;
889 for (a = 0; a < S->K; a++) S->s[a][a] = 1;
890
891 for (x = 0; x < S->K; x++) S->isval[x] = TRUE;
892 for (x = S->abc_r->K; x < S->Kp; x++) S->isval[x] = FALSE;
893
894 strncpy(S->outorder, S->abc_r->sym, S->K);
895 S->outorder[S->K] = '\0';
896 S->nc = S->K;
897 return eslOK;
898 }
899 /*---------------- end, some classic score matrices --------*/
900
901
902 /*****************************************************************
903 *# 3. Deriving a score matrix probabilistically.
904 *****************************************************************/
905
906 /* Function: esl_scorematrix_SetFromProbs()
907 * Synopsis: Set matrix from target and background probabilities.
908 *
909 * Purpose: Sets the scores in a new score matrix <S> from target joint
910 * probabilities in <P>, query background probabilities <fi>, and
911 * target background probabilities <fj>, with scale factor <lambda>:
912 * $s_{ij} = \frac{1}{\lambda} \frac{p_{ij}}{f_i f_j}$.
913 *
914 * Size of everything must match the canonical alphabet
915 * size in <S>. That is, <S->abc->K> is the canonical
916 * alphabet size of <S>; <P> must contain $K times K$
917 * probabilities $P_{ij}$, and <fi>,<fj> must be vectors of
918 * K probabilities. All probabilities must be nonzero.
919 *
920 * Args: S - score matrix to set scores in
921 * lambda - scale factor
922 * P - matrix of joint probabilities P_ij (KxK)
923 * fi - query background probabilities (0..K-1)
924 * fj - target background probabilities
925 *
926 * Returns: <eslOK> on success, and <S> contains the calculated score matrix.
927 */
928 int
esl_scorematrix_SetFromProbs(ESL_SCOREMATRIX * S,double lambda,const ESL_DMATRIX * P,const double * fi,const double * fj)929 esl_scorematrix_SetFromProbs(ESL_SCOREMATRIX *S, double lambda, const ESL_DMATRIX *P, const double *fi, const double *fj)
930 {
931 int i,j;
932 double sc;
933
934 for (i = 0; i < S->abc_r->K; i++)
935 for (j = 0; j < S->abc_r->K; j++)
936 {
937 sc = log(P->mx[i][j] / (fi[i] * fj[j])) / lambda;
938 S->s[i][j] = (int) (sc + (sc>0 ? 0.5 : -0.5)); /* that's rounding to the nearest integer */
939 }
940
941 for (i = 0; i < S->abc_r->K; i++)
942 S->isval[i] = TRUE;
943 S->nc = S->abc_r->K;
944
945 strncpy(S->outorder, S->abc_r->sym, S->abc_r->K);
946 S->outorder[S->nc] = '\0';
947 return eslOK;
948 }
949
950
951 /* Function: esl_scorematrix_SetWAG()
952 * Synopsis: Set matrix using the WAG evolutionary model.
953 *
954 * Purpose: Parameterize an amino acid score matrix <S> using the WAG
955 * rate matrix \citep{WhelanGoldman01} as the underlying
956 * evolutionary model, at a distance of <t>
957 * substitutions/site, with scale factor <lambda>.
958 *
959 * Args: S - score matrix to set parameters in. Must be created for
960 * an amino acid alphabet.
961 * lambda - scale factor for scores
962 * t - distance to exponentiate WAG to, in substitutions/site
963 *
964 * Returns: <eslOK> on success, and the 20x20 residue scores in <S> are set.
965 *
966 * Throws: <eslEINVAL> if <S> isn't an allocated amino acid score matrix.
967 * <eslEMEM> on allocation failure.
968 */
969 int
esl_scorematrix_SetWAG(ESL_SCOREMATRIX * S,double lambda,double t)970 esl_scorematrix_SetWAG(ESL_SCOREMATRIX *S, double lambda, double t)
971 {
972 ESL_DMATRIX *Q = NULL;
973 ESL_DMATRIX *P = NULL;
974 static double wagpi[20];
975 int i,j;
976 int status;
977
978 if (S->K != 20) ESL_EXCEPTION(eslEINVAL, "Must be using an amino acid alphabet (K=20) to make WAG-based matrices");
979
980 if (( Q = esl_dmatrix_Create(20, 20)) == NULL) { status = eslEMEM; goto ERROR; }
981 if (( P = esl_dmatrix_Create(20, 20)) == NULL) { status = eslEMEM; goto ERROR; }
982 if ((status = esl_composition_WAG(wagpi)) != eslOK) goto ERROR;
983 if ((status = esl_rmx_SetWAG(Q, wagpi)) != eslOK) goto ERROR;
984 if ((status = esl_dmx_Exp(Q, t, P)) != eslOK) goto ERROR;
985
986 for (i = 0; i < 20; i++)
987 for (j = 0; j < 20; j++)
988 P->mx[i][j] *= wagpi[i]; /* P_ij = P(j|i) pi_i */
989
990 esl_scorematrix_SetFromProbs(S, lambda, P, wagpi, wagpi);
991
992 if ((status = esl_strdup("WAG", -1, &(S->name))) != eslOK) goto ERROR;
993
994 esl_dmatrix_Destroy(Q);
995 esl_dmatrix_Destroy(P);
996 return eslOK;
997
998 ERROR:
999 if (Q != NULL) esl_dmatrix_Destroy(Q);
1000 if (Q != NULL) esl_dmatrix_Destroy(P);
1001 return status;
1002 }
1003 /*--------------- end, deriving score matrices ------------------*/
1004
1005
1006
1007 /*****************************************************************
1008 *# 4. Reading/writing matrices from/to files
1009 *****************************************************************/
1010
1011 /* Function: esl_scorematrix_Read()
1012 * Synopsis: Read a standard matrix input file.
1013 *
1014 * Purpose: Given a pointer <efp> to an open file parser for a file
1015 * containing a score matrix (such as a PAM or BLOSUM
1016 * matrix), parse the file and create a new score matrix
1017 * object. The scores are expected to be for the alphabet
1018 * <abc>.
1019 *
1020 * The score matrix file is in the format that BLAST or
1021 * FASTA use. The first line is a header contains N
1022 * single-letter codes for the residues. Each of N
1023 * subsequent rows optionally contains a residue row label
1024 * (in the same order as the columns), followed by N
1025 * residue scores. (Older matrix files do not contain the
1026 * leading row label; newer ones do.) The residues may
1027 * appear in any order. They must minimally include the
1028 * canonical K residues (K=4 for DNA, K=20 for protein),
1029 * and may also contain none, some, or all degeneracy
1030 * codes. Any other residue code that is not in the Easel
1031 * digital alphabet (including, in particular, the '*' code
1032 * for a stop codon) is ignored by the parser.
1033 *
1034 * Returns: <eslOK> on success, and <ret_S> points to a newly allocated
1035 * score matrix.
1036 *
1037 * Returns <eslEFORMAT> on parsing error; in which case, <ret_S> is
1038 * returned <NULL>, and <efp->errbuf> contains an informative
1039 * error message.
1040 *
1041 * Throws: <eslEMEM> on allocation error.
1042 */
1043 int
esl_scorematrix_Read(ESL_FILEPARSER * efp,const ESL_ALPHABET * abc,ESL_SCOREMATRIX ** ret_S)1044 esl_scorematrix_Read(ESL_FILEPARSER *efp, const ESL_ALPHABET *abc, ESL_SCOREMATRIX **ret_S)
1045 {
1046 int status;
1047 ESL_SCOREMATRIX *S = NULL;
1048 int *map = NULL; /* maps col/row index to digital alphabet x */
1049 char *tok;
1050 int toklen;
1051 int c, x;
1052 int row,col;
1053
1054 /* Allocate the matrix
1055 */
1056 if ((S = esl_scorematrix_Create(abc)) == NULL) { status = eslEMEM; goto ERROR; }
1057
1058 /* Make sure we've got the comment character set properly in the fileparser.
1059 * Score matrices use #.
1060 */
1061 esl_fileparser_SetCommentChar(efp, '#');
1062
1063 /* Look for the first non-blank, non-comment line in the file. That line
1064 * gives us the single-letter codes in the order that the file's using.
1065 */
1066 if ((status = esl_fileparser_NextLine(efp)) != eslOK) ESL_XFAIL(eslEFORMAT, efp->errbuf, "file appears to be empty");
1067
1068 /* Read the characters: count them and store them in order in label[0..nc-1].
1069 * nc cannot exceed Kp+1 in our expected alphabet (+1, for the stop character *)
1070 */
1071 S->nc = 0;
1072 while ((status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen)) == eslOK)
1073 {
1074 if (S->nc >= abc->Kp) ESL_XFAIL(eslEFORMAT, efp->errbuf, "Header contains more residues than expected for alphabet");
1075 if (toklen != 1) ESL_XFAIL(eslEFORMAT, efp->errbuf, "Header can only contain single-char labels; %s is invalid", tok);
1076 S->outorder[S->nc++] = *tok;
1077 }
1078 if (status != eslEOL) ESL_XFAIL(status, efp->errbuf, "Unexpected failure of esl_fileparser_GetTokenOnLine()");
1079 S->outorder[S->nc] = '\0'; /* NUL terminate */
1080
1081 /* Verify that these labels for the score matrix seem plausible, given our alphabet.
1082 * This sets S->isval array: which residues we have scores for.
1083 * It also sets the map[] array, which maps coord in label[] to x in alphabet.
1084 */
1085 ESL_ALLOC(map, sizeof(int) * S->nc);
1086 for (c = 0; c < S->nc; c++)
1087 {
1088 if (esl_abc_CIsValid(abc, S->outorder[c]))
1089 {
1090 x = esl_abc_DigitizeSymbol(abc, S->outorder[c]);
1091 map[c] = x;
1092 S->isval[x] = TRUE;
1093 }
1094 else
1095 ESL_XFAIL(eslEFORMAT, efp->errbuf, "Don't know how to deal with residue %c in matrix file", S->outorder[c]);
1096 }
1097 for (x = 0; x < abc->K; x++)
1098 if (! S->isval[x]) ESL_XFAIL(eslEFORMAT, efp->errbuf, "Expected to see a column for residue %c", abc->sym[x]);
1099
1100
1101 /* Read nc rows, one at a time;
1102 * on each row, read nc+1 or nc tokens, of which nc are scores (may lead with a label or not)
1103 */
1104 for (row = 0; row < S->nc; row++)
1105 {
1106 if ((status = esl_fileparser_NextLine(efp)) != eslOK) ESL_XFAIL(eslEFORMAT, efp->errbuf, "Unexpectedly ran out of lines in file");
1107 for (col = 0; col < S->nc; col++)
1108 {
1109 if ((status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen)) != eslOK) ESL_XFAIL(eslEFORMAT, efp->errbuf, "Unexpectedly ran out of fields on line");
1110 if (col == 0 && *tok == S->outorder[row]) { col--; continue; } /* skip leading label */
1111
1112 S->s[map[row]][map[col]] = atoi(tok);
1113 }
1114 if ((status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen)) != eslEOL) ESL_XFAIL(eslEFORMAT, efp->errbuf, "Too many fields on line");
1115 }
1116 if ((status = esl_fileparser_NextLine(efp)) != eslEOF) ESL_XFAIL(eslEFORMAT, efp->errbuf, "Too many lines in file. (Make sure it's square & symmetric. E.g. use NUC.4.4 not NUC.4.2)");
1117
1118
1119 /* Annotate the score matrix */
1120 if ((status = esl_strdup (efp->filename, -1, &(S->path))) != eslOK) goto ERROR;
1121 if ((status = esl_FileTail(efp->filename, FALSE, &(S->name))) != eslOK) goto ERROR;
1122
1123 free(map);
1124 *ret_S = S;
1125 return eslOK;
1126
1127 ERROR:
1128 esl_scorematrix_Destroy(S);
1129 if (map != NULL) free(map);
1130 *ret_S = NULL;
1131 return status;
1132 }
1133
1134 /* Function: esl_scorematrix_Write()
1135 * Synopsis: Write a BLAST-compatible score matrix file.
1136 *
1137 * Purpose: Writes a score matrix <S> to an open stream <fp>, in
1138 * format compatible with BLAST, FASTA, and other common
1139 * sequence alignment software.
1140 *
1141 * Returns: <eslOK> on success.
1142 *
1143 * Throws: <eslEWRITE> on any system write error, such as filled disk.
1144 */
1145 int
esl_scorematrix_Write(FILE * fp,const ESL_SCOREMATRIX * S)1146 esl_scorematrix_Write(FILE *fp, const ESL_SCOREMATRIX *S)
1147 {
1148 int a,b;
1149 int x,y;
1150 int nc = S->nc;
1151
1152 /* The header line, with column labels for residues */
1153 if (fprintf(fp, " ") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "score matrix write failed");
1154 for (a = 0; a < nc; a++)
1155 { if (fprintf(fp, " %c ", S->outorder[a]) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "score matrix write failed"); }
1156 if (fprintf(fp, "\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "score matrix write failed");
1157
1158 /* The data */
1159 for (a = 0; a < nc; a++)
1160 {
1161 if (fprintf(fp, "%c ", S->outorder[a]) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "score matrix write failed");
1162 for (b = 0; b < nc; b++)
1163 {
1164 x = esl_abc_DigitizeSymbol(S->abc_r, S->outorder[a]);
1165 y = esl_abc_DigitizeSymbol(S->abc_r, S->outorder[b]);
1166 if (fprintf(fp, "%3d ", S->s[x][y]) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "score matrix write failed");
1167 }
1168 if (fprintf(fp, "\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "score matrix write failed");
1169 }
1170 return eslOK;
1171 }
1172 /*-------------- end, reading/writing matrices ------------------*/
1173
1174
1175
1176 /*****************************************************************
1177 *# 5. Implicit probabilistic basis, I: given bg.
1178 *****************************************************************/
1179
1180 static int set_degenerate_probs(const ESL_ALPHABET *abc, ESL_DMATRIX *P, double *fi, double *fj);
1181
1182 struct lambda_params {
1183 const double *fi;
1184 const double *fj;
1185 const ESL_SCOREMATRIX *S;
1186 };
1187
1188 static int
lambda_fdf(double lambda,void * params,double * ret_fx,double * ret_dfx)1189 lambda_fdf(double lambda, void *params, double *ret_fx, double *ret_dfx)
1190 {
1191 struct lambda_params *p = (struct lambda_params *) params;
1192 int i,j;
1193 double tmp;
1194
1195 *ret_fx = 0.;
1196 *ret_dfx = 0.;
1197 for (i = 0; i < p->S->K; i++)
1198 for (j = 0; j < p->S->K; j++)
1199 {
1200 tmp = p->fi[i] * p->fj[j] * exp(lambda * (double) p->S->s[i][j]);
1201 *ret_fx += tmp;
1202 *ret_dfx += tmp * (double) p->S->s[i][j];
1203 }
1204 *ret_fx -= 1.0;
1205 return eslOK;
1206 }
1207
1208 /* Function: esl_scorematrix_ProbifyGivenBG()
1209 * Synopsis: Obtain $P_{ij}$ for matrix with known $\lambda$ and background.
1210 *
1211 * Purpose: Given a score matrix <S> and known query and target
1212 * background frequencies <fi> and <fj> respectively, calculate scale
1213 * <lambda> and implicit target probabilities \citep{Altschul01}.
1214 * Optionally returns either (or both) in <opt_lambda> and <opt_P>.
1215 *
1216 * The implicit target probabilities are returned in a
1217 * newly allocated $Kp \times Kp$ <ESL_DMATRIX>, over both
1218 * the canonical (typically K=4 or K=20) residues in the
1219 * residue alphabet, and the degenerate residue codes.
1220 * Values involving degenerate residue codes are marginal
1221 * probabilities (i.e. summed over the degeneracy).
1222 * Only actual residue degeneracy can have nonzero values
1223 * for <p_ij>; by convention, all values involving the
1224 * special codes for gap, nonresidue, and missing data
1225 * (<K>, <Kp-2>, <Kp-1>) are 0.
1226 *
1227 * If the caller wishes to convert this joint probability
1228 * matrix to conditionals, it can take advantage of the
1229 * fact that the degenerate probability <P(X,j)> is our
1230 * marginalized <pj>, and <P(i,X)> is <pi>.
1231 * i.e., <P(j|i) = P(i,j) / P(i) = P(i,j) / P(X,j)>.
1232 * Those X values are <P->mx[i][esl_abc_GetUnknown(abc)]>,
1233 * <P->mx[esl_abc_GetUnknown(abc)][j]>; equivalently, just use
1234 * code <Kp-3> for X.
1235 *
1236 * By convention, i is always the query sequence, and j is
1237 * always the target. We do not assume symmetry in the
1238 * scoring system, though that is usually the case.
1239 *
1240 * Args: S - score matrix
1241 * fi - background frequencies for query sequence i
1242 * fj - background frequencies for target sequence j
1243 * opt_lambda - optRETURN: calculated $\lambda$ parameter
1244 * opt_P - optRETURN: implicit target probabilities $p_{ij}$; a KxK DMATRIX.
1245 *
1246 * Returns: <eslOK> on success, <*ret_lambda> contains the
1247 * calculated $\lambda$ parameter, and <*ret_P> points to
1248 * the target probability matrix (which is allocated here,
1249 * and must be free'd by caller with <esl_dmatrix_Destroy(*ret_P)>.
1250 *
1251 * Throws: <eslEMEM> on allocation error;
1252 * <eslEINVAL> if matrix is invalid and has no solution for $\lambda$;
1253 * <eslENOHALT> if the solver fails to find $\lambda$.
1254 * In these cases, <*ret_lambda> is 0.0, and <*ret_P> is <NULL>.
1255 */
1256 int
esl_scorematrix_ProbifyGivenBG(const ESL_SCOREMATRIX * S,const double * fi,const double * fj,double * opt_lambda,ESL_DMATRIX ** opt_P)1257 esl_scorematrix_ProbifyGivenBG(const ESL_SCOREMATRIX *S, const double *fi, const double *fj,
1258 double *opt_lambda, ESL_DMATRIX **opt_P)
1259 {
1260 ESL_ROOTFINDER *R = NULL;
1261 ESL_DMATRIX *P = NULL;
1262 struct lambda_params p;
1263 double lambda_guess;
1264 double lambda;
1265 int i,j;
1266 double fx, dfx;
1267 int status;
1268
1269 /* First, solve for lambda by rootfinding. */
1270 /* Set up the data passed to the lambda_fdf function. */
1271 p.fi = fi;
1272 p.fj = fj;
1273 p.S = S;
1274
1275 /* Bracket the root.
1276 * It's important that we come at the root from the far side, where
1277 * f(lambda) is positive; else we may identify the root we don't want
1278 * at lambda=0.
1279 */
1280 fx = -1.0;
1281 lambda_guess = 1. / (double) esl_scorematrix_Max(S);
1282 for (; lambda_guess < 50.; lambda_guess *= 2.0) {
1283 lambda_fdf(lambda_guess, &p, &fx, &dfx);
1284 if (fx > 0) break;
1285 }
1286 if (fx <= 0) ESL_XEXCEPTION(eslEINVAL, "Failed to bracket root for solving lambda");
1287
1288 /* Create a solver and find lambda by Newton/Raphson */
1289 if (( R = esl_rootfinder_CreateFDF(lambda_fdf, &p) ) == NULL) { status = eslEMEM; goto ERROR; }
1290 if (( status = esl_root_NewtonRaphson(R, lambda_guess, &lambda)) != eslOK) goto ERROR;
1291
1292 /* Now, given solution for lambda, calculate P */
1293 if (opt_P != NULL)
1294 {
1295 if ((P = esl_dmatrix_Create(S->Kp, S->Kp)) == NULL) { status = eslEMEM; goto ERROR; }
1296 for (i = 0; i < S->K; i++)
1297 for (j = 0; j < S->K; j++)
1298 P->mx[i][j] = fi[i] * fj[j] * exp(lambda * (double) S->s[i][j]);
1299 set_degenerate_probs(S->abc_r, P, NULL, NULL);
1300 }
1301
1302 esl_rootfinder_Destroy(R);
1303 if (opt_lambda) *opt_lambda = lambda;
1304 if (opt_P) *opt_P = P;
1305 return eslOK;
1306
1307 ERROR:
1308 if (R) esl_rootfinder_Destroy(R);
1309 if (opt_lambda) *opt_lambda = 0.;
1310 if (opt_P) *opt_P = NULL;
1311 return status;
1312 }
1313
1314
1315 /* set_degenerate_probs()
1316 *
1317 * Used by both esl_scorematrix_Probify() and
1318 * esl_scorematrix_ProbifyGivenBG() to set degenerate residue
1319 * probabilities once probs for canonical residues are known.
1320 *
1321 * Input: P->mx[i][j] are joint probabilities p_ij for the canonical
1322 * alphabet 0..abc->K-1, but P matrix is allocated for Kp X Kp.
1323 *
1324 * Calculate marginal sums for all i,j pairs involving degeneracy
1325 * codes. Fill in [i][j'=K..Kp-1], [i'=K..Kp-1][j], and
1326 * [i'=K..Kp-1][j'=K..Kp-1] for degeneracies i',j'. Any p_ij involving
1327 * a gap (K), nonresidue (Kp-2), or missing data (Kp-1) character is
1328 * set to 0.0 by convention.
1329 *
1330 * Don't assume symmetry.
1331 *
1332 * If <fi> or <fj> background probability vectors are non-<NULL>, set
1333 * them too. (Corresponding to the assumption of background =
1334 * marginal probs, rather than background being given.) This takes
1335 * advantage of the fact that P(X,i) is already the marginalized p_i,
1336 * and P(j,X) is p_j.
1337 */
1338 static int
set_degenerate_probs(const ESL_ALPHABET * abc,ESL_DMATRIX * P,double * fi,double * fj)1339 set_degenerate_probs(const ESL_ALPHABET *abc, ESL_DMATRIX *P, double *fi, double *fj)
1340 {
1341 int i,j; /* indices into canonical codes */
1342 int ip,jp; /* indices into degenerate codes */
1343
1344 /* sum to get [i=0..K] canonicals to [jp=K+1..Kp-3] degeneracies;
1345 * and [jp=K,Kp-2,Kp-1] set to 0.0
1346 */
1347 for (i = 0; i < abc->K; i++)
1348 {
1349 P->mx[i][abc->K] = 0.0;
1350 for (jp = abc->K+1; jp < abc->Kp-2; jp++)
1351 {
1352 P->mx[i][jp] = 0.0;
1353 for (j = 0; j < abc->K; j++)
1354 if (abc->degen[jp][j]) P->mx[i][jp] += P->mx[i][j];
1355 }
1356 P->mx[i][abc->Kp-2] = 0.0;
1357 P->mx[i][abc->Kp-1] = 0.0;
1358 }
1359
1360 esl_vec_DSet(P->mx[abc->K], abc->Kp, 0.0); /* gap row: all 0.0 by convention */
1361
1362 /* [ip][all] */
1363 for (ip = abc->K+1; ip < abc->Kp-2; ip++)
1364 {
1365 /* [ip][j]: degenerate i, canonical j */
1366 for (j = 0; j < abc->K; j++)
1367 {
1368 P->mx[ip][j] = 0.0;
1369 for (i = 0; i < abc->K; i++)
1370 if (abc->degen[ip][i]) P->mx[ip][j] += P->mx[i][j];
1371 }
1372 P->mx[ip][abc->K] = 0.0;
1373
1374 /* [ip][jp]: both positions degenerate */
1375 for (jp = abc->K+1; jp < abc->Kp-2; jp++)
1376 {
1377 P->mx[ip][jp] = 0.0;
1378 for (j = 0; j < abc->K; j++)
1379 if (abc->degen[jp][j]) P->mx[ip][jp] += P->mx[ip][j];
1380 }
1381 P->mx[ip][abc->Kp-2] = 0.0;
1382 P->mx[ip][abc->Kp-1] = 0.0;
1383 }
1384
1385 esl_vec_DSet(P->mx[abc->Kp-2], abc->Kp, 0.0); /* nonresidue data * row, all 0.0 */
1386 esl_vec_DSet(P->mx[abc->Kp-1], abc->Kp, 0.0); /* missing data ~ row, all 0.0 */
1387
1388 if (fi != NULL) { /* fi[i'] = p(i',X) */
1389 fi[abc->K] = 0.0;
1390 for (ip = abc->K+1; ip < abc->Kp-2; ip++) fi[ip] = P->mx[ip][abc->Kp-3];
1391 fi[abc->Kp-2] = 0.0;
1392 fi[abc->Kp-1] = 0.0;
1393 }
1394
1395 if (fj != NULL) { /* fj[j'] = p(X,j')*/
1396 fj[abc->K] = 0.0;
1397 for (jp = abc->K+1; jp < abc->Kp-2; jp++) fj[jp] = P->mx[abc->Kp-3][jp];
1398 fj[abc->Kp-2] = 0.0;
1399 fj[abc->Kp-1] = 0.0;
1400 }
1401
1402 return eslOK;
1403 }
1404 /*------------- end, implicit prob basis, bg known --------------*/
1405
1406
1407 /*****************************************************************
1408 *# 6. Implicit probabilistic basis, II: bg unknown
1409 *****************************************************************/
1410
1411 /* This section implements one of the key ideas in Yu and Altschul,
1412 * PNAS 100:15688, 2003 [YuAltschul03], and Yu and Altschul,
1413 * Bioinformatics 21:902-911, 2005 [YuAltschul05]:
1414 *
1415 * Given a valid score matrix, calculate its probabilistic
1416 * basis (P_ij, f_i, f_j, and lambda), on the assumption that
1417 * the background probabilities are the marginals of P_ij.
1418 *
1419 * However, this procedure appears to be unreliable.
1420 * There are often numerous invalid solutions with negative
1421 * probabilities, and the Yu/Altschul Y function (that we've solving
1422 * for its root) is often discontinuous. Although Yu and Altschul say
1423 * they can just keep searching for solutions until a valid one is
1424 * found, and "this procedure presents no difficulties in practice", I
1425 * don't see how.
1426 *
1427 * For example, run the procedure on PAM190 and PAM200. For PAM190
1428 * you will obtain a valid solution with lambda = 0.2301. For PAM200
1429 * you will obtain an *invalid* solution with lambda = 0.2321, and
1430 * negative probabilities f_{ENT} (and all p_ij involving ENT and
1431 * the other 17 aa). There is a discontinuity in the function, but
1432 * it's not near these lambdas, it's at about lambda=0.040, so it's
1433 * not that we fell into a discontinuity: the bisection procedure on
1434 * lambda is working smoothly. And if you calculate a score matrix again
1435 * from the invalid PAM200 solution, you get PAM200 back, so it's not
1436 * that there's an obvious bug -- we do obtain a "solution" to PAM200,
1437 * just not one with positive probabilities. It's not obvious how
1438 * we could find a different solution to PAM200 than the invalid one!
1439 *
1440 * What we're going to do [xref J7/126, Apr 2011] is to deprecate
1441 * the Yu/Altschul procedure altogether.
1442 */
1443 struct yualtschul_params {
1444 ESL_DMATRIX *S; /* pointer to the KxK score matrix w/ values cast to doubles */
1445 ESL_DMATRIX *M; /* not a param per se: alloc'ed storage for M matrix provided to the objective function */
1446 ESL_DMATRIX *Y; /* likewise, alloc'ed storage for Y (M^-1) matrix provided to obj function */
1447 };
1448
1449 /* yualtschul_scorematrix_validate
1450 * See start of section 3, p. 903, YuAltschul05
1451 * (Implementation could be more efficient here; don't really have
1452 * to sweep the entire matrix twice to do this.)
1453 */
1454 static int
yualtschul_scorematrix_validate(const ESL_SCOREMATRIX * S)1455 yualtschul_scorematrix_validate(const ESL_SCOREMATRIX *S)
1456 {
1457 int i, j;
1458 int has_neg, has_pos;
1459
1460 /* each row must have at least one positive and one negative score */
1461 for (i = 0; i < S->K; i++)
1462 {
1463 has_neg = has_pos = FALSE;
1464 for (j = 0; j < S->K; j++)
1465 {
1466 if (S->s[i][j] > 0) has_pos = TRUE;
1467 if (S->s[i][j] < 0) has_neg = TRUE;
1468 }
1469 if (! has_pos || ! has_neg) return eslFAIL;
1470 }
1471
1472 /* ditto for columns */
1473 for (j = 0; j < S->K; j++)
1474 {
1475 has_neg = has_pos = FALSE;
1476 for (i = 0; i < S->K; i++)
1477 {
1478 if (S->s[i][j] > 0) has_pos = TRUE;
1479 if (S->s[i][j] < 0) has_neg = TRUE;
1480 }
1481 if (! has_pos || ! has_neg) return eslFAIL;
1482 }
1483
1484 return eslOK;
1485 }
1486
1487 /* upper bound bracketing lambda solution: eqn (12) in [YuAltschul05] */
1488 static double
yualtschul_upper_bound(const ESL_DMATRIX * Sd)1489 yualtschul_upper_bound(const ESL_DMATRIX *Sd)
1490 {
1491 int i;
1492 double minimax;
1493 double maxlambda;
1494
1495 /* minimax = c in YuAltschul05 p.903 = smallest of the max scores in each row/col */
1496 minimax = esl_vec_DMax(Sd->mx[0], Sd->n);
1497 for (i = 1; i < Sd->n; i++)
1498 minimax = ESL_MIN(minimax, esl_vec_DMax(Sd->mx[i], Sd->n));
1499
1500 maxlambda = log((double) Sd->n) / minimax; /* eqn (12), YuAltschul05 */
1501 return maxlambda;
1502 }
1503
1504 static int
yualtschul_solution_validate(const ESL_DMATRIX * P,const double * fi,const double * fj)1505 yualtschul_solution_validate(const ESL_DMATRIX *P, const double *fi, const double *fj)
1506 {
1507
1508 if ( esl_dmx_Min(P) < 0.0) return eslFAIL;
1509 if ( esl_vec_DMin(fi, P->n) < 0.0) return eslFAIL;
1510 if ( esl_vec_DMin(fj, P->n) < 0.0) return eslFAIL;
1511
1512 return eslOK;
1513 }
1514
1515 /* yualtschul_func()
1516 *
1517 * This is the objective function we try to find a root of.
1518 * Its prototype is dictated by the esl_rootfinder API.
1519 */
1520 static int
yualtschul_func(double lambda,void * params,double * ret_fx)1521 yualtschul_func(double lambda, void *params, double *ret_fx)
1522 {
1523 int status;
1524 struct yualtschul_params *p = (struct yualtschul_params *) params;
1525 ESL_DMATRIX *S = p->S;
1526 ESL_DMATRIX *M = p->M;
1527 ESL_DMATRIX *Y = p->Y;
1528 int i,j;
1529
1530 /* the M matrix has entries M_ij = e^{lambda * s_ij} */
1531 for (i = 0; i < S->n; i++)
1532 for (j = 0; j < S->n; j++)
1533 M->mx[i][j] = exp(lambda * S->mx[i][j]);
1534
1535 /* the Y matrix is the inverse of M */
1536 if ((status = esl_dmx_Invert(M, Y)) != eslOK) goto ERROR;
1537
1538 /* We're trying to find the root of \sum_ij Y_ij - 1 = 0 */
1539 *ret_fx = esl_dmx_Sum(Y) - 1.;
1540 return eslOK;
1541
1542 ERROR:
1543 *ret_fx = 0.;
1544 return status;
1545 }
1546
1547 /* yualtschul_engine()
1548 *
1549 * This function backcalculates the probabilistic basis for a score
1550 * matrix S, when S is a double-precision matrix. Providing this
1551 * as a separate "engine" and writing esl_scorematrix_Probify()
1552 * as a wrapper around it allows us to separately test inaccuracy
1553 * due to numerical performance of our linear algebra, versus
1554 * inaccuracy due to integer roundoff in integer scoring matrices.
1555 *
1556 * It is not uncommon for this to fail when S is derived from
1557 * integer scores. Because the scores may have been provided by the
1558 * user, and this may be our first chance to detect the "user error"
1559 * of an invalid matrix, this engine returns <eslEINVAL> as a normal error
1560 * if it can't reach a valid solution.
1561 */
1562 static int
yualtschul_engine(ESL_DMATRIX * S,ESL_DMATRIX * P,double * fi,double * fj,double * ret_lambda)1563 yualtschul_engine(ESL_DMATRIX *S, ESL_DMATRIX *P, double *fi, double *fj, double *ret_lambda)
1564 {
1565 int status;
1566 ESL_ROOTFINDER *R = NULL;
1567 struct yualtschul_params p;
1568 double lambda;
1569 double xl, xr;
1570 double fx = -1.0;
1571 int i,j;
1572
1573 /* Set up a bisection method to find lambda */
1574 p.S = S;
1575 p.M = p.Y = NULL;
1576 if ((p.M = esl_dmatrix_Create(S->n, S->n)) == NULL) { status = eslEMEM; goto ERROR; }
1577 if ((p.Y = esl_dmatrix_Create(S->n, S->n)) == NULL) { status = eslEMEM; goto ERROR; }
1578 if ((R = esl_rootfinder_Create(yualtschul_func, &p)) == NULL) { status = eslEMEM; goto ERROR; }
1579
1580 /* Identify suitable brackets on lambda. */
1581 xr = yualtschul_upper_bound(S);
1582
1583 for (xl = xr; xl > 1e-10; xl /= 1.6) {
1584 if ((status = yualtschul_func(xl, &p, &fx)) != eslOK) goto ERROR;
1585 if (fx > 0.) break;
1586 }
1587 if (fx <= 0.) { status = eslEINVAL; goto ERROR; }
1588
1589 for (; xr < 100.; xr *= 1.6) {
1590 if ((status = yualtschul_func(xr, &p, &fx)) != eslOK) goto ERROR;
1591 if (fx < 0.) break;
1592 }
1593 if (fx >= 0.) { status = eslEINVAL; goto ERROR; }
1594
1595 /* Find lambda by bisection */
1596 if (( status = esl_root_Bisection(R, xl, xr, &lambda)) != eslOK) goto ERROR;
1597
1598 /* Find fi, fj from Y: fi are column sums, fj are row sums */
1599 for (i = 0; i < S->n; i++) {
1600 fi[i] = 0.;
1601 for (j = 0; j < S->n; j++) fi[i] += p.Y->mx[j][i];
1602 }
1603 for (j = 0; j < S->n; j++) {
1604 fj[j] = 0.;
1605 for (i = 0; i < S->n; i++) fj[j] += p.Y->mx[j][i];
1606 }
1607
1608 /* Find p_ij */
1609 for (i = 0; i < S->n; i++)
1610 for (j = 0; j < S->n; j++)
1611 P->mx[i][j] = fi[i] * fj[j] * p.M->mx[i][j];
1612
1613 *ret_lambda = lambda;
1614 esl_dmatrix_Destroy(p.M);
1615 esl_dmatrix_Destroy(p.Y);
1616 esl_rootfinder_Destroy(R);
1617 return eslOK;
1618
1619 ERROR:
1620 if (p.M) esl_dmatrix_Destroy(p.M);
1621 if (p.Y) esl_dmatrix_Destroy(p.Y);
1622 if (R) esl_rootfinder_Destroy(R);
1623 return status;
1624 }
1625
1626
1627 /* Function: esl_scorematrix_Probify()
1628 * Synopsis: Calculate the probabilistic basis of a score matrix.
1629 *
1630 * Purpose: Reverse engineering of a score matrix: given a "valid"
1631 * substitution matrix <S>, obtain implied joint
1632 * probabilities $p_{ij}$, query composition $f_i$, target
1633 * composition $f_j$, and scale $\lambda$, by assuming that
1634 * $f_i$ and $f_j$ are the appropriate marginals of $p_{ij}$.
1635 * Optionally return any or all of these solutions in
1636 * <*opt_P>, <*opt_fi>, <*opt_fj>, and <*opt_lambda>.
1637 *
1638 * The calculation is run only on canonical residue scores
1639 * $0..K-1$ in S, to calculate joint probabilities for all
1640 * canonical residues. Joint and background probabilities
1641 * involving degenerate residues are then calculated by
1642 * appropriate marginalizations. See notes on
1643 * <esl_scorematrix_ProbifyGivenBG()> about how probabilities
1644 * involving degeneracy codes are calculated.
1645 *
1646 * This implements an algorithm described in
1647 * \citep{YuAltschul03} and \citep{YuAltschul05}.
1648 *
1649 * Although this procedure may succeed in many cases,
1650 * it is unreliable and should be used with great caution.
1651 * Yu and Altschul note that it can find invalid solutions
1652 * (negative probabilities), and although they say that one
1653 * can keep searching until a valid solution is found,
1654 * one can produce examples where this does not seem to be
1655 * the case. The caller MUST check return status, and
1656 * MUST expect <eslENORESULT>.
1657 *
1658 * Args: S - score matrix
1659 * opt_P - optRETURN: Kp X Kp matrix of implied target probs $p_{ij}$
1660 * opt_fi - optRETURN: vector of Kp $f_i$ background probs, 0..Kp-1
1661 * opt_fj - optRETURN: vector of Kp $f_j$ background probs, 0..Kp-1
1662 * opt_lambda - optRETURN: calculated $\lambda$ parameter
1663 *
1664 * Returns: <eslOK> on success, and <opt_P>, <opt_fi>, <opt_fj>, and <opt_lambda>
1665 * point to the results (for any of these that were passed non-<NULL>).
1666 *
1667 * <opt_P>, <opt_fi>, and <opt_fj>, if requested, are new
1668 * allocations, and must be freed by the caller.
1669 *
1670 * Returns <eslENORESULT> if the algorithm fails to determine a valid solution,
1671 * but the solution is still returned (and caller needs to free).
1672 *
1673 * Returns <eslEINVAL> if input score matrix isn't valid (sensu YuAltschul05);
1674 * now <opt_P>, <opt_fi>, <opt_fj> are returned NULL and <opt_lambda> is returned
1675 * as 0.
1676 *
1677 * Throws: <eslEMEM> on allocation failure.
1678 *
1679 * Xref: SRE:J1/35; SRE:J7/126.
1680 */
1681 int
esl_scorematrix_Probify(const ESL_SCOREMATRIX * S,ESL_DMATRIX ** opt_P,double ** opt_fi,double ** opt_fj,double * opt_lambda)1682 esl_scorematrix_Probify(const ESL_SCOREMATRIX *S, ESL_DMATRIX **opt_P, double **opt_fi, double **opt_fj, double *opt_lambda)
1683 {
1684 int status;
1685 ESL_DMATRIX *Sd = NULL;
1686 ESL_DMATRIX *P = NULL;
1687 double *fi = NULL;
1688 double *fj = NULL;
1689 double lambda;
1690 int i,j;
1691
1692 /* Check the input matrix for validity */
1693 if ( yualtschul_scorematrix_validate(S) != eslOK) { status = eslEINVAL; goto ERROR; }
1694
1695 if (( Sd = esl_dmatrix_Create(S->K, S->K)) == NULL) {status = eslEMEM; goto ERROR; }
1696 if (( P = esl_dmatrix_Create(S->Kp, S->Kp)) == NULL) {status = eslEMEM; goto ERROR; }
1697 ESL_ALLOC(fi, sizeof(double) * S->Kp);
1698 ESL_ALLOC(fj, sizeof(double) * S->Kp);
1699
1700 /* Construct a double-precision dmatrix from S.
1701 * I've tried integrating over the rounding uncertainty by
1702 * averaging over trials with values jittered by +/- 0.5,
1703 * but it doesn't appear to help.
1704 */
1705 for (i = 0; i < S->K; i++)
1706 for (j = 0; j < S->K; j++)
1707 Sd->mx[i][j] = (double) S->s[i][j];
1708
1709 /* Reverse engineer the doubles */
1710 if ((status = yualtschul_engine(Sd, P, fi, fj, &lambda)) != eslOK) goto ERROR;
1711 set_degenerate_probs(S->abc_r, P, fi, fj);
1712
1713 /* Done. */
1714 if (yualtschul_solution_validate(P, fi, fj) != eslOK) status = eslENORESULT;
1715 else status = eslOK;
1716
1717 esl_dmatrix_Destroy(Sd);
1718 if (opt_P != NULL) *opt_P = P; else esl_dmatrix_Destroy(P);
1719 if (opt_fi != NULL) *opt_fi = fi; else free(fi);
1720 if (opt_fj != NULL) *opt_fj = fj; else free(fj);
1721 if (opt_lambda != NULL) *opt_lambda = lambda;
1722 return status;
1723
1724 ERROR:
1725 if (Sd != NULL) esl_dmatrix_Destroy(Sd);
1726 if (P != NULL) esl_dmatrix_Destroy(P);
1727 if (fi != NULL) free(fi);
1728 if (fj != NULL) free(fj);
1729 if (opt_P != NULL) *opt_P = NULL;
1730 if (opt_fi != NULL) *opt_fi = NULL;
1731 if (opt_fj != NULL) *opt_fj = NULL;
1732 if (opt_lambda != NULL) *opt_lambda = 0.;
1733 return status;
1734 }
1735 /*---------- end, implicit prob basis, bg unknown ---------------*/
1736
1737
1738
1739
1740 /*****************************************************************
1741 * 7. Experiment driver
1742 *****************************************************************/
1743
1744 #ifdef eslSCOREMATRIX_EXPERIMENT
1745 #include <stdio.h>
1746 #include <stdlib.h>
1747
1748 #include "easel.h"
1749 #include "esl_alphabet.h"
1750 #include "esl_dmatrix.h"
1751 #include "esl_getopts.h"
1752 #include "esl_scorematrix.h"
1753 #include "esl_vectorops.h"
1754
1755 static ESL_OPTIONS options[] = {
1756 /* name type default env range togs reqs incomp help docgrp */
1757 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
1758 {"-l", eslARG_REAL, "0.3466", NULL, NULL, NULL, NULL, NULL, "set base lambda (units of score mx) to <x>", 0},
1759 {"-s", eslARG_REAL, "1.0", NULL, NULL, NULL, NULL, NULL, "additional scale factor applied to lambda", 0},
1760 {"-t", eslARG_REAL, "1.37", NULL, NULL, NULL, NULL, NULL, "set WAG time (branch length) to <x>", 0},
1761 {"--yfile", eslARG_OUTFILE, NULL, NULL, NULL, NULL, NULL, NULL, "save xy file of Yu/Altschul root eqn to <f>", 0},
1762 {"--mfile", eslARG_OUTFILE, NULL, NULL, NULL, NULL, NULL, NULL, "save WAG score matrix to <f>", 0},
1763 { 0,0,0,0,0,0,0,0,0,0},
1764 };
1765 static char usage[] = "[-options]";
1766 static char banner[] = "Yu/Altschul experiment driver for scorematrix module";
1767
1768 /* yualtschul_graph_dump()
1769 * Dump an XY plot of (\sum Y -1) vs. lambda for a score matrix.
1770 * X-axis of graph starts at <lambda0>, ends at <lambda1>, stepping by <stepsize>.
1771 */
1772 static int
yualtschul_graph_dump(FILE * ofp,ESL_SCOREMATRIX * S,double scale,double lambda0,double lambda1,double stepsize)1773 yualtschul_graph_dump(FILE *ofp, ESL_SCOREMATRIX *S, double scale, double lambda0, double lambda1, double stepsize)
1774 {
1775 struct yualtschul_params p;
1776 int a,b;
1777 double fx;
1778 double lambda;
1779
1780 /* Set up a bisection method to find lambda */
1781 p.S = esl_dmatrix_Create(S->K, S->K);
1782 p.M = esl_dmatrix_Create(S->K, S->K);
1783 p.Y = esl_dmatrix_Create(S->K, S->K);
1784
1785 for (a = 0; a < S->K; a++)
1786 for (b = 0; b < S->K; b++)
1787 p.S->mx[a][b] = (double) S->s[a][b];
1788
1789 for (lambda = lambda0; lambda <= lambda1; lambda += stepsize)
1790 {
1791 yualtschul_func(lambda/scale, &p, &fx);
1792 fprintf(ofp, "%f %f\n", lambda, fx);
1793 }
1794 fprintf(ofp, "&\n");
1795 fprintf(ofp, "%f 0.0\n", lambda0);
1796 fprintf(ofp, "%f 0.0\n", lambda1);
1797 fprintf(ofp, "&\n");
1798
1799 esl_dmatrix_Destroy(p.S);
1800 esl_dmatrix_Destroy(p.M);
1801 esl_dmatrix_Destroy(p.Y);
1802 return 0;
1803 }
1804
1805 int
main(int argc,char ** argv)1806 main(int argc, char **argv)
1807 {
1808 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1809 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); /* protein matrices 20x20 */
1810 ESL_DMATRIX *Q = esl_dmatrix_Create(abc->K, abc->K); /* WAG rate matrix */
1811 ESL_DMATRIX *P0 = esl_dmatrix_Create(abc->K, abc->K); /* p_ij joint probabilities calculated from WAG */
1812 double *wagpi = malloc(sizeof(double) * abc->K);
1813 ESL_SCOREMATRIX *S0 = esl_scorematrix_Create(abc); /* score matrix calculated from WAG p_ij's */
1814 double lambda0 = esl_opt_GetReal(go, "-l");
1815 double t = esl_opt_GetReal(go, "-t");
1816 double scale = esl_opt_GetReal(go, "-s");
1817 char *yfile = esl_opt_GetString(go, "--yfile");
1818 char *mfile = esl_opt_GetString(go, "--mfile");
1819 ESL_DMATRIX *P = NULL; /* p_ij's from Yu/Altschul reverse eng of S0 */
1820 double *fi = NULL;
1821 double *fj = NULL;
1822 double lambda;
1823 double D;
1824 int status;
1825
1826 /* Calculate an integer score matrix from a probabilistic rate matrix (WAG) */
1827 esl_scorematrix_SetWAG(S0, lambda0/scale, t);
1828 esl_composition_WAG(wagpi);
1829 printf("WAG matrix calculated at t=%.3f, lambda=%.4f (/%.1f)\n", t, lambda0, scale);
1830
1831 /* Save the matrix, if asked */
1832 if (mfile)
1833 {
1834 FILE *ofp = NULL;
1835 if ( (ofp = fopen(mfile, "w")) == NULL) esl_fatal("failed to open %s for writing scorematrix", mfile);
1836 ESL_DASSERT1(( S0->Kp >= 20 )); // the strcpy below is fine. The assertion tries to convince static analyzers of that.
1837 strcpy(S0->outorder, "ARNDCQEGHILKMFPSTWYV");
1838 esl_scorematrix_Write(ofp, S0);
1839 fclose(ofp);
1840 }
1841
1842 /* Because of integer roundoff, the actual probability basis is a little different */
1843 esl_scorematrix_ProbifyGivenBG(S0, wagpi, wagpi, &lambda, NULL);
1844 printf("Integer roundoff shifts implicit lambda (given wagpi's) to %.4f (/%.1f)\n", lambda*scale, scale);
1845 printf("Scores in matrix range from %d to %d\n", esl_scorematrix_Min(S0), esl_scorematrix_Max(S0));
1846
1847 esl_scorematrix_RelEntropy(S0, wagpi, wagpi, lambda, &D);
1848 printf("Relative entropy: %.3f bits\n", D);
1849
1850 if (yfile)
1851 {
1852 FILE *ofp = NULL;
1853 if ( (ofp = fopen(yfile, "w")) == NULL) esl_fatal("failed to open XY file %s for writing\n", yfile);
1854 yualtschul_graph_dump(ofp, S0, scale, 0.01, 1.0, 0.0001);
1855 fclose(ofp);
1856 printf("XY plot of Yu/Altschul rootfinding saved to : %s\n", yfile);
1857 }
1858
1859 status = esl_scorematrix_Probify(S0, &P, &fi, &fj, &lambda);
1860 printf("Yu/Altschul reverse engineering gives lambda = %.4f (/%.1f)\n", lambda*scale, scale);
1861
1862 //printf("fi's are: \n"); esl_vec_DDump(stdout, fi, S0->K, abc->sym);
1863
1864 if (status != eslOK) printf("however, the solution is INVALID!\n");
1865 else printf("and the joint and marginals are a valid probabilistic basis.\n");
1866
1867 free(fj);
1868 free(fi);
1869 esl_scorematrix_Destroy(S0);
1870 esl_dmatrix_Destroy(P);
1871 esl_dmatrix_Destroy(P0);
1872 esl_dmatrix_Destroy(Q);
1873 esl_alphabet_Destroy(abc);
1874 esl_getopts_Destroy(go);
1875 return 0;
1876 }
1877 #endif /* eslSCOREMATRIX_EXPERIMENT */
1878 /*------------------ end, experiment driver ---------------------*/
1879
1880
1881
1882 /*****************************************************************
1883 * 8. Utility programs
1884 *****************************************************************/
1885
1886 /* Reformat a score matrix file into Easel internal digital alphabet order, suitable for making
1887 * one of the static data structures in our section of preloaded matrices.
1888 */
1889 #ifdef eslSCOREMATRIX_UTILITY1
1890 /*
1891 gcc -g -Wall -o utility -I. -L. -DeslSCOREMATRIX_UTILITY1 esl_scorematrix.c -leasel -lm
1892 ./utility BLOSUM62
1893 */
1894 #include "easel.h"
1895 #include "esl_alphabet.h"
1896 #include "esl_scorematrix.h"
1897 #include "esl_fileparser.h"
1898
1899 int
main(int argc,char ** argv)1900 main(int argc, char **argv)
1901 {
1902 char *infile = argv[1];
1903 ESL_ALPHABET *abc;
1904 ESL_FILEPARSER *efp;
1905 ESL_SCOREMATRIX *S;
1906 int x,y;
1907
1908 abc = esl_alphabet_Create(eslAMINO);
1909
1910 if (esl_fileparser_Open(infile, NULL, &efp) != eslOK) esl_fatal("Failed to open %s\n", infile);
1911 if (esl_scorematrix_Read(efp, abc, &S) != eslOK) esl_fatal("parse failed: %s", efp->errbuf);
1912
1913 printf(" /*");
1914 for (y = 0; y < abc->Kp; y++)
1915 printf(" %c ", abc->sym[y]);
1916 printf(" */\n");
1917
1918 for (x = 0; x < abc->Kp; x++) {
1919 printf(" { ");
1920 for (y = 0; y < abc->Kp; y++)
1921 printf("%3d, ", S->s[x][y]);
1922 printf(" }, /* %c */\n", abc->sym[x]);
1923 }
1924
1925 esl_scorematrix_Destroy(S);
1926 esl_fileparser_Close(efp);
1927 esl_alphabet_Destroy(abc);
1928 return eslOK;
1929 }
1930 #endif /*eslSCOREMATRIX_UTILITY1*/
1931
1932
1933
1934
1935 /* Utility 2: joint or conditional probabilities from BLOSUM62 (depending on how compiled)
1936 */
1937 #ifdef eslSCOREMATRIX_UTILITY2
1938 /*
1939 gcc -g -Wall -o utility2 -I. -L. -DeslSCOREMATRIX_UTILITY2 esl_scorematrix.c -leasel -lm
1940 ./utility2
1941 */
1942 #include "easel.h"
1943 #include "esl_alphabet.h"
1944 #include "esl_dmatrix.h"
1945 #include "esl_scorematrix.h"
1946
1947 int
main(int argc,char ** argv)1948 main(int argc, char **argv)
1949 {
1950 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
1951 ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc);
1952 ESL_DMATRIX *Q = NULL;
1953 double *fa = NULL;
1954 double *fb = NULL;
1955 double slambda;
1956 int a,b;
1957
1958 esl_scorematrix_Set("BLOSUM62", S);
1959 esl_scorematrix_Probify(S, &Q, &fa, &fb, &slambda);
1960 #if 0
1961 esl_scorematrix_JointToConditionalOnQuery(abc, Q); /* Q->mx[a][b] is now P(b | a) */
1962 #endif
1963 esl_dmatrix_Dump(stdout, Q, abc->sym, abc->sym);
1964
1965 esl_dmatrix_Destroy(Q);
1966 esl_scorematrix_Destroy(S);
1967 esl_alphabet_Destroy(abc);
1968 return eslOK;
1969 }
1970 #endif /*eslSCOREMATRIX_UTILITY2*/
1971
1972
1973
1974
1975
1976
1977 /*****************************************************************
1978 * 9. Unit tests.
1979 *****************************************************************/
1980
1981 #ifdef eslSCOREMATRIX_TESTDRIVE
1982 #include "esl_dirichlet.h"
1983
1984 static void
utest_ReadWrite(ESL_ALPHABET * abc,ESL_SCOREMATRIX * S)1985 utest_ReadWrite(ESL_ALPHABET *abc, ESL_SCOREMATRIX *S)
1986 {
1987 char tmpfile[16] = "esltmpXXXXXX";
1988 FILE *fp = NULL;
1989 ESL_SCOREMATRIX *S2 = NULL;
1990 ESL_FILEPARSER *efp = NULL;
1991
1992 if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal("failed to open tmp file");
1993 if (esl_scorematrix_Write(fp, S) != eslOK) esl_fatal("failed to write test matrix");
1994 fclose(fp);
1995
1996 if (esl_fileparser_Open(tmpfile, NULL, &efp) != eslOK) esl_fatal("failed to open tmpfile containing BLOSUM62 matrix");
1997 if (esl_scorematrix_Read(efp, abc, &S2) != eslOK) esl_fatal("failed to read tmpfile containing BLOSUM62 matrix");
1998 if (esl_scorematrix_Compare(S, S2) != eslOK) esl_fatal("the two test matrices aren't identical");
1999
2000 remove(tmpfile);
2001 esl_fileparser_Close(efp);
2002 esl_scorematrix_Destroy(S2);
2003 return;
2004 }
2005
2006
2007 static void
utest_ProbifyGivenBG(ESL_SCOREMATRIX * S0,ESL_DMATRIX * P0,double * wagpi,double lambda0)2008 utest_ProbifyGivenBG(ESL_SCOREMATRIX *S0, ESL_DMATRIX *P0, double *wagpi, double lambda0)
2009 {
2010 char *msg = "ProbifyGivenBG() unit test failed";
2011 ESL_DMATRIX *P = NULL;
2012 double sum = 0.0;
2013 double lambda;
2014 int a,b;
2015
2016 if (esl_scorematrix_ProbifyGivenBG(S0, wagpi, wagpi, &lambda, &P) != eslOK) esl_fatal(msg);
2017
2018 if (esl_DCompare(lambda0, lambda, 1e-3) != eslOK) esl_fatal("lambda is wrong");
2019
2020 for (a = 0; a < 20; a++) /* you can't just call esl_dmx_Sum(P), because P includes */
2021 for (b = 0; b < 20; b++) /* marginalized degeneracies */
2022 sum += P->mx[a][b];
2023
2024 if (esl_DCompare(sum, 1.0, 1e-9) != eslOK) esl_fatal("P doesn't sum to 1");
2025
2026 for (a = 0; a < 20; a++) /* for the same reason, you can't dmatrix_Compare P and P0 */
2027 for (b = 0; b < 20; b++)
2028 if (esl_DCompare(P0->mx[a][b], P->mx[a][b], 1e-2) != eslOK) esl_fatal("P is wrong");
2029
2030 esl_dmatrix_Destroy(P);
2031 return;
2032 }
2033
2034
2035 /* The scores->pij reverse engineering engine works with scores in doubles,
2036 * so we can separate effects of rounding to integers in standard
2037 * score matrices.
2038 */
2039 static void
utest_yualtschul(ESL_DMATRIX * P0,double * wagpi)2040 utest_yualtschul(ESL_DMATRIX *P0, double *wagpi)
2041 {
2042 char *msg = "reverse engineering engine test failed";
2043 ESL_DMATRIX *S = NULL; /* original score matrix, in double form, not rounded to ints (calculated from P, fi, fj) */
2044 ESL_DMATRIX *P = NULL; /* backcalculated P_ij joint probabilities */
2045 double *fi = NULL; /* backcalculated f_i query composition */
2046 double *fj = NULL; /* backcalculated f'_j target composition */
2047 double lambda0; /* true lambda */
2048 double lambda; /* backcalculated lambda */
2049 double sum = 0.0;
2050 int i,j;
2051
2052 /* Allocations */
2053 if (( S = esl_dmatrix_Create(20, 20)) == NULL) esl_fatal(msg);
2054 if (( P = esl_dmatrix_Create(20, 20)) == NULL) esl_fatal(msg);
2055 if ((fi = malloc(sizeof(double) * 20)) == NULL) esl_fatal(msg);
2056 if ((fj = malloc(sizeof(double) * 20)) == NULL) esl_fatal(msg);
2057
2058 /* Make a WAG-based score matrix in double-precision, without rounding to integers */
2059 lambda0 = 0.3;
2060 for (i = 0; i < 20; i++)
2061 for (j = 0; j < 20; j++)
2062 S->mx[i][j] = log(P0->mx[i][j] / (wagpi[i] * wagpi[j])) / lambda0;
2063
2064 /* Reverse engineer it in double precision */
2065 if ( yualtschul_engine(S, P, fi, fj, &lambda) != eslOK) esl_fatal("reverse engineering engine failed");
2066
2067 /* Validate the solution (expect more accuracy from this than from integer scores) */
2068 if (esl_DCompare(lambda0, lambda, 1e-4) != eslOK) esl_fatal("failed to get right lambda");
2069
2070 for (i = 0; i < 20; i++) /* you can't just call esl_dmx_Sum(P), because P includes */
2071 for (j = 0; j < 20; j++) /* marginalized degeneracies */
2072 sum += P->mx[i][j];
2073 if (esl_DCompare(sum, 1.0, 1e-6) != eslOK) esl_fatal("reconstructed P doesn't sum to 1");
2074
2075 for (i = 0; i < 20; i++) /* for the same reason, you can't dmatrix_Compare P and P0 */
2076 for (j = 0; j < 20; j++)
2077 if (esl_DCompare(P0->mx[i][j], P->mx[i][j], 1e-2) != eslOK) esl_fatal("failed to recover correct P_ij");
2078 for (i = 0; i < 20; i++)
2079 {
2080 if (esl_DCompare(fi[i], fj[i], 1e-6) != eslOK) esl_fatal("background fi, fj not the same");
2081 if (esl_DCompare(wagpi[i], fi[i], 1e-3) != eslOK) esl_fatal("failed to reconstruct WAG backgrounds");
2082 }
2083
2084 free(fj);
2085 free(fi);
2086 esl_dmatrix_Destroy(S);
2087 esl_dmatrix_Destroy(P);
2088 return;
2089 }
2090
2091
2092 /* utest_Probify()
2093 * This tests Probify on a matrix that was calculated from probabilities in the first
2094 * place. It verifies that the reconstructed Pij matrix matches the original Pij's
2095 * that the score matrix was built from.
2096 */
2097 static void
utest_Probify(ESL_SCOREMATRIX * S0,ESL_DMATRIX * P0,double * wagpi,double lambda0)2098 utest_Probify(ESL_SCOREMATRIX *S0, ESL_DMATRIX *P0, double *wagpi, double lambda0)
2099 {
2100 ESL_DMATRIX *P = NULL;
2101 double *fi = NULL;
2102 double *fj = NULL;
2103 double lambda; /* reconstructed lambda */
2104 double sum = 0.0;
2105 int i,j;
2106
2107 if (esl_scorematrix_Probify(S0, &P, &fi, &fj, &lambda) != eslOK) esl_fatal("reverse engineering failed");
2108
2109 /* Validate the solution, gingerly (we expect significant error due to integer roundoff) */
2110 if (esl_DCompare(lambda0, lambda, 0.01) != eslOK) esl_fatal("failed to get right lambda");
2111 for (i = 0; i < 20; i++) /* you can't just call esl_dmx_Sum(P), because P includes */
2112 for (j = 0; j < 20; j++) /* marginalized degeneracies */
2113 sum += P->mx[i][j];
2114 if (esl_DCompare(sum, 1.0, 1e-6) != eslOK) esl_fatal("reconstructed P doesn't sum to 1");
2115
2116 for (i = 0; i < 20; i++) /* for the same reason, you can't dmatrix_Compare P and P0 */
2117 for (j = 0; j < 20; j++)
2118 if (esl_DCompare(P0->mx[i][j], P->mx[i][j], 0.1) != eslOK) esl_fatal("failed to recover correct P_ij");
2119 free(fj);
2120 free(fi);
2121 esl_dmatrix_Destroy(P);
2122 return;
2123 }
2124
2125 /* utest_ProbifyBLOSUM()
2126 * This tests Probify on a score matrix where the original Pij's are treated as
2127 * unknown. It verifies that if you create a new score matrix from the reconstructed
2128 * Pij's, you get the original score matrix back. BLOSUM62 makes a good example,
2129 * hence the name.
2130 */
2131 static void
utest_ProbifyBLOSUM(ESL_SCOREMATRIX * BL62)2132 utest_ProbifyBLOSUM(ESL_SCOREMATRIX *BL62)
2133 {
2134 char *msg = "failure in ProbifyBLOSUM() unit test";
2135 ESL_DMATRIX *P = NULL;
2136 double *fi = NULL;
2137 double *fj = NULL;
2138 double lambda;
2139 ESL_SCOREMATRIX *S2 = NULL;
2140
2141 if (( S2 = esl_scorematrix_Clone(BL62)) == NULL) esl_fatal(msg);
2142 if (esl_scorematrix_Probify(BL62, &P, &fi, &fj, &lambda) != eslOK) esl_fatal(msg);
2143 if (esl_scorematrix_SetFromProbs(S2, lambda, P, fi, fj) != eslOK) esl_fatal(msg);
2144 if (esl_scorematrix_CompareCanon(BL62, S2) != eslOK) esl_fatal(msg);
2145
2146 free(fj);
2147 free(fi);
2148 esl_scorematrix_Destroy(S2);
2149 esl_dmatrix_Destroy(P);
2150 return;
2151 }
2152
2153 #endif /*eslSCOREMATRIX_TESTDRIVE*/
2154
2155
2156 /*****************************************************************
2157 * 10. Test driver.
2158 *****************************************************************/
2159 /*
2160 gcc -g -Wall -I. -L. -o test -DeslSCOREMATRIX_TESTDRIVE esl_scorematrix.c -leasel -lm
2161 ./test
2162 */
2163 #ifdef eslSCOREMATRIX_TESTDRIVE
2164 #include "easel.h"
2165 #include "esl_scorematrix.h"
2166
2167 int
main(int argc,char ** argv)2168 main(int argc, char **argv)
2169 {
2170 ESL_ALPHABET *abc = NULL; /* amino acid alphabet */
2171 ESL_SCOREMATRIX *BL62= NULL; /* BLOSUM62 matrix */
2172 ESL_SCOREMATRIX *S0 = NULL; /* original score matrix (calculated from P, fi, fj) */
2173 ESL_DMATRIX *P0 = NULL; /* original P_ij joint probabilities */
2174 ESL_DMATRIX *Q = NULL; /* WAG rate matrix */
2175 double lambda0; /* true lambda used to construct S */
2176 double t;
2177 int i,j;
2178 static double wagpi[20];
2179
2180 /* Allocations */
2181 if ((abc = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal("allocation of alphabet failed");
2182 if ((BL62= esl_scorematrix_Create(abc)) == NULL) esl_fatal("allocation of BLOSUM62 failed");
2183 if ((S0 = esl_scorematrix_Create(abc)) == NULL) esl_fatal("allocation of scorematrix failed");
2184 if ((P0 = esl_dmatrix_Create(abc->K, abc->K)) == NULL) esl_fatal("P allocation failed");
2185 if ((Q = esl_dmatrix_Create(abc->K, abc->K)) == NULL) esl_fatal("Q allocation failed");
2186
2187 /* Make a BLOSUM matrix */
2188 if ( esl_scorematrix_Set("BLOSUM62", BL62) != eslOK) esl_fatal("failed to set a BLOSUM matrix");
2189
2190 /* Make a WAG-based score matrix with small lambda. */
2191 lambda0 = 0.00635;
2192 t = 2.0;
2193 esl_scorematrix_SetWAG(S0, lambda0, t);
2194 esl_composition_WAG(wagpi);
2195
2196 /* Redo some calculations to get the known probabilistic basis of that S */
2197 if ( esl_rmx_SetWAG(Q, wagpi) != eslOK) esl_fatal("failed to set WAG");
2198 if ( esl_dmx_Exp(Q, t, P0) != eslOK) esl_fatal("failed to exponentiate WAG");
2199 for (i = 0; i < 20; i++)
2200 for (j = 0; j < 20; j++)
2201 P0->mx[i][j] *= wagpi[i]; /* P_ij = P(j|i) pi_i */
2202
2203 /* The unit test battery
2204 */
2205 utest_ReadWrite(abc, BL62);
2206 utest_ReadWrite(abc, S0);
2207 utest_ProbifyGivenBG(S0, P0, wagpi, lambda0);
2208 utest_yualtschul(P0, wagpi);
2209 utest_Probify(S0, P0, wagpi, lambda0);
2210 utest_ProbifyBLOSUM(BL62);
2211
2212 esl_dmatrix_Destroy(Q);
2213 esl_dmatrix_Destroy(P0);
2214 esl_scorematrix_Destroy(BL62);
2215 esl_scorematrix_Destroy(S0);
2216 esl_alphabet_Destroy(abc);
2217
2218 return 0;
2219 }
2220 #endif /*eslSCOREMATRIX_TESTDRIVE*/
2221
2222 /*****************************************************************
2223 * 11. Example program
2224 *****************************************************************/
2225
2226 #ifdef eslSCOREMATRIX_EXAMPLE
2227 /*::cexcerpt::scorematrix_example::begin::*/
2228 #include "easel.h"
2229 #include "esl_alphabet.h"
2230 #include "esl_fileparser.h"
2231 #include "esl_getopts.h"
2232 #include "esl_dmatrix.h"
2233 #include "esl_vectorops.h"
2234 #include "esl_scorematrix.h"
2235
2236 static ESL_OPTIONS options[] = {
2237 /* name type default env range toggles reqs incomp help docgroup*/
2238 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
2239 { "--dna", eslARG_NONE, FALSE, NULL, NULL, "--dna,--amino", NULL, NULL, "use DNA alphabet", 0 },
2240 { "--amino", eslARG_NONE, "TRUE", NULL, NULL, "--dna,--amino", NULL, NULL, "use protein alphabet", 0 },
2241 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2242 };
2243 static char usage[] = "[-options] <mxfile>";
2244 static char banner[] = "example of using easel scorematrix routines";
2245
2246
2247 int
main(int argc,char ** argv)2248 main(int argc, char **argv)
2249 {
2250 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
2251 char *scorefile = esl_opt_GetArg(go, 1);
2252 ESL_ALPHABET *abc = NULL;
2253 ESL_FILEPARSER *efp = NULL;
2254 ESL_SCOREMATRIX *S = NULL;
2255 ESL_DMATRIX *P1 = NULL; /* implicit probability basis, bg unknown */
2256 ESL_DMATRIX *P2 = NULL; /* implicit probability basis, bg known */
2257 double *fi = NULL;
2258 double *fj = NULL;
2259 double lambda, D, E;
2260 int vstatus;
2261
2262 if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
2263 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
2264
2265 /* Input a score matrix from a file. */
2266 if ( esl_fileparser_Open(scorefile, NULL, &efp) != eslOK) esl_fatal("failed to open score file %s", scorefile);
2267 if ( esl_scorematrix_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s:\n %s", scorefile, efp->errbuf);
2268 esl_fileparser_Close(efp);
2269
2270 /* Try to reverse engineer it to get implicit probabilistic model. This may fail! */
2271 vstatus = esl_scorematrix_Probify(S, &P1, &fi, &fj, &lambda);
2272
2273 if (vstatus == eslOK)
2274 { /* Print some info, and the joint probabilities. */
2275
2276 esl_scorematrix_RelEntropy (S, fi, fj, lambda, &D);
2277 esl_scorematrix_ExpectedScore(S, fi, fj, &E);
2278
2279 printf("By Yu/Altschul (2003,2005) procedure:\n");
2280 printf("Lambda = %.4f\n", lambda);
2281 printf("Relative entropy = %.4f bits\n", D);
2282 printf("Expected score = %.4f bits\n", E * lambda * eslCONST_LOG2R);
2283
2284 printf("p_ij's are:\n"); esl_dmatrix_Dump(stdout, P1, abc->sym, abc->sym);
2285 printf("fi's are:\n"); esl_vec_DDump(stdout, fi, S->K, abc->sym);
2286 printf("fj's are:\n"); esl_vec_DDump(stdout, fj, S->K, abc->sym);
2287 printf("============================================================\n\n");
2288 }
2289 else
2290 {
2291 printf("Yu/Altschul procedure FAILS to find a valid implicit probability basis!\n");
2292 printf("Lambda = %.4f\n", lambda);
2293 printf("p_ij's are:\n"); esl_dmatrix_Dump(stdout, P1, abc->sym, abc->sym);
2294 printf("fi's are:\n"); esl_vec_DDump(stdout, fi, S->K, abc->sym);
2295 printf("fj's are:\n"); esl_vec_DDump(stdout, fj, S->K, abc->sym);
2296 printf("============================================================\n\n");
2297
2298 esl_composition_BL62(fi); esl_composition_BL62(fj);
2299 }
2300
2301 /* Now reverse engineer it again, this time using "known" background probs */
2302 esl_scorematrix_ProbifyGivenBG(S, fi, fj, &lambda, &P2);
2303 esl_scorematrix_RelEntropy (S, fi, fj, lambda, &D);
2304 esl_scorematrix_ExpectedScore(S, fi, fj, &E);
2305
2306 printf("By solving for lambda from given background frequencies:\n");
2307 printf("Lambda = %.4f\n", lambda);
2308 printf("Relative entropy = %.4f bits\n", D);
2309 printf("Expected score = %.4f bits\n", E * lambda * eslCONST_LOG2R);
2310
2311 printf("p_ij's are:\n"); esl_dmatrix_Dump(stdout, P2, abc->sym, abc->sym);
2312 printf("fi's are:\n"); esl_vec_DDump(stdout, fi, S->K, abc->sym);
2313 printf("fj's are:\n"); esl_vec_DDump(stdout, fj, S->K, abc->sym);
2314 printf("============================================================\n\n");
2315
2316
2317 /* Now recalculate a score matrix from the probabilistic basis */
2318 printf("Before:\n");
2319 esl_scorematrix_Write(stdout, S);
2320 printf("After:\n");
2321 esl_scorematrix_SetFromProbs(S, lambda, P2, fi, fj);
2322 esl_scorematrix_Write(stdout, S);
2323
2324 free(fi); free(fj);
2325 esl_dmatrix_Destroy(P1); esl_dmatrix_Destroy(P2);
2326 esl_scorematrix_Destroy(S);
2327 esl_alphabet_Destroy(abc);
2328 esl_getopts_Destroy(go);
2329 return 0;
2330 }
2331 /*::cexcerpt::scorematrix_example::end::*/
2332 #endif /*eslSCOREMATRIX_EXAMPLE*/
2333