1 /* Pairwise identities, distances, and distance matrices.
2 *
3 * Contents:
4 * 1. Pairwise distances for aligned text sequences.
5 * 2. Pairwise distances for aligned digital seqs.
6 * 3. Distance matrices for aligned text sequences.
7 * 4. Distance matrices for aligned digital sequences.
8 * 5. Average pairwise identity for multiple alignments.
9 * 6. Private (static) functions.
10 * 7. Unit tests.
11 * 8. Test driver.
12 * 9. Example.
13 */
14 #include "esl_config.h"
15
16 #include <ctype.h>
17 #include <string.h>
18 #include <math.h>
19
20 #include "easel.h"
21 #include "esl_alphabet.h"
22 #include "esl_dmatrix.h"
23 #include "esl_random.h"
24
25 #include "esl_distance.h"
26
27 /* Forward declaration of our static functions.
28 */
29 static int jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance);
30
31
32 /*****************************************************************
33 * 1. Pairwise distances for aligned text sequences.
34 *****************************************************************/
35
36 /* Function: esl_dst_CPairId()
37 * Synopsis: Pairwise identity of two aligned text strings.
38 * Incept: SRE, Mon Apr 17 20:06:07 2006 [St. Louis]
39 *
40 * Purpose: Calculates pairwise fractional identity between two
41 * aligned character strings <asq1> and <asq2>.
42 * Return this distance in <opt_pid>; return the
43 * number of identities counted in <opt_nid>; and
44 * return the denominator <MIN(len1,len2)> in
45 * <opt_n>.
46 *
47 * Alphabetic symbols <[a-zA-Z]> are compared
48 * case-insensitively for identity. Any nonalphabetic
49 * character is assumed to be a gap symbol.
50 *
51 * This simple comparison rule is unaware of synonyms and
52 * degeneracies in biological alphabets. For a more
53 * sophisticated and biosequence-aware comparison, use
54 * digitized sequences and the <esl_dst_XPairId()> function
55 * instead. Note that currently <esl_dst_XPairId()> does
56 * not correctly handle degeneracies, but is set up to.
57 *
58 * Args: asq1 - aligned character string 1
59 * asq2 - aligned character string 2
60 * opt_pid - optRETURN: pairwise identity, 0<=x<=1
61 * opt_nid - optRETURN: # of identities
62 * opt_n - optRETURN: denominator MIN(len1,len2)
63 *
64 * Returns: <eslOK> on success. <opt_pid>, <opt_nid>, <opt_n>
65 * contain the answers (for whichever were passed non-NULL).
66 *
67 * Throws: <eslEINVAL> if the strings are different lengths
68 * (not aligned).
69 */
70 int
esl_dst_CPairId(const char * asq1,const char * asq2,double * opt_pid,int * opt_nid,int * opt_n)71 esl_dst_CPairId(const char *asq1, const char *asq2,
72 double *opt_pid, int *opt_nid, int *opt_n)
73 {
74 int status;
75 int idents; /* total identical positions */
76 int len1, len2; /* lengths of seqs */
77 int i; /* position in aligned seqs */
78
79 idents = len1 = len2 = 0;
80 for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++)
81 {
82 if (isalpha(asq1[i])) len1++;
83 if (isalpha(asq2[i])) len2++;
84 if (isalpha(asq1[i]) && isalpha(asq2[i])
85 && toupper(asq1[i]) == toupper(asq2[i]))
86 idents++;
87 }
88 if (asq1[i] != '\0' || asq2[i] != '\0')
89 ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
90
91 if (opt_pid != NULL) *opt_pid = ( len1==0 ? 0. : (double) idents / (double) ESL_MIN(len1,len2));
92 if (opt_nid != NULL) *opt_nid = idents;
93 if (opt_n != NULL) *opt_n = len1;
94 return eslOK;
95
96 ERROR:
97 if (opt_pid != NULL) *opt_pid = 0.;
98 if (opt_nid != NULL) *opt_nid = 0;
99 if (opt_n != NULL) *opt_n = 0;
100 return status;
101 }
102
103 /* Function: esl_dst_CPairMatch()
104 * Synopsis: Pairwise matches of two aligned text strings.
105 * Incept: ER, Wed Oct 29 09:02:35 EDT 2014 [janelia]
106 *
107 * Purpose: Calculates pairwise fractional matches between two
108 * aligned character strings <asq1> and <asq2>.
109 * Return this distance in <opt_pmatch>; return the
110 * number of matches counted in <opt_nmatch>; and
111 * return the denominator <alen - double_gaps> in
112 * <opt_n>.
113 *
114 * Alphabetic symbols <[a-zA-Z]> are compared
115 * case-insensitively for identity. Any nonalphabetic
116 * character is assumed to be a gap symbol.
117 *
118 * This simple comparison rule is unaware of synonyms and
119 * degeneracies in biological alphabets. For a more
120 * sophisticated and biosequence-aware comparison, use
121 * digitized sequences and the <esl_dst_XPairmatch()> function
122 * instead. Note that currently <esl_dst_XPairMatch()> does
123 * not correctly handle degeneracies, but is set up to.
124 *
125 * Args: asq1 - aligned character string 1
126 * asq2 - aligned character string 2
127 * opt_pmatch - optRETURN: pairwise matches, 0<=x<=1
128 * opt_nmatch - optRETURN: # of matches
129 * opt_n - optRETURN: denominator alen - double_gaps
130 *
131 * Returns: <eslOK> on success. <opt_pmatch>, <opt_nmatch>, <opt_n>
132 * contain the answers (for whichever were passed non-NULL).
133 *
134 * Throws: <eslEINVAL> if the strings are different lengths
135 * (not aligned).
136 */
137 int
esl_dst_CPairMatch(const char * asq1,const char * asq2,double * opt_pmatch,int * opt_nmatch,int * opt_n)138 esl_dst_CPairMatch(const char *asq1, const char *asq2,
139 double *opt_pmatch, int *opt_nmatch, int *opt_n)
140 {
141 int status;
142 int match; /* total matched positions */
143 int len; /* length of alignment (no double gaps) */
144 int i; /* position in aligned seqs */
145
146 match = len = 0;
147 for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++)
148 {
149 if (isalpha(asq1[i]) || isalpha(asq2[i])) len++;
150 if (isalpha(asq1[i]) && isalpha(asq2[i])) match++;
151 }
152 if (asq1[i] != '\0' || asq2[i] != '\0')
153 ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
154
155 if (opt_pmatch != NULL) *opt_pmatch = ( len==0 ? 0. : (double)match / (double)len);
156 if (opt_nmatch != NULL) *opt_nmatch = match;
157 if (opt_n != NULL) *opt_n = len;
158 return eslOK;
159
160 ERROR:
161 if (opt_pmatch != NULL) *opt_pmatch = 0.;
162 if (opt_nmatch != NULL) *opt_nmatch = 0;
163 if (opt_n != NULL) *opt_n = 0;
164 return status;
165 }
166
167 /* Function: esl_dst_CJukesCantor()
168 * Synopsis: Jukes-Cantor distance for two aligned strings.
169 * Incept: SRE, Tue Apr 18 14:00:37 2006 [St. Louis]
170 *
171 * Purpose: Calculate the generalized Jukes-Cantor distance between
172 * two aligned character strings <as1> and <as2>, in
173 * substitutions/site, for an alphabet of <K> residues
174 * (<K=4> for nucleic acid, <K=20> for proteins). The
175 * maximum likelihood estimate for the distance is
176 * optionally returned in <opt_distance>. The large-sample
177 * variance for the distance estimate is
178 * optionally returned in <opt_variance>.
179 *
180 * Alphabetic symbols <[a-zA-Z]> are compared
181 * case-insensitively to count the number of identities
182 * (<n1>) and mismatches (<n2>>). Any nonalphabetic
183 * character is assumed to be a gap symbol, and aligned
184 * columns containing gap symbols are ignored. The
185 * fractional difference <D> used to calculate the
186 * Jukes/Cantor distance is <n2/n1+n2>.
187 *
188 * Args: K - size of the alphabet (4 or 20)
189 * as1 - 1st aligned seq, 0..L-1, \0-terminated
190 * as2 - 2nd aligned seq, 0..L-1, \0-terminated
191 * opt_distance - optRETURN: ML estimate of distance d
192 * opt_variance - optRETURN: large-sample variance of d
193 *
194 * Returns: <eslOK> on success.
195 *
196 * Infinite distances are possible, in which case distance
197 * and variance are both <HUGE_VAL>. Caller has to deal
198 * with this case as it sees fit, perhaps by enforcing
199 * an arbitrary maximum distance.
200 *
201 * Throws: <eslEINVAL> if the two strings aren't the same length (and
202 * thus can't have been properly aligned).
203 * <eslEDIVZERO> if no aligned residues were counted.
204 * On either failure, distance and variance are both returned
205 * as <HUGE_VAL>.
206 */
207 int
esl_dst_CJukesCantor(int K,const char * as1,const char * as2,double * opt_distance,double * opt_variance)208 esl_dst_CJukesCantor(int K, const char *as1, const char *as2,
209 double *opt_distance, double *opt_variance)
210 {
211 int status;
212 int n1, n2; /* number of observed identities, substitutions */
213 int i; /* position in aligned seqs */
214
215 /* 1. Count identities, mismatches.
216 */
217 n1 = n2 = 0;
218 for (i = 0; as1[i] != '\0' && as2[i] != '\0'; i++)
219 {
220 if (isalpha(as1[i]) && isalpha(as2[i]))
221 {
222 if (toupper(as1[i]) == toupper(as2[i])) n1++; else n2++;
223 }
224 }
225 if (as1[i] != '\0' || as2[i] != '\0')
226 ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
227
228 return jukescantor(n1, n2, K, opt_distance, opt_variance); /* can throw eslEDIVZERO */
229
230 ERROR:
231 if (opt_distance != NULL) *opt_distance = HUGE_VAL;
232 if (opt_variance != NULL) *opt_variance = HUGE_VAL;
233 return status;
234 }
235
236 /*------- end, pairwise distances for aligned text seqs ---------*/
237
238
239
240
241
242 /*****************************************************************
243 * 2. Pairwise distances for aligned digitized sequences.
244 *****************************************************************/
245
246 /* Function: esl_dst_XPairId()
247 * Synopsis: Pairwise identity of two aligned digital seqs.
248 * Incept: SRE, Tue Apr 18 09:24:05 2006 [St. Louis]
249 *
250 * Purpose: Digital version of <esl_dst_CPairId()>: <adsq1> and
251 * <adsq2> are digitized aligned sequences, in alphabet
252 * <abc>. Otherwise, same as <esl_dst_CPairId()> except
253 * that only canonical residues are counted and checked for
254 * identity, while <esl_dst_CPairId()> (which has no
255 * alphabet) counts and checks identity of all alphanumeric
256 * characters.
257 *
258 * This function does not use <esl_abc_Match()> to handle
259 * degeneracies but it is set up to do so. Doing that would
260 * require that <opt_nid> be changed to a float or double,
261 * or its meaning be changed to be the number of canonical
262 * identities.
263 *
264 * Args: abc - digital alphabet in use
265 * ax1 - aligned digital seq 1
266 * ax2 - aligned digital seq 2
267 * opt_pid - optRETURN: pairwise identity, 0<=x<=1
268 * opt_nid - optRETURN: # of identities
269 * opt_n - optRETURN: denominator MIN(len1,len2)
270 *
271 * Returns: <eslOK> on success. <opt_distance>, <opt_nid>, <opt_n>
272 * contain the answers, for any of these that were passed
273 * non-<NULL> pointers.
274 *
275 * Throws: <eslEINVAL> if the strings are different lengths (not aligned).
276 */
277 int
esl_dst_XPairId(const ESL_ALPHABET * abc,const ESL_DSQ * ax1,const ESL_DSQ * ax2,double * opt_distance,int * opt_nid,int * opt_n)278 esl_dst_XPairId(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2,
279 double *opt_distance, int *opt_nid, int *opt_n)
280 {
281 int status;
282 int idents; /* total identical positions */
283 int len1, len2; /* lengths of seqs */
284 int i; /* position in aligned seqs */
285
286 idents = len1 = len2 = 0;
287 for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++)
288 {
289 if (esl_abc_XIsCanonical(abc, ax1[i])) len1++;
290 if (esl_abc_XIsCanonical(abc, ax2[i])) len2++;
291
292 if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i])
293 && ax1[i] == ax2[i])
294 idents++;
295 }
296 if (len2 < len1) len1 = len2;
297
298 if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL)
299 ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
300
301 if (opt_distance != NULL) *opt_distance = ( len1==0 ? 0. : (double) idents / (double) len1 );
302 if (opt_nid != NULL) *opt_nid = idents;
303 if (opt_n != NULL) *opt_n = len1;
304 return eslOK;
305
306 ERROR:
307 if (opt_distance != NULL) *opt_distance = 0.;
308 if (opt_nid != NULL) *opt_nid = 0;
309 if (opt_n != NULL) *opt_n = 0;
310 return status;
311 }
312
313 /* Function: esl_dst_XPairMatch()
314 * Synopsis: Pairwise matches of two aligned digital seqs.
315 * Incept: ER, Wed Oct 29 09:09:07 EDT 2014 [janelia]
316 *
317 * Purpose: Digital version of <esl_dst_CPairMatch()>: <adsq1> and
318 * <adsq2> are digitized aligned sequences, in alphabet
319 * <abc>. Otherwise, same as <esl_dst_CPairId()> except
320 * that only canonical residues are counted and checked for
321 * identity, while <esl_dst_CPairId()> (which has no
322 * alphabet) counts and checks identity of all alphanumeric
323 * characters.
324 *
325 * Args: abc - digital alphabet in use
326 * ax1 - aligned digital seq 1
327 * ax2 - aligned digital seq 2
328 * opt_pmatch - optRETURN: pairwise matches, 0<=x<=1
329 * opt_nmatch - optRETURN: # of maches
330 * opt_n - optRETURN: denominator alen-double_gaps
331 *
332 * Returns: <eslOK> on success. <opt_distance>, <opt_nmatch>, <opt_n>
333 * contain the answers, for any of these that were passed
334 * non-<NULL> pointers.
335 *
336 * Throws: <eslEINVAL> if the strings are different lengths (not aligned).
337 */
338 int
esl_dst_XPairMatch(const ESL_ALPHABET * abc,const ESL_DSQ * ax1,const ESL_DSQ * ax2,double * opt_distance,int * opt_nmatch,int * opt_n)339 esl_dst_XPairMatch(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2,
340 double *opt_distance, int *opt_nmatch, int *opt_n)
341 {
342 int status;
343 int match; /* total matched positions */
344 int len; /* length of alignment (no double gaps) */
345 int i; /* position in aligned seqs */
346
347 match = len = 0;
348 for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++)
349 {
350 if (esl_abc_XIsCanonical(abc, ax1[i]) || esl_abc_XIsCanonical(abc, ax2[i])) len ++;
351 if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i])) match++;
352 }
353
354 if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL)
355 ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
356
357 if (opt_distance != NULL) *opt_distance = ( len==0 ? 0. : (double)match / (double)len );
358 if (opt_nmatch != NULL) *opt_nmatch = match;
359 if (opt_n != NULL) *opt_n = len;
360 return eslOK;
361
362 ERROR:
363 if (opt_distance != NULL) *opt_distance = 0.;
364 if (opt_nmatch != NULL) *opt_nmatch = 0;
365 if (opt_n != NULL) *opt_n = 0;
366 return status;
367 }
368
369
370 /* Function: esl_dst_XJukesCantor()
371 * Synopsis: Jukes-Cantor distance for two aligned digitized seqs.
372 * Incept: SRE, Tue Apr 18 15:26:51 2006 [St. Louis]
373 *
374 * Purpose: Calculate the generalized Jukes-Cantor distance between two
375 * aligned digital strings <ax> and <ay>, in substitutions/site,
376 * using alphabet <abc> to evaluate identities and differences.
377 * The maximum likelihood estimate for the distance is optionally returned in
378 * <opt_distance>. The large-sample variance for the distance
379 * estimate is optionally returned in <opt_variance>.
380 *
381 * Identical to <esl_dst_CJukesCantor()>, except that it takes
382 * digital sequences instead of character strings.
383 *
384 * Args: abc - bioalphabet to use for comparisons
385 * ax - 1st digital aligned seq
386 * ay - 2nd digital aligned seq
387 * opt_distance - optRETURN: ML estimate of distance d
388 * opt_variance - optRETURN: large-sample variance of d
389 *
390 * Returns: <eslOK> on success. As in <esl_dst_CJukesCantor()>, the
391 * distance and variance may be infinite, in which case they
392 * are returned as <HUGE_VAL>.
393 *
394 * Throws: <eslEINVAL> if the two strings aren't the same length (and
395 * thus can't have been properly aligned).
396 * <eslEDIVZERO> if no aligned residues were counted.
397 * On either failure, the distance and variance are set
398 * to <HUGE_VAL>.
399 */
400 int
esl_dst_XJukesCantor(const ESL_ALPHABET * abc,const ESL_DSQ * ax,const ESL_DSQ * ay,double * opt_distance,double * opt_variance)401 esl_dst_XJukesCantor(const ESL_ALPHABET *abc, const ESL_DSQ *ax, const ESL_DSQ *ay,
402 double *opt_distance, double *opt_variance)
403 {
404 int status;
405 int n1, n2; /* number of observed identities, substitutions */
406 int i; /* position in aligned seqs */
407
408 n1 = n2 = 0;
409 for (i = 1; ax[i] != eslDSQ_SENTINEL && ay[i] != eslDSQ_SENTINEL; i++)
410 {
411 if (esl_abc_XIsCanonical(abc, ax[i]) && esl_abc_XIsCanonical(abc, ay[i]))
412 {
413 if (ax[i] == ay[i]) n1++;
414 else n2++;
415 }
416 }
417 if (ax[i] != eslDSQ_SENTINEL || ay[i] != eslDSQ_SENTINEL)
418 ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
419
420 return jukescantor(n1, n2, abc->K, opt_distance, opt_variance);
421
422 ERROR:
423 if (opt_distance != NULL) *opt_distance = HUGE_VAL;
424 if (opt_variance != NULL) *opt_variance = HUGE_VAL;
425 return status;
426 }
427
428
429 /*---------- end pairwise distances, digital seqs --------------*/
430
431
432
433
434 /*****************************************************************
435 * 3. Distance matrices for aligned text sequences.
436 *****************************************************************/
437
438 /* Function: esl_dst_CPairIdMx()
439 * Synopsis: NxN identity matrix for N aligned text sequences.
440 * Incept: SRE, Thu Apr 27 08:46:08 2006 [New York]
441 *
442 * Purpose: Given a multiple sequence alignment <as>, consisting
443 * of <N> aligned character strings; calculate
444 * a symmetric fractional pairwise identity matrix by $N(N-1)/2$
445 * calls to <esl_dst_CPairId()>, and return it in
446 * <ret_D>.
447 *
448 * Args: as - aligned seqs (all same length), [0..N-1]
449 * N - # of aligned sequences
450 * ret_S - RETURN: symmetric fractional identity matrix
451 *
452 * Returns: <eslOK> on success, and <ret_S> contains the fractional
453 * identity matrix. Caller free's <S> with
454 * <esl_dmatrix_Destroy()>.
455 *
456 * Throws: <eslEINVAL> if a seq has a different
457 * length than others. On failure, <ret_D> is returned <NULL>
458 * and state of inputs is unchanged.
459 *
460 * <eslEMEM> on allocation failure.
461 */
462 int
esl_dst_CPairIdMx(char ** as,int N,ESL_DMATRIX ** ret_S)463 esl_dst_CPairIdMx(char **as, int N, ESL_DMATRIX **ret_S)
464 {
465 ESL_DMATRIX *S = NULL;
466 int status;
467 int i,j;
468
469 if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }
470
471 for (i = 0; i < N; i++)
472 {
473 S->mx[i][i] = 1.;
474 for (j = i+1; j < N; j++)
475 {
476 status = esl_dst_CPairId(as[i], as[j], &(S->mx[i][j]), NULL, NULL);
477 if (status != eslOK)
478 ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
479 S->mx[j][i] = S->mx[i][j];
480 }
481 }
482 if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
483 return eslOK;
484
485 ERROR:
486 if (S != NULL) esl_dmatrix_Destroy(S);
487 if (ret_S != NULL) *ret_S = NULL;
488 return status;
489 }
490
491
492 /* Function: esl_dst_CDiffMx()
493 * Synopsis: NxN difference matrix for N aligned text sequences.
494 * Incept: SRE, Fri Apr 28 06:27:20 2006 [New York]
495 *
496 * Purpose: Same as <esl_dst_CPairIdMx()>, but calculates
497 * the fractional difference <d=1-s> instead of the
498 * fractional identity <s> for each pair.
499 *
500 * Args: as - aligned seqs (all same length), [0..N-1]
501 * N - # of aligned sequences
502 * ret_D - RETURN: symmetric fractional difference matrix
503 *
504 * Returns: <eslOK> on success, and <ret_D> contains the
505 * fractional difference matrix. Caller free's <D> with
506 * <esl_dmatrix_Destroy()>.
507 *
508 * Throws: <eslEINVAL> if any seq has a different
509 * length than others. On failure, <ret_D> is returned <NULL>
510 * and state of inputs is unchanged.
511 */
512 int
esl_dst_CDiffMx(char ** as,int N,ESL_DMATRIX ** ret_D)513 esl_dst_CDiffMx(char **as, int N, ESL_DMATRIX **ret_D)
514 {
515 ESL_DMATRIX *D = NULL;
516 int status;
517 int i,j;
518
519 status = esl_dst_CPairIdMx(as, N, &D);
520 if (status != eslOK) goto ERROR;
521
522 for (i = 0; i < N; i++)
523 {
524 D->mx[i][i] = 0.;
525 for (j = i+1; j < N; j++)
526 {
527 D->mx[i][j] = 1. - D->mx[i][j];
528 D->mx[j][i] = D->mx[i][j];
529 }
530 }
531
532 if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D);
533 return eslOK;
534
535 ERROR:
536 if (D != NULL) esl_dmatrix_Destroy(D);
537 if (ret_D != NULL) *ret_D = NULL;
538 return status;
539
540 }
541
542 /* Function: esl_dst_CJukesCantorMx()
543 * Synopsis: NxN Jukes/Cantor distance matrix for N aligned text seqs.
544 * Incept: SRE, Tue Apr 18 16:00:16 2006 [St. Louis]
545 *
546 * Purpose: Given a multiple sequence alignment <aseq>, consisting of
547 * <nseq> aligned character sequences in an alphabet of
548 * <K> letters (usually 4 for DNA, 20 for protein);
549 * calculate a symmetric Jukes/Cantor pairwise distance
550 * matrix for all sequence pairs, and optionally return the distance
551 * matrix in <ret_D>, and optionally return a symmetric matrix of the
552 * large-sample variances for those ML distance estimates
553 * in <ret_V>.
554 *
555 * Infinite distances (and variances) are possible; they
556 * are represented as <HUGE_VAL> in <D> and <V>. Caller must
557 * be prepared to deal with them as appropriate.
558 *
559 * Args: K - size of the alphabet (usually 4 or 20)
560 * aseq - aligned sequences [0.nseq-1][0..L-1]
561 * nseq - number of aseqs
562 * opt_D - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
563 * opt_V - optRETURN: matrix of variances.
564 *
565 * Returns: <eslOK> on success. <D> and <V> contain the
566 * distance matrix (and variances); caller frees these with
567 * <esl_dmatrix_Destroy()>.
568 *
569 * Throws: <eslEINVAL> if any pair of sequences have differing lengths
570 * (and thus cannot have been properly aligned).
571 * <eslEDIVZERO> if some pair of sequences had no aligned
572 * residues. On failure, <D> and <V> are both returned <NULL>
573 * and state of inputs is unchanged.
574 *
575 * <eslEMEM> on allocation failure.
576 */
577 int
esl_dst_CJukesCantorMx(int K,char ** aseq,int nseq,ESL_DMATRIX ** opt_D,ESL_DMATRIX ** opt_V)578 esl_dst_CJukesCantorMx(int K, char **aseq, int nseq,
579 ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
580 {
581 int status;
582 ESL_DMATRIX *D = NULL;
583 ESL_DMATRIX *V = NULL;
584 int i,j;
585
586 if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
587 if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
588
589 for (i = 0; i < nseq; i++)
590 {
591 D->mx[i][i] = 0.;
592 V->mx[i][i] = 0.;
593 for (j = i+1; j < nseq; j++)
594 {
595 status = esl_dst_CJukesCantor(K, aseq[i], aseq[j],
596 &(D->mx[i][j]), &(V->mx[i][j]));
597 if (status != eslOK)
598 ESL_XEXCEPTION(status, "J/C calculation failed at seqs %d,%d", i,j);
599
600 D->mx[j][i] = D->mx[i][j];
601 V->mx[j][i] = V->mx[i][j];
602 }
603 }
604 if (opt_D != NULL) *opt_D = D; else esl_dmatrix_Destroy(D);
605 if (opt_V != NULL) *opt_V = V; else esl_dmatrix_Destroy(V);
606 return eslOK;
607
608 ERROR:
609 if (D != NULL) esl_dmatrix_Destroy(D);
610 if (V != NULL) esl_dmatrix_Destroy(V);
611 if (opt_D != NULL) *opt_D = NULL;
612 if (opt_V != NULL) *opt_V = NULL;
613 return status;
614 }
615 /*----------- end, distance matrices for aligned text seqs ---------*/
616
617
618
619
620 /*****************************************************************
621 * 4. Distance matrices for aligned digital sequences.
622 *****************************************************************/
623
624 /* Function: esl_dst_XPairIdMx()
625 * Synopsis: NxN identity matrix for N aligned digital seqs.
626 * Incept: SRE, Thu Apr 27 09:08:11 2006 [New York]
627 *
628 * Purpose: Given a digitized multiple sequence alignment <ax>, consisting
629 * of <N> aligned digital sequences in alphabet <abc>; calculate
630 * a symmetric pairwise fractional identity matrix by $N(N-1)/2$
631 * calls to <esl_dst_XPairId()>, and return it in <ret_S>.
632 *
633 * Args: abc - digital alphabet in use
634 * ax - aligned dsq's, [0..N-1][1..alen]
635 * N - number of aligned sequences
636 * ret_S - RETURN: NxN matrix of fractional identities
637 *
638 * Returns: <eslOK> on success, and <ret_S> contains the distance
639 * matrix. Caller is obligated to free <S> with
640 * <esl_dmatrix_Destroy()>.
641 *
642 * Throws: <eslEINVAL> if a seq has a different
643 * length than others. On failure, <ret_S> is returned <NULL>
644 * and state of inputs is unchanged.
645 *
646 * <eslEMEM> on allocation failure.
647 */
648 int
esl_dst_XPairIdMx(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int N,ESL_DMATRIX ** ret_S)649 esl_dst_XPairIdMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S)
650 {
651 int status;
652 ESL_DMATRIX *S = NULL;
653 int i,j;
654
655 if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }
656
657 for (i = 0; i < N; i++)
658 {
659 S->mx[i][i] = 1.;
660 for (j = i+1; j < N; j++)
661 {
662 status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL);
663 if (status != eslOK)
664 ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
665 S->mx[j][i] = S->mx[i][j];
666 }
667 }
668 if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
669 return eslOK;
670
671 ERROR:
672 if (S != NULL) esl_dmatrix_Destroy(S);
673 if (ret_S != NULL) *ret_S = NULL;
674 return status;
675 }
676
677
678 /* Function: esl_dst_XDiffMx()
679 * Synopsis: NxN difference matrix for N aligned digital seqs.
680 * Incept: SRE, Fri Apr 28 06:37:29 2006 [New York]
681 *
682 * Purpose: Same as <esl_dst_XPairIdMx()>, but calculates fractional
683 * difference <1-s> instead of fractional identity <s> for
684 * each pair.
685 *
686 * Args: abc - digital alphabet in use
687 * ax - aligned dsq's, [0..N-1][1..alen]
688 * N - number of aligned sequences
689 * ret_D - RETURN: NxN matrix of fractional differences
690 *
691 * Returns: <eslOK> on success, and <ret_D> contains the difference
692 * matrix; caller is obligated to free <D> with
693 * <esl_dmatrix_Destroy()>.
694 *
695 * Throws: <eslEINVAL> if a seq has a different
696 * length than others. On failure, <ret_D> is returned <NULL>
697 * and state of inputs is unchanged.
698 */
699 int
esl_dst_XDiffMx(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int N,ESL_DMATRIX ** ret_D)700 esl_dst_XDiffMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_D)
701 {
702 int status;
703 ESL_DMATRIX *D = NULL;
704 int i,j;
705
706 status = esl_dst_XPairIdMx(abc, ax, N, &D);
707 if (status != eslOK) goto ERROR;
708
709 for (i = 0; i < N; i++)
710 {
711 D->mx[i][i] = 0.;
712 for (j = i+1; j < N; j++)
713 {
714 D->mx[i][j] = 1. - D->mx[i][j];
715 D->mx[j][i] = D->mx[i][j];
716 }
717 }
718 if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D);
719 return eslOK;
720
721 ERROR:
722 if (D != NULL) esl_dmatrix_Destroy(D);
723 if (ret_D != NULL) *ret_D = NULL;
724 return status;
725 }
726
727 /* Function: esl_dst_XJukesCantorMx()
728 * Synopsis: NxN Jukes/Cantor distance matrix for N aligned digital seqs.
729 * Incept: SRE, Thu Apr 27 08:38:08 2006 [New York City]
730 *
731 * Purpose: Given a digitized multiple sequence alignment <ax>,
732 * consisting of <nseq> aligned digital sequences in
733 * bioalphabet <abc>, calculate a symmetric Jukes/Cantor
734 * pairwise distance matrix for all sequence pairs;
735 * optionally return the distance matrix in <ret_D> and
736 * a matrix of the large-sample variances for those ML distance
737 * estimates in <ret_V>.
738 *
739 * Infinite distances (and variances) are possible. They
740 * are represented as <HUGE_VAL> in <D> and <V>. Caller must
741 * be prepared to deal with them as appropriate.
742 *
743 * Args: abc - bioalphabet for <aseq>
744 * ax - aligned digital sequences [0.nseq-1][1..L]
745 * nseq - number of aseqs
746 * opt_D - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
747 * opt_V - optRETURN: matrix of variances.
748 *
749 * Returns: <eslOK> on success. <D> (and optionally <V>) contain the
750 * distance matrix (and variances). Caller frees these with
751 * <esl_dmatrix_Destroy()>.
752 *
753 * Throws: <eslEINVAL> if any pair of sequences have differing lengths
754 * (and thus cannot have been properly aligned).
755 * <eslEDIVZERO> if some pair of sequences had no aligned
756 * residues. On failure, <D> and <V> are both returned <NULL>
757 * and state of inputs is unchanged.
758 *
759 * <eslEMEM> on allocation failure.
760 */
761 int
esl_dst_XJukesCantorMx(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int nseq,ESL_DMATRIX ** opt_D,ESL_DMATRIX ** opt_V)762 esl_dst_XJukesCantorMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int nseq,
763 ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
764 {
765 ESL_DMATRIX *D = NULL;
766 ESL_DMATRIX *V = NULL;
767 int status;
768 int i,j;
769
770 if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
771 if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
772
773 for (i = 0; i < nseq; i++)
774 {
775 D->mx[i][i] = 0.;
776 V->mx[i][i] = 0.;
777 for (j = i+1; j < nseq; j++)
778 {
779 status = esl_dst_XJukesCantor(abc, ax[i], ax[j],
780 &(D->mx[i][j]), &(V->mx[i][j]));
781 if (status != eslOK)
782 ESL_XEXCEPTION(status, "J/C calculation failed at digital aseqs %d,%d", i,j);
783
784 D->mx[j][i] = D->mx[i][j];
785 V->mx[j][i] = V->mx[i][j];
786 }
787 }
788 if (opt_D != NULL) *opt_D = D; else esl_dmatrix_Destroy(D);
789 if (opt_V != NULL) *opt_V = V; else esl_dmatrix_Destroy(V);
790 return eslOK;
791
792 ERROR:
793 if (D != NULL) esl_dmatrix_Destroy(D);
794 if (V != NULL) esl_dmatrix_Destroy(V);
795 if (opt_D != NULL) *opt_D = NULL;
796 if (opt_V != NULL) *opt_V = NULL;
797 return status;
798 }
799 /*------- end, distance matrices for digital alignments ---------*/
800
801
802
803 /*****************************************************************
804 * 5. Average pairwise identity for multiple alignments
805 *****************************************************************/
806
807 /* Function: esl_dst_CAverageId()
808 * Synopsis: Calculate avg identity for multiple alignment
809 * Incept: SRE, Fri May 18 15:02:38 2007 [Janelia]
810 *
811 * Purpose: Calculates the average pairwise fractional identity in
812 * a multiple sequence alignment <as>, consisting of <N>
813 * aligned character sequences of identical length.
814 *
815 * If an exhaustive calculation would require more than
816 * <max_comparisons> pairwise comparisons, then instead of
817 * looking at all pairs, calculate the average over a
818 * stochastic sample of <max_comparisons> random pairs.
819 * This allows the routine to work efficiently even on very
820 * deep MSAs.
821 *
822 * Each fractional pairwise identity (range $[0..$ pid $..1]$
823 * is calculated using <esl_dst_CPairId()>.
824 *
825 * Returns: <eslOK> on success, and <*ret_id> contains the average
826 * fractional identity.
827 *
828 * Throws: <eslEMEM> on allocation failure.
829 * <eslEINVAL> if any of the aligned sequence pairs aren't
830 * of the same length.
831 * In either case, <*ret_id> is set to 0.
832 */
833 int
esl_dst_CAverageId(char ** as,int N,int max_comparisons,double * ret_id)834 esl_dst_CAverageId(char **as, int N, int max_comparisons, double *ret_id)
835 {
836 int status;
837 double id;
838 double sum = 0.;
839 int i,j,n;
840
841 if (N <= 1) { *ret_id = 1.; return eslOK; }
842 *ret_id = 0.;
843
844 /* Is nseq small enough that we can average over all pairwise comparisons? */
845 if ((N * (N-1) / 2) <= max_comparisons)
846 {
847 for (i = 0; i < N; i++)
848 for (j = i+1; j < N; j++)
849 {
850 if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
851 sum += id;
852 }
853 sum /= (double) (N * (N-1) / 2);
854 }
855
856 /* If nseq is large, calculate average over a stochastic sample. */
857 else
858 {
859 ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */
860 for (n = 0; n < max_comparisons; n++)
861 {
862 do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
863 if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
864 sum += id;
865 }
866 sum /= (double) max_comparisons;
867 esl_randomness_Destroy(r);
868 }
869
870 *ret_id = sum;
871 return eslOK;
872 }
873
874 /* Function: esl_dst_CAverageMatch()
875 * Synopsis: Calculate avg matches for multiple alignment
876 * Incept: ER, Wed Oct 29 09:25:09 EDT 2014 [Janelia]
877 *
878 * Purpose: Calculates the average pairwise fractional matches in
879 * a multiple sequence alignment <as>, consisting of <N>
880 * aligned character sequences of identical length.
881 *
882 * If an exhaustive calculation would require more than
883 * <max_comparisons> pairwise comparisons, then instead of
884 * looking at all pairs, calculate the average over a
885 * stochastic sample of <max_comparisons> random pairs.
886 * This allows the routine to work efficiently even on very
887 * deep MSAs.
888 *
889 * Each fractional pairwise matches (range $[0..$ pid $..1]$
890 * is calculated using <esl_dst_CPairMatch()>.
891 *
892 * Returns: <eslOK> on success, and <*ret_match> contains the average
893 * fractional matches.
894 *
895 * Throws: <eslEMEM> on allocation failure.
896 * <eslEINVAL> if any of the aligned sequence pairs aren't
897 * of the same length.
898 * In either case, <*ret_match> is set to 0.
899 */
900 int
esl_dst_CAverageMatch(char ** as,int N,int max_comparisons,double * ret_match)901 esl_dst_CAverageMatch(char **as, int N, int max_comparisons, double *ret_match)
902 {
903 int status;
904 double match;
905 double sum = 0.;
906 int i,j,n;
907
908 if (N <= 1) { *ret_match = 1.; return eslOK; }
909 *ret_match = 0.;
910
911 /* Is nseq small enough that we can average over all pairwise comparisons? */
912 if ((N * (N-1) / 2) <= max_comparisons)
913 {
914 for (i = 0; i < N; i++)
915 for (j = i+1; j < N; j++)
916 {
917 if ((status = esl_dst_CPairMatch(as[i], as[j], &match, NULL, NULL)) != eslOK) return status;
918 sum += match;
919 }
920 sum /= (double) (N * (N-1) / 2);
921 }
922
923 /* If nseq is large, calculate average over a stochastic sample. */
924 else
925 {
926 ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */
927 for (n = 0; n < max_comparisons; n++)
928 {
929 do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
930 if ((status = esl_dst_CPairMatch(as[i], as[j], &match, NULL, NULL)) != eslOK) return status;
931 sum += match;
932 }
933 sum /= (double) max_comparisons;
934 esl_randomness_Destroy(r);
935 }
936
937 *ret_match = sum;
938 return eslOK;
939 }
940
941 /* Function: esl_dst_XAverageId()
942 * Synopsis: Calculate avg identity for digital MSA
943 * Incept: SRE, Fri May 18 15:19:14 2007 [Janelia]
944 *
945 * Purpose: Calculates the average pairwise fractional identity in
946 * a digital multiple sequence alignment <ax>, consisting of <N>
947 * aligned digital sequences of identical length.
948 *
949 * If an exhaustive calculation would require more than
950 * <max_comparisons> pairwise comparisons, then instead of
951 * looking at all pairs, calculate the average over a
952 * stochastic sample of <max_comparisons> random pairs.
953 * This allows the routine to work efficiently even on very
954 * deep MSAs.
955 *
956 * Each fractional pairwise identity (range $[0..$ pid $..1]$
957 * is calculated using <esl_dst_XPairId()>.
958 *
959 * Returns: <eslOK> on success, and <*ret_id> contains the average
960 * fractional identity.
961 *
962 * Throws: <eslEMEM> on allocation failure.
963 * <eslEINVAL> if any of the aligned sequence pairs aren't
964 * of the same length.
965 * In either case, <*ret_id> is set to 0.
966 */
967 int
esl_dst_XAverageId(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int N,int max_comparisons,double * ret_id)968 esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_id)
969 {
970 int status;
971 double id;
972 double sum = 0.;
973 int i,j,n;
974
975 if (N <= 1) { *ret_id = 1.; return eslOK; }
976 *ret_id = 0.;
977
978 /* Is N small enough that we can average over all pairwise comparisons?
979 watch out for numerical overflow in this: Pfam N's easily overflow when squared
980 */
981 if (N <= max_comparisons &&
982 N <= sqrt(2. * max_comparisons) &&
983 (N * (N-1) / 2) <= max_comparisons)
984 {
985 for (i = 0; i < N; i++)
986 for (j = i+1; j < N; j++)
987 {
988 if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
989 sum += id;
990 }
991 sum /= (double) (N * (N-1) / 2);
992 }
993
994 /* If nseq is large, calculate average over a stochastic sample. */
995 else
996 {
997 ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */
998 for (n = 0; n < max_comparisons; n++)
999 {
1000 do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
1001 if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
1002 sum += id;
1003 }
1004 sum /= (double) max_comparisons;
1005 esl_randomness_Destroy(r);
1006 }
1007
1008 *ret_id = sum;
1009 return eslOK;
1010 }
1011
1012 /* Function: esl_dst_XAverageMatch()
1013 * Synopsis: Calculate avg matches for digital MSA
1014 * Incept: ER, ed Oct 29 09:29:05 EDT 2014 [Janelia]
1015 *
1016 * Purpose: Calculates the average pairwise fractional matches in
1017 * a digital multiple sequence alignment <ax>, consisting of <N>
1018 * aligned digital sequences of identical length.
1019 *
1020 * If an exhaustive calculation would require more than
1021 * <max_comparisons> pairwise comparisons, then instead of
1022 * looking at all pairs, calculate the average over a
1023 * stochastic sample of <max_comparisons> random pairs.
1024 * This allows the routine to work efficiently even on very
1025 * deep MSAs.
1026 *
1027 * Each fractional pairwise matches (range $[0..$ pid $..1]$
1028 * is calculated using <esl_dst_XPairMatch()>.
1029 *
1030 * Returns: <eslOK> on success, and <*ret_match> contains the average
1031 * fractional identity.
1032 *
1033 * Throws: <eslEMEM> on allocation failure.
1034 * <eslEINVAL> if any of the aligned sequence pairs aren't
1035 * of the same length.
1036 * In either case, <*ret_match> is set to 0.
1037 */
1038 int
esl_dst_XAverageMatch(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int N,int max_comparisons,double * ret_match)1039 esl_dst_XAverageMatch(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_match)
1040 {
1041 int status;
1042 double match;
1043 double sum = 0.;
1044 int i,j,n;
1045
1046 if (N <= 1) { *ret_match = 1.; return eslOK; }
1047 *ret_match = 0.;
1048
1049 /* Is N small enough that we can average over all pairwise comparisons?
1050 watch out for numerical overflow in this: Pfam N's easily overflow when squared
1051 */
1052 if (N <= max_comparisons &&
1053 N <= sqrt(2. * max_comparisons) &&
1054 (N * (N-1) / 2) <= max_comparisons)
1055 {
1056 for (i = 0; i < N; i++)
1057 for (j = i+1; j < N; j++)
1058 {
1059 if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &match, NULL, NULL)) != eslOK) return status;
1060 sum += match;
1061 }
1062 sum /= (double) (N * (N-1) / 2);
1063 }
1064
1065 /* If nseq is large, calculate average over a stochastic sample. */
1066 else
1067 {
1068 ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */
1069 for (n = 0; n < max_comparisons; n++)
1070 {
1071 do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
1072 if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &match, NULL, NULL)) != eslOK) return status;
1073 sum += match;
1074 }
1075 sum /= (double) max_comparisons;
1076 esl_randomness_Destroy(r);
1077 }
1078
1079 *ret_match = sum;
1080 return eslOK;
1081 }
1082
1083
1084
1085
1086
1087
1088 /*****************************************************************
1089 * 6. Private (static) functions
1090 *****************************************************************/
1091
1092 /* jukescantor()
1093 *
1094 * The generalized Jukes/Cantor distance calculation.
1095 * Given <n1> identities and <n2> differences, for a
1096 * base alphabet size of <alphabet_size> (4 or 20);
1097 * calculate J/C distance in substitutions/site and
1098 * return it in <ret_distance>; calculate large-sample
1099 * variance and return it in <ret_variance>.
1100 *
1101 * Returns <eslEDIVZERO> if there are no data (<n1+n2=0>).
1102 */
1103 static int
jukescantor(int n1,int n2,int alphabet_size,double * opt_distance,double * opt_variance)1104 jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance)
1105 {
1106 int status;
1107 double D, K, N;
1108 double x;
1109 double distance, variance;
1110
1111 ESL_DASSERT1( (n1 >= 0) );
1112 ESL_DASSERT1( (n2 >= 0) );
1113 ESL_DASSERT1( (alphabet_size >= 0) );
1114
1115 if (n1+n2 == 0) { status = eslEDIVZERO; goto ERROR; }
1116
1117 K = (double) alphabet_size;
1118 D = (double) n2 / (double) (n1+n2);
1119 N = (double) (n1+n2);
1120
1121 x = 1. - D * K/(K-1.);
1122 if (x <= 0.)
1123 {
1124 distance = HUGE_VAL;
1125 variance = HUGE_VAL;
1126 }
1127 else
1128 {
1129 distance = -log(x) * K/(K-1);
1130 variance = exp( 2.*K*distance/(K-1) ) * D * (1.-D) / N;
1131 }
1132 if (opt_distance != NULL) *opt_distance = distance;
1133 if (opt_variance != NULL) *opt_variance = variance;
1134 return eslOK;
1135
1136 ERROR:
1137 if (opt_distance != NULL) *opt_distance = HUGE_VAL;
1138 if (opt_variance != NULL) *opt_variance = HUGE_VAL;
1139 return status;
1140 }
1141 /*--------------- end of private functions ----------------------*/
1142
1143
1144 /*****************************************************************
1145 * 7. Unit tests.
1146 *****************************************************************/
1147 #ifdef eslDISTANCE_TESTDRIVE
1148
1149 /* Each unit test is given an alignment with certain known
1150 * properties:
1151 * seqs 0,1 are identical
1152 * seqs 0,2 are completely different
1153 * seqs 3..N are random
1154 * The alignment may contain gaps, so don't assume that the
1155 * # of compared residues == alignment length. The alignment
1156 * contains only canonical residues, because one of our tests
1157 * is that C and X functions give the same results.
1158 */
1159 static int
utest_CPairId(char ** as,int N)1160 utest_CPairId(char **as, int N)
1161 {
1162 double pid;
1163 int nid;
1164 int nres;
1165 int L;
1166 int i,j;
1167
1168 /* Self comparison gives identity = 1. */
1169 L = strlen(as[0]);
1170 if (esl_dst_CPairId(as[0], as[0], &pid, &nid, &nres) != eslOK) abort();
1171 if (pid != 1.0 || nid != L || nres > L) abort();
1172
1173 /* So does 0,1 comparison */
1174 if (esl_dst_CPairId(as[0], as[1], &pid, &nid, &nres) != eslOK) abort();
1175 if (pid != 1.0 || nid != L || nres > L) abort();
1176
1177 /* 0,2 comparison gives 0.0, 0 */
1178 if (esl_dst_CPairId(as[0], as[2], &pid, &nid, &nres) != eslOK) abort();
1179 if (pid != 0.0 || nid != 0 || nres > L) abort();
1180
1181 /* remaining comparisons shouldn't fail */
1182 for (i = 3; i < N; i++)
1183 for (j = i; j < N; j++)
1184 {
1185 if (esl_dst_CPairId(as[i], as[j], &pid, &nid, &nres) != eslOK) abort();
1186 if (pid < 0. || pid > 1. || nid < 0 || nid > L || nres > L) abort();
1187 }
1188
1189 /* API should accept NULL for return values */
1190 if (esl_dst_CPairId(as[0], as[0], NULL, NULL, NULL) != eslOK) abort();
1191 return eslOK;
1192 }
1193
1194 static int
utest_CJukesCantor(int K,char ** as,int N)1195 utest_CJukesCantor(int K, char **as, int N)
1196 {
1197 double d, V;
1198 int i,j;
1199
1200 /* Self comparison gives distance = 0. */
1201 if (esl_dst_CJukesCantor(K, as[0], as[0], &d, &V) != eslOK) abort();
1202 if (d != 0.0) abort();
1203
1204 /* So does 0,1 comparison */
1205 if (esl_dst_CJukesCantor(K, as[0], as[1], &d, &V) != eslOK) abort();
1206 if (d != 0.0) abort();
1207
1208 /* 0,2 comparison gives infinite distance (HUGE_VAL) */
1209 if (esl_dst_CJukesCantor(K, as[0], as[2], &d, &V) != eslOK) abort();
1210 if (d != HUGE_VAL) abort();
1211
1212 /* remaining comparisons shouldn't fail */
1213 for (i = 3; i < N; i++)
1214 for (j = i; j < N; j++)
1215 if (esl_dst_CJukesCantor(K, as[i], as[j], &d, &V) != eslOK) abort();
1216
1217 /* API should accept NULL for return values */
1218 if (esl_dst_CJukesCantor(K, as[0], as[0], NULL, NULL) != eslOK) abort();
1219 return eslOK;
1220 }
1221
1222 static int
utest_XPairId(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1223 utest_XPairId(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1224 {
1225 double pid, pid2;
1226 int nid, nid2;
1227 int nres, nres2;
1228 int dL, L;
1229 int i,j;
1230
1231 /* Self comparison gives identity = 1. */
1232 dL = esl_abc_dsqlen(ax[0]);
1233 L = strlen(as[0]);
1234 if (dL != L) abort();
1235 if (esl_dst_XPairId(abc, ax[0], ax[0], &pid, &nid, &nres) != eslOK) abort();
1236 if (pid != 1.0 || nid != L || nres > dL) abort();
1237
1238 /* So does 0,1 comparison */
1239 if (esl_dst_XPairId(abc, ax[0], ax[1], &pid, &nid, &nres) != eslOK) abort();
1240 if (pid != 1.0 || nid != L || nres > L) abort();
1241
1242 /* 0,2 comparison gives 0.0, 0 */
1243 if (esl_dst_XPairId(abc, ax[0], ax[2], &pid, &nid, &nres) != eslOK) abort();
1244 if (pid != 0.0 || nid != 0 || nres > L) abort();
1245
1246 /* remaining comparisons shouldn't fail, and should be identical to text mode */
1247 for (i = 3; i < N; i++)
1248 for (j = i; j < N; j++)
1249 {
1250 if (esl_dst_XPairId(abc, ax[i], ax[j], &pid, &nid, &nres) != eslOK) abort();
1251 if (esl_dst_CPairId(as[i], as[j], &pid2, &nid2, &nres2) != eslOK) abort();
1252 if (pid < 0. || pid > 1. || nid < 0 || nid > L || nres > L) abort();
1253 if (pid != pid2 || nid != nid2 || nres != nres2) abort();
1254 }
1255
1256 /* API should accept NULL for return values */
1257 if (esl_dst_XPairId(abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) abort();
1258 return eslOK;
1259
1260 }
1261
1262 static int
utest_XJukesCantor(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1263 utest_XJukesCantor(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1264 {
1265 double d, V;
1266 int i,j;
1267
1268 /* Self comparison gives distance = 0. */
1269 if (esl_dst_XJukesCantor(abc, ax[0], ax[0], &d, &V) != eslOK) abort();
1270 if (d != 0.0) abort();
1271
1272 /* So does 0,1 comparison */
1273 if (esl_dst_XJukesCantor(abc, ax[0], ax[1], &d, &V) != eslOK) abort();
1274 if (d != 0.0) abort();
1275
1276 /* 0,2 comparison gives infinite distance (HUGE_VAL) */
1277 if (esl_dst_XJukesCantor(abc, ax[0], ax[2], &d, &V) != eslOK) abort();
1278 if (d != HUGE_VAL) abort();
1279
1280 /* remaining comparisons shouldn't fail */
1281 for (i = 3; i < N; i++)
1282 for (j = i; j < N; j++)
1283 if (esl_dst_XJukesCantor(abc, ax[i], ax[j], &d, &V) != eslOK) abort();
1284
1285 /* API should accept NULL for return values */
1286 if (esl_dst_XJukesCantor(abc, ax[0], ax[0], NULL, NULL) != eslOK) abort();
1287 return eslOK;
1288 }
1289
1290 static int
utest_CPairIdMx(char ** as,int N)1291 utest_CPairIdMx(char **as, int N)
1292 {
1293 ESL_DMATRIX *S;
1294 int i,j;
1295 double pid;
1296
1297 if (esl_dst_CPairIdMx(as, N, &S) != eslOK) abort();
1298
1299 for (i = 0; i < N; i++)
1300 if (S->mx[i][i] != 1.0) abort();
1301
1302 pid = 0.;
1303 for (i = 3; i < N; i++)
1304 for (j = i+1; j < N; j++)
1305 pid += S->mx[i][j];
1306 pid /= (double) ((N-3) * (N-4) / 2); /* first 3 don't count */
1307 if (pid < 0.15 || pid > 0.35) abort(); /* should be 0.25 */
1308
1309 esl_dmatrix_Destroy(S);
1310 return eslOK;
1311 }
1312
1313 static int
utest_CDiffMx(char ** as,int N)1314 utest_CDiffMx(char **as, int N)
1315 {
1316 ESL_DMATRIX *D;
1317 int i,j;
1318 double diff;
1319
1320 if (esl_dst_CDiffMx(as, N, &D) != eslOK) abort();
1321
1322 for (i = 0; i < N; i++)
1323 if (D->mx[i][i] != 0.0) abort();
1324
1325 diff = 0.;
1326 for (i = 3; i < N; i++)
1327 for (j = i+1; j < N; j++)
1328 diff += D->mx[i][j];
1329 diff /= (double) ((N-3) * (N-4) / 2); /* first 3 don't count */
1330 if (diff < 0.65 || diff > 0.85) abort(); /* should be 0.75 */
1331
1332 esl_dmatrix_Destroy(D);
1333 return eslOK;
1334 }
1335
1336 static int
utest_CJukesCantorMx(int K,char ** as,int N)1337 utest_CJukesCantorMx(int K, char **as, int N)
1338 {
1339 ESL_DMATRIX *D, *V;
1340 /* just a crash test */
1341 if (esl_dst_CJukesCantorMx(K, as, N, &D, &V) != eslOK) abort();
1342 esl_dmatrix_Destroy(D);
1343 esl_dmatrix_Destroy(V);
1344 return eslOK;
1345 }
1346
1347 static int
utest_XPairIdMx(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1348 utest_XPairIdMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1349 {
1350 ESL_DMATRIX *S, *S2;
1351 int i, j;
1352
1353 if (esl_dst_XPairIdMx(abc, ax, N, &S) != eslOK) abort();
1354 if (esl_dst_CPairIdMx(as, N, &S2) != eslOK) abort();
1355
1356 for (i = 0; i < N; i++)
1357 for (j = i; j < N; j++)
1358 if (fabs(S->mx[i][j] - S2->mx[j][i]) > 0.01) abort();
1359
1360 esl_dmatrix_Destroy(S);
1361 esl_dmatrix_Destroy(S2);
1362 return eslOK;
1363 }
1364
1365 static int
utest_XDiffMx(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1366 utest_XDiffMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1367 {
1368 ESL_DMATRIX *D, *D2;
1369 int i, j;
1370
1371 if (esl_dst_XDiffMx(abc, ax, N, &D) != eslOK) abort();
1372 if (esl_dst_CDiffMx(as, N, &D2) != eslOK) abort();
1373
1374 for (i = 0; i < N; i++)
1375 for (j = i; j < N; j++)
1376 if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort();
1377
1378 esl_dmatrix_Destroy(D);
1379 esl_dmatrix_Destroy(D2);
1380 return eslOK;
1381 }
1382
1383 static int
utest_XJukesCantorMx(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1384 utest_XJukesCantorMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1385 {
1386 ESL_DMATRIX *D, *D2, *V, *V2;
1387 int i, j;
1388
1389 if (esl_dst_XJukesCantorMx(abc, ax, N, &D, &V) != eslOK) abort();
1390 if (esl_dst_CJukesCantorMx(abc->K, as, N, &D2, &V2) != eslOK) abort();
1391
1392 for (i = 0; i < N; i++)
1393 for (j = i; j < N; j++)
1394 {
1395 if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort();
1396 if (fabs(V->mx[i][j] - V2->mx[j][i]) > 0.01) abort();
1397 }
1398
1399 esl_dmatrix_Destroy(D);
1400 esl_dmatrix_Destroy(D2);
1401 esl_dmatrix_Destroy(V);
1402 esl_dmatrix_Destroy(V2);
1403 return eslOK;
1404 }
1405 #endif /* eslDISTANCE_TESTDRIVE */
1406 /*------------------ end of unit tests --------------------------*/
1407
1408
1409
1410 /*****************************************************************
1411 * 8. Test driver.
1412 *****************************************************************/
1413
1414 #ifdef eslDISTANCE_TESTDRIVE
1415 #include "easel.h"
1416 #include "esl_alphabet.h"
1417 #include "esl_arr2.h"
1418 #include "esl_dmatrix.h"
1419 #include "esl_getopts.h"
1420 #include "esl_random.h"
1421 #include "esl_randomseq.h"
1422
1423 #include "esl_distance.h"
1424
1425 static ESL_OPTIONS options[] = {
1426 /* name type def env range toggles reqs incomp help docgroup*/
1427 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
1428 { "-N", eslARG_INT, "10", NULL,"n>3", NULL, NULL, NULL, "number of iid seqs in alignment",0},
1429 { "-L", eslARG_INT, "50", NULL,"n>0", NULL, NULL, NULL, "length of seqs in alignment", 0},
1430 { "--seed", eslARG_INT, "42", NULL,"n>=0",NULL, NULL, NULL, "random # seed", 0},
1431 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1432 };
1433 static char usage[] = "Usage: ./testdrive-distance [-options]";
1434
1435 int
main(int argc,char ** argv)1436 main(int argc, char **argv)
1437 {
1438 ESL_GETOPTS *go = NULL;
1439 ESL_RANDOMNESS *r = NULL;
1440 char **as = NULL; /* aligned character seqs (random, iid) */
1441 int N,L; /* # of seqs, and their aligned lengths */
1442 int seed;
1443 int i,j;
1444 int status;
1445 double p[4]; /// ACGT probabilities
1446 ESL_DSQ **ax = NULL; // digitized alignment
1447 ESL_ALPHABET *abc = NULL;
1448
1449 /* Process command line
1450 */
1451 go = esl_getopts_Create(options);
1452 esl_opt_ProcessCmdline(go, argc, argv);
1453 esl_opt_VerifyConfig(go);
1454 if (esl_opt_GetBoolean(go, "-h") == TRUE) {
1455 puts(usage);
1456 puts("\n where options are:");
1457 esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
1458 return 0;
1459 }
1460 L = esl_opt_GetInteger(go, "-L");
1461 N = esl_opt_GetInteger(go, "-N");
1462 seed = esl_opt_GetInteger(go, "--seed");
1463 if (esl_opt_ArgNumber(go) != 0) {
1464 puts("Incorrect number of command line arguments.");
1465 puts(usage);
1466 return 1;
1467 }
1468 esl_getopts_Destroy(go);
1469
1470 /* Create a random DNA alignment;
1471 * force it to obey the conventions of the unit tests:
1472 * 0,1 are identical
1473 * 0,2 are completely dissimilar
1474 */
1475 r = esl_randomness_Create(seed);
1476 for (i = 0; i < 4; i++) p[i] = 0.25;
1477 ESL_ALLOC(as, sizeof(char *) * N);
1478 for (i = 0; i < N; i++)
1479 ESL_ALLOC(as[i], sizeof(char) * (L+1));
1480 esl_rsq_IID(r, "ACGT", p, 4, L, as[0]);
1481 strcpy(as[1], as[0]);
1482 esl_rsq_IID(r, "ACGT", p, 4, L, as[2]);
1483 for (j = 0; j < L; j++)
1484 while (as[2][j] == as[0][j])
1485 as[2][j] = "ACGT"[esl_rnd_Roll(r, 4)];
1486 for (i = 3; i < N; i++)
1487 esl_rsq_IID(r, "ACGT", p, 4, L, as[i]);
1488
1489 abc = esl_alphabet_Create(eslDNA);
1490 ESL_ALLOC(ax, sizeof(ESL_DSQ *) * N);
1491 for (i = 0; i < N; i++)
1492 esl_abc_CreateDsq(abc, as[i], &(ax[i]));
1493
1494 /* Unit tests
1495 */
1496 if (utest_CPairId(as, N) != eslOK) return eslFAIL;
1497 if (utest_CJukesCantor(4, as, N) != eslOK) return eslFAIL;
1498 if (utest_XPairId(abc, as, ax, N) != eslOK) return eslFAIL;
1499 if (utest_XJukesCantor(abc, as, ax, N) != eslOK) return eslFAIL;
1500 if (utest_CPairIdMx(as, N) != eslOK) return eslFAIL;
1501 if (utest_CDiffMx(as, N) != eslOK) return eslFAIL;
1502 if (utest_CJukesCantorMx(4, as, N) != eslOK) return eslFAIL;
1503 if (utest_XPairIdMx(abc, as, ax, N) != eslOK) return eslFAIL;
1504 if (utest_XDiffMx(abc, as, ax, N) != eslOK) return eslFAIL;
1505 if (utest_XJukesCantorMx(abc, as, ax, N) != eslOK) return eslFAIL;
1506
1507
1508 esl_randomness_Destroy(r);
1509 esl_alphabet_Destroy(abc);
1510 esl_arr2_Destroy((void **) as, N);
1511 esl_arr2_Destroy((void **) ax, N);
1512 return eslOK;
1513
1514 ERROR:
1515 return eslFAIL;
1516 }
1517 #endif /*eslDISTANCE_TESTDRIVE*/
1518
1519
1520
1521
1522 /*****************************************************************
1523 * 9. Example.
1524 *****************************************************************/
1525
1526 #ifdef eslDISTANCE_EXAMPLE
1527 /*::cexcerpt::distance_example::begin::*/
1528 /* gcc -g -Wall -o example -I. -DeslDISTANCE_EXAMPLE esl_distance.c\
1529 esl_dmatrix.c esl_msa.c easel.c -lm
1530 ./example <msa file>
1531 */
1532 #include "easel.h"
1533 #include "esl_distance.h"
1534 #include "esl_dmatrix.h"
1535 #include "esl_msa.h"
1536
main(int argc,char ** argv)1537 int main(int argc, char **argv)
1538 {
1539 ESL_MSAFILE *afp;
1540 ESL_MSA *msa;
1541 ESL_DMATRIX *P;
1542 int i,j;
1543 double min, avg, max;
1544 int status;
1545
1546 if ((status = esl_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
1547 esl_msafile_OpenFailure(afp, status);
1548 if ((status = esl_msafile_Read(afp, &msa)) != eslOK)
1549 esl_msafile_ReadFailure(afp, status);
1550
1551 esl_dst_CPairIdMx(msa->aseq, msa->nseq, &P);
1552
1553 min = 1.0;
1554 max = 0.0;
1555 avg = 0.0;
1556 for (i = 0; i < msa->nseq; i++)
1557 for (j = i+1; j < msa->nseq; j++)
1558 {
1559 avg += P->mx[i][j];
1560 if (P->mx[i][j] < min) min = P->mx[i][j];
1561 if (P->mx[i][j] > max) max = P->mx[i][j];
1562 }
1563 avg /= (double) (msa->nseq * (msa->nseq-1) / 2);
1564
1565 printf("Average pairwise %% id: %.1f%%\n", avg * 100.);
1566 printf("Minimum pairwise %% id: %.1f%%\n", min * 100.);
1567 printf("Maximum pairwise %% id: %.1f%%\n", max * 100.);
1568
1569 esl_dmatrix_Destroy(P);
1570 esl_msa_Destroy(msa);
1571 esl_msafile_Close(afp);
1572 return 0;
1573 }
1574 /*::cexcerpt::distance_example::end::*/
1575 #endif /*eslDISTANCE_EXAMPLE*/
1576
1577
1578
1579