1 /* Pairwise identities, distances, and distance matrices.
2  *
3  * Contents:
4  *    1. Pairwise distances for aligned text sequences.
5  *    2. Pairwise distances for aligned digital seqs.
6  *    3. Distance matrices for aligned text sequences.
7  *    4. Distance matrices for aligned digital sequences.
8  *    5. Average pairwise identity for multiple alignments.
9  *    6. Private (static) functions.
10  *    7. Unit tests.
11  *    8. Test driver.
12  *    9. Example.
13  */
14 #include "esl_config.h"
15 
16 #include <ctype.h>
17 #include <string.h>
18 #include <math.h>
19 
20 #include "easel.h"
21 #include "esl_alphabet.h"
22 #include "esl_dmatrix.h"
23 #include "esl_random.h"
24 
25 #include "esl_distance.h"
26 
27 /* Forward declaration of our static functions.
28  */
29 static int jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance);
30 
31 
32 /*****************************************************************
33  * 1. Pairwise distances for aligned text sequences.
34  *****************************************************************/
35 
36 /* Function:  esl_dst_CPairId()
37  * Synopsis:  Pairwise identity of two aligned text strings.
38  * Incept:    SRE, Mon Apr 17 20:06:07 2006 [St. Louis]
39  *
40  * Purpose:   Calculates pairwise fractional identity between two
41  *            aligned character strings <asq1> and <asq2>.
42  *            Return this distance in <opt_pid>; return the
43  *            number of identities counted in <opt_nid>; and
44  *            return the denominator <MIN(len1,len2)> in
45  *            <opt_n>.
46  *
47  *            Alphabetic symbols <[a-zA-Z]> are compared
48  *            case-insensitively for identity. Any nonalphabetic
49  *            character is assumed to be a gap symbol.
50  *
51  *            This simple comparison rule is unaware of synonyms and
52  *            degeneracies in biological alphabets.  For a more
53  *            sophisticated and biosequence-aware comparison, use
54  *            digitized sequences and the <esl_dst_XPairId()> function
55  *            instead. Note that currently <esl_dst_XPairId()> does
56  *            not correctly handle degeneracies, but is set up to.
57  *
58  * Args:      asq1         - aligned character string 1
59  *            asq2         - aligned character string 2
60  *            opt_pid      - optRETURN: pairwise identity, 0<=x<=1
61  *            opt_nid      - optRETURN: # of identities
62  *            opt_n        - optRETURN: denominator MIN(len1,len2)
63  *
64  * Returns:   <eslOK> on success. <opt_pid>, <opt_nid>, <opt_n>
65  *            contain the answers (for whichever were passed non-NULL).
66  *
67  * Throws:    <eslEINVAL> if the strings are different lengths
68  *            (not aligned).
69  */
70 int
esl_dst_CPairId(const char * asq1,const char * asq2,double * opt_pid,int * opt_nid,int * opt_n)71 esl_dst_CPairId(const char *asq1, const char *asq2,
72 		double *opt_pid, int *opt_nid, int *opt_n)
73 {
74   int     status;
75   int     idents;               /* total identical positions  */
76   int     len1, len2;           /* lengths of seqs            */
77   int     i;                    /* position in aligned seqs   */
78 
79   idents = len1 = len2 = 0;
80   for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++)
81     {
82       if (isalpha(asq1[i])) len1++;
83       if (isalpha(asq2[i])) len2++;
84       if (isalpha(asq1[i]) && isalpha(asq2[i])
85 	  && toupper(asq1[i]) == toupper(asq2[i]))
86 	idents++;
87     }
88   if (asq1[i] != '\0' || asq2[i] != '\0')
89     ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
90 
91   if (opt_pid  != NULL)  *opt_pid = ( len1==0 ? 0. : (double) idents / (double) ESL_MIN(len1,len2));
92   if (opt_nid  != NULL)  *opt_nid = idents;
93   if (opt_n    != NULL)  *opt_n   = len1;
94   return eslOK;
95 
96  ERROR:
97   if (opt_pid  != NULL)  *opt_pid = 0.;
98   if (opt_nid  != NULL)  *opt_nid = 0;
99   if (opt_n    != NULL)  *opt_n   = 0;
100   return status;
101 }
102 
103 /* Function:  esl_dst_CPairMatch()
104  * Synopsis:  Pairwise matches of two aligned text strings.
105  * Incept:    ER, Wed Oct 29 09:02:35 EDT 2014 [janelia]
106  *
107  * Purpose:   Calculates pairwise fractional matches between two
108  *            aligned character strings <asq1> and <asq2>.
109  *            Return this distance in <opt_pmatch>; return the
110  *            number of matches counted in <opt_nmatch>; and
111  *            return the denominator <alen - double_gaps> in
112  *            <opt_n>.
113  *
114  *            Alphabetic symbols <[a-zA-Z]> are compared
115  *            case-insensitively for identity. Any nonalphabetic
116  *            character is assumed to be a gap symbol.
117  *
118  *            This simple comparison rule is unaware of synonyms and
119  *            degeneracies in biological alphabets.  For a more
120  *            sophisticated and biosequence-aware comparison, use
121  *            digitized sequences and the <esl_dst_XPairmatch()> function
122  *            instead. Note that currently <esl_dst_XPairMatch()> does
123  *            not correctly handle degeneracies, but is set up to.
124  *
125  * Args:      asq1         - aligned character string 1
126  *            asq2         - aligned character string 2
127  *            opt_pmatch   - optRETURN: pairwise matches, 0<=x<=1
128  *            opt_nmatch   - optRETURN: # of matches
129  *            opt_n        - optRETURN: denominator alen - double_gaps
130  *
131  * Returns:   <eslOK> on success. <opt_pmatch>, <opt_nmatch>, <opt_n>
132  *            contain the answers (for whichever were passed non-NULL).
133  *
134  * Throws:    <eslEINVAL> if the strings are different lengths
135  *            (not aligned).
136  */
137 int
esl_dst_CPairMatch(const char * asq1,const char * asq2,double * opt_pmatch,int * opt_nmatch,int * opt_n)138 esl_dst_CPairMatch(const char *asq1, const char *asq2,
139 		   double *opt_pmatch, int *opt_nmatch, int *opt_n)
140 {
141   int     status;
142   int     match;                /* total matched positions              */
143   int     len;                  /* length of alignment (no double gaps) */
144   int     i;                    /* position in aligned seqs             */
145 
146   match = len = 0;
147   for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++)
148     {
149       if (isalpha(asq1[i]) || isalpha(asq2[i])) len++;
150       if (isalpha(asq1[i]) && isalpha(asq2[i])) match++;
151     }
152   if (asq1[i] != '\0' || asq2[i] != '\0')
153     ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
154 
155   if (opt_pmatch != NULL)  *opt_pmatch = ( len==0 ? 0. : (double)match / (double)len);
156   if (opt_nmatch != NULL)  *opt_nmatch = match;
157   if (opt_n      != NULL)  *opt_n      = len;
158   return eslOK;
159 
160  ERROR:
161   if (opt_pmatch != NULL)  *opt_pmatch = 0.;
162   if (opt_nmatch != NULL)  *opt_nmatch = 0;
163   if (opt_n      != NULL)  *opt_n      = 0;
164   return status;
165 }
166 
167 /* Function:  esl_dst_CJukesCantor()
168  * Synopsis:  Jukes-Cantor distance for two aligned strings.
169  * Incept:    SRE, Tue Apr 18 14:00:37 2006 [St. Louis]
170  *
171  * Purpose:   Calculate the generalized Jukes-Cantor distance between
172  *            two aligned character strings <as1> and <as2>, in
173  *            substitutions/site, for an alphabet of <K> residues
174  *            (<K=4> for nucleic acid, <K=20> for proteins). The
175  *            maximum likelihood estimate for the distance is
176  *            optionally returned in <opt_distance>. The large-sample
177  *            variance for the distance estimate is
178  *            optionally returned in <opt_variance>.
179  *
180  *            Alphabetic symbols <[a-zA-Z]> are compared
181  *            case-insensitively to count the number of identities
182  *            (<n1>) and mismatches (<n2>>). Any nonalphabetic
183  *            character is assumed to be a gap symbol, and aligned
184  *            columns containing gap symbols are ignored.  The
185  *            fractional difference <D> used to calculate the
186  *            Jukes/Cantor distance is <n2/n1+n2>.
187  *
188  * Args:      K            - size of the alphabet (4 or 20)
189  *            as1          - 1st aligned seq, 0..L-1, \0-terminated
190  *            as2          - 2nd aligned seq, 0..L-1, \0-terminated
191  *            opt_distance - optRETURN: ML estimate of distance d
192  *            opt_variance - optRETURN: large-sample variance of d
193  *
194  * Returns:   <eslOK> on success.
195  *
196  *            Infinite distances are possible, in which case distance
197  *            and variance are both <HUGE_VAL>. Caller has to deal
198  *            with this case as it sees fit, perhaps by enforcing
199  *            an arbitrary maximum distance.
200  *
201  * Throws:    <eslEINVAL> if the two strings aren't the same length (and
202  *            thus can't have been properly aligned).
203  *            <eslEDIVZERO> if no aligned residues were counted.
204  *            On either failure, distance and variance are both returned
205  *            as <HUGE_VAL>.
206  */
207 int
esl_dst_CJukesCantor(int K,const char * as1,const char * as2,double * opt_distance,double * opt_variance)208 esl_dst_CJukesCantor(int K, const char *as1, const char *as2,
209 		     double *opt_distance, double *opt_variance)
210 {
211   int     status;
212   int     n1, n2;               /* number of observed identities, substitutions */
213   int     i;                    /* position in aligned seqs   */
214 
215   /* 1. Count identities, mismatches.
216    */
217   n1 = n2 = 0;
218   for (i = 0; as1[i] != '\0' && as2[i] != '\0'; i++)
219     {
220       if (isalpha(as1[i]) && isalpha(as2[i]))
221 	{
222 	  if (toupper(as1[i]) == toupper(as2[i])) n1++; else n2++;
223 	}
224     }
225   if (as1[i] != '\0' || as2[i] != '\0')
226     ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
227 
228   return jukescantor(n1, n2, K, opt_distance, opt_variance); /* can throw eslEDIVZERO */
229 
230  ERROR:
231   if (opt_distance != NULL)  *opt_distance = HUGE_VAL;
232   if (opt_variance != NULL)  *opt_variance = HUGE_VAL;
233   return status;
234 }
235 
236 /*------- end, pairwise distances for aligned text seqs ---------*/
237 
238 
239 
240 
241 
242 /*****************************************************************
243  * 2. Pairwise distances for aligned digitized sequences.
244  *****************************************************************/
245 
246 /* Function:  esl_dst_XPairId()
247  * Synopsis:  Pairwise identity of two aligned digital seqs.
248  * Incept:    SRE, Tue Apr 18 09:24:05 2006 [St. Louis]
249  *
250  * Purpose:   Digital version of <esl_dst_CPairId()>: <adsq1> and
251  *            <adsq2> are digitized aligned sequences, in alphabet
252  *            <abc>. Otherwise, same as <esl_dst_CPairId()> except
253  *            that only canonical residues are counted and checked for
254  *            identity, while <esl_dst_CPairId()> (which has no
255  *            alphabet) counts and checks identity of all alphanumeric
256  *            characters.
257  *
258  *            This function does not use <esl_abc_Match()> to handle
259  *            degeneracies but it is set up to do so. Doing that would
260  *            require that <opt_nid> be changed to a float or double,
261  *            or its meaning be changed to be the number of canonical
262  *            identities.
263  *
264  * Args:      abc          - digital alphabet in use
265  *            ax1          - aligned digital seq 1
266  *            ax2          - aligned digital seq 2
267  *            opt_pid      - optRETURN: pairwise identity, 0<=x<=1
268  *            opt_nid      - optRETURN: # of identities
269  *            opt_n        - optRETURN: denominator MIN(len1,len2)
270  *
271  * Returns:   <eslOK> on success. <opt_distance>, <opt_nid>, <opt_n>
272  *            contain the answers, for any of these that were passed
273  *            non-<NULL> pointers.
274  *
275  * Throws:    <eslEINVAL> if the strings are different lengths (not aligned).
276  */
277 int
esl_dst_XPairId(const ESL_ALPHABET * abc,const ESL_DSQ * ax1,const ESL_DSQ * ax2,double * opt_distance,int * opt_nid,int * opt_n)278 esl_dst_XPairId(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2,
279 		double *opt_distance, int *opt_nid, int *opt_n)
280 {
281   int     status;
282   int     idents;               /* total identical positions  */
283   int     len1, len2;           /* lengths of seqs            */
284   int     i;                    /* position in aligned seqs   */
285 
286   idents = len1 = len2 = 0;
287   for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++)
288     {
289       if (esl_abc_XIsCanonical(abc, ax1[i])) len1++;
290       if (esl_abc_XIsCanonical(abc, ax2[i])) len2++;
291 
292       if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i])
293 	  && ax1[i] == ax2[i])
294 	idents++;
295     }
296   if (len2 < len1) len1 = len2;
297 
298   if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL)
299     ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
300 
301   if (opt_distance != NULL)  *opt_distance = ( len1==0 ? 0. : (double) idents / (double) len1 );
302   if (opt_nid      != NULL)  *opt_nid      = idents;
303   if (opt_n        != NULL)  *opt_n        = len1;
304   return eslOK;
305 
306  ERROR:
307   if (opt_distance != NULL)  *opt_distance = 0.;
308   if (opt_nid      != NULL)  *opt_nid      = 0;
309   if (opt_n        != NULL)  *opt_n        = 0;
310   return status;
311 }
312 
313 /* Function:  esl_dst_XPairMatch()
314  * Synopsis:  Pairwise matches of two aligned digital seqs.
315  * Incept:    ER, Wed Oct 29 09:09:07 EDT 2014 [janelia]
316  *
317  * Purpose:   Digital version of <esl_dst_CPairMatch()>: <adsq1> and
318  *            <adsq2> are digitized aligned sequences, in alphabet
319  *            <abc>. Otherwise, same as <esl_dst_CPairId()> except
320  *            that only canonical residues are counted and checked for
321  *            identity, while <esl_dst_CPairId()> (which has no
322  *            alphabet) counts and checks identity of all alphanumeric
323  *            characters.
324  *
325  * Args:      abc          - digital alphabet in use
326  *            ax1          - aligned digital seq 1
327  *            ax2          - aligned digital seq 2
328  *            opt_pmatch   - optRETURN: pairwise matches, 0<=x<=1
329  *            opt_nmatch   - optRETURN: # of maches
330  *            opt_n        - optRETURN: denominator alen-double_gaps
331  *
332  * Returns:   <eslOK> on success. <opt_distance>, <opt_nmatch>, <opt_n>
333  *            contain the answers, for any of these that were passed
334  *            non-<NULL> pointers.
335  *
336  * Throws:    <eslEINVAL> if the strings are different lengths (not aligned).
337  */
338 int
esl_dst_XPairMatch(const ESL_ALPHABET * abc,const ESL_DSQ * ax1,const ESL_DSQ * ax2,double * opt_distance,int * opt_nmatch,int * opt_n)339 esl_dst_XPairMatch(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2,
340 		   double *opt_distance, int *opt_nmatch, int *opt_n)
341 {
342   int     status;
343   int     match;                /* total matched positions              */
344   int     len;                  /* length of alignment (no double gaps) */
345   int     i;                    /* position in aligned seqs             */
346 
347   match = len = 0;
348   for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++)
349     {
350       if (esl_abc_XIsCanonical(abc, ax1[i]) || esl_abc_XIsCanonical(abc, ax2[i])) len ++;
351       if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i])) match++;
352     }
353 
354   if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL)
355     ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
356 
357   if (opt_distance != NULL)  *opt_distance = ( len==0 ? 0. : (double)match / (double)len );
358   if (opt_nmatch   != NULL)  *opt_nmatch   = match;
359   if (opt_n        != NULL)  *opt_n        = len;
360   return eslOK;
361 
362  ERROR:
363   if (opt_distance != NULL)  *opt_distance = 0.;
364   if (opt_nmatch   != NULL)  *opt_nmatch   = 0;
365   if (opt_n        != NULL)  *opt_n        = 0;
366   return status;
367 }
368 
369 
370 /* Function:  esl_dst_XJukesCantor()
371  * Synopsis:  Jukes-Cantor distance for two aligned digitized seqs.
372  * Incept:    SRE, Tue Apr 18 15:26:51 2006 [St. Louis]
373  *
374  * Purpose:   Calculate the generalized Jukes-Cantor distance between two
375  *            aligned digital strings <ax> and <ay>, in substitutions/site,
376  *            using alphabet <abc> to evaluate identities and differences.
377  *            The maximum likelihood estimate for the distance is optionally returned in
378  *            <opt_distance>. The large-sample variance for the distance
379  *            estimate is optionally returned in <opt_variance>.
380  *
381  *            Identical to <esl_dst_CJukesCantor()>, except that it takes
382  *            digital sequences instead of character strings.
383  *
384  * Args:      abc          - bioalphabet to use for comparisons
385  *            ax           - 1st digital aligned seq
386  *            ay           - 2nd digital aligned seq
387  *            opt_distance - optRETURN: ML estimate of distance d
388  *            opt_variance - optRETURN: large-sample variance of d
389  *
390  * Returns:   <eslOK> on success. As in <esl_dst_CJukesCantor()>, the
391  *            distance and variance may be infinite, in which case they
392  *            are returned as <HUGE_VAL>.
393  *
394  * Throws:    <eslEINVAL> if the two strings aren't the same length (and
395  *            thus can't have been properly aligned).
396  *            <eslEDIVZERO> if no aligned residues were counted.
397  *            On either failure, the distance and variance are set
398  *            to <HUGE_VAL>.
399  */
400 int
esl_dst_XJukesCantor(const ESL_ALPHABET * abc,const ESL_DSQ * ax,const ESL_DSQ * ay,double * opt_distance,double * opt_variance)401 esl_dst_XJukesCantor(const ESL_ALPHABET *abc, const ESL_DSQ *ax, const ESL_DSQ *ay,
402 		     double *opt_distance, double *opt_variance)
403 {
404   int     status;
405   int     n1, n2;               /* number of observed identities, substitutions */
406   int     i;                    /* position in aligned seqs   */
407 
408   n1 = n2 = 0;
409   for (i = 1; ax[i] != eslDSQ_SENTINEL && ay[i] != eslDSQ_SENTINEL; i++)
410     {
411       if (esl_abc_XIsCanonical(abc, ax[i]) && esl_abc_XIsCanonical(abc, ay[i]))
412 	{
413 	  if (ax[i] == ay[i]) n1++;
414 	  else                n2++;
415 	}
416     }
417   if (ax[i] != eslDSQ_SENTINEL || ay[i] != eslDSQ_SENTINEL)
418     ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
419 
420   return jukescantor(n1, n2, abc->K, opt_distance, opt_variance);
421 
422  ERROR:
423   if (opt_distance != NULL)  *opt_distance = HUGE_VAL;
424   if (opt_variance != NULL)  *opt_variance = HUGE_VAL;
425   return status;
426 }
427 
428 
429 /*---------- end pairwise distances, digital seqs --------------*/
430 
431 
432 
433 
434 /*****************************************************************
435  * 3. Distance matrices for aligned text sequences.
436  *****************************************************************/
437 
438 /* Function:  esl_dst_CPairIdMx()
439  * Synopsis:  NxN identity matrix for N aligned text sequences.
440  * Incept:    SRE, Thu Apr 27 08:46:08 2006 [New York]
441  *
442  * Purpose:   Given a multiple sequence alignment <as>, consisting
443  *            of <N> aligned character strings; calculate
444  *            a symmetric fractional pairwise identity matrix by $N(N-1)/2$
445  *            calls to <esl_dst_CPairId()>, and return it in
446  *            <ret_D>.
447  *
448  * Args:      as      - aligned seqs (all same length), [0..N-1]
449  *            N       - # of aligned sequences
450  *            ret_S   - RETURN: symmetric fractional identity matrix
451  *
452  * Returns:   <eslOK> on success, and <ret_S> contains the fractional
453  *            identity matrix. Caller free's <S> with
454  *            <esl_dmatrix_Destroy()>.
455  *
456  * Throws:    <eslEINVAL> if a seq has a different
457  *            length than others. On failure, <ret_D> is returned <NULL>
458  *            and state of inputs is unchanged.
459  *
460  *            <eslEMEM> on allocation failure.
461  */
462 int
esl_dst_CPairIdMx(char ** as,int N,ESL_DMATRIX ** ret_S)463 esl_dst_CPairIdMx(char **as, int N, ESL_DMATRIX **ret_S)
464 {
465   ESL_DMATRIX *S = NULL;
466   int status;
467   int i,j;
468 
469   if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }
470 
471   for (i = 0; i < N; i++)
472     {
473       S->mx[i][i] = 1.;
474       for (j = i+1; j < N; j++)
475 	{
476 	  status = esl_dst_CPairId(as[i], as[j], &(S->mx[i][j]), NULL, NULL);
477 	  if (status != eslOK)
478 	    ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
479 	  S->mx[j][i] =  S->mx[i][j];
480 	}
481     }
482   if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
483   return eslOK;
484 
485  ERROR:
486   if (S     != NULL)  esl_dmatrix_Destroy(S);
487   if (ret_S != NULL) *ret_S = NULL;
488   return status;
489 }
490 
491 
492 /* Function:  esl_dst_CDiffMx()
493  * Synopsis:  NxN difference matrix for N aligned text sequences.
494  * Incept:    SRE, Fri Apr 28 06:27:20 2006 [New York]
495  *
496  * Purpose:   Same as <esl_dst_CPairIdMx()>, but calculates
497  *            the fractional difference <d=1-s> instead of the
498  *            fractional identity <s> for each pair.
499  *
500  * Args:      as      - aligned seqs (all same length), [0..N-1]
501  *            N       - # of aligned sequences
502  *            ret_D   - RETURN: symmetric fractional difference matrix
503  *
504  * Returns:   <eslOK> on success, and <ret_D> contains the
505  *            fractional difference matrix. Caller free's <D> with
506  *            <esl_dmatrix_Destroy()>.
507  *
508  * Throws:    <eslEINVAL> if any seq has a different
509  *            length than others. On failure, <ret_D> is returned <NULL>
510  *            and state of inputs is unchanged.
511  */
512 int
esl_dst_CDiffMx(char ** as,int N,ESL_DMATRIX ** ret_D)513 esl_dst_CDiffMx(char **as, int N, ESL_DMATRIX **ret_D)
514 {
515   ESL_DMATRIX *D = NULL;
516   int status;
517   int i,j;
518 
519   status = esl_dst_CPairIdMx(as, N, &D);
520   if (status != eslOK) goto ERROR;
521 
522   for (i = 0; i < N; i++)
523     {
524       D->mx[i][i] = 0.;
525       for (j = i+1; j < N; j++)
526 	{
527 	  D->mx[i][j] = 1. - D->mx[i][j];
528 	  D->mx[j][i] = D->mx[i][j];
529 	}
530     }
531 
532   if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D);
533   return eslOK;
534 
535  ERROR:
536   if (D     != NULL)  esl_dmatrix_Destroy(D);
537   if (ret_D != NULL) *ret_D = NULL;
538   return status;
539 
540 }
541 
542 /* Function:  esl_dst_CJukesCantorMx()
543  * Synopsis:  NxN Jukes/Cantor distance matrix for N aligned text seqs.
544  * Incept:    SRE, Tue Apr 18 16:00:16 2006 [St. Louis]
545  *
546  * Purpose:   Given a multiple sequence alignment <aseq>, consisting of
547  *            <nseq> aligned character sequences in an alphabet of
548  *            <K> letters (usually 4 for DNA, 20 for protein);
549  *            calculate a symmetric Jukes/Cantor pairwise distance
550  *            matrix for all sequence pairs, and optionally return the distance
551  *            matrix in <ret_D>, and optionally return a symmetric matrix of the
552  *            large-sample variances for those ML distance estimates
553  *            in <ret_V>.
554  *
555  *            Infinite distances (and variances) are possible; they
556  *            are represented as <HUGE_VAL> in <D> and <V>. Caller must
557  *            be prepared to deal with them as appropriate.
558  *
559  * Args:      K      - size of the alphabet (usually 4 or 20)
560  *            aseq   - aligned sequences [0.nseq-1][0..L-1]
561  *            nseq   - number of aseqs
562  *            opt_D  - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
563  *            opt_V  - optRETURN: matrix of variances.
564  *
565  * Returns:   <eslOK> on success. <D> and <V> contain the
566  *            distance matrix (and variances); caller frees these with
567  *            <esl_dmatrix_Destroy()>.
568  *
569  * Throws:    <eslEINVAL> if any pair of sequences have differing lengths
570  *            (and thus cannot have been properly aligned).
571  *            <eslEDIVZERO> if some pair of sequences had no aligned
572  *            residues. On failure, <D> and <V> are both returned <NULL>
573  *            and state of inputs is unchanged.
574  *
575  *            <eslEMEM> on allocation failure.
576  */
577 int
esl_dst_CJukesCantorMx(int K,char ** aseq,int nseq,ESL_DMATRIX ** opt_D,ESL_DMATRIX ** opt_V)578 esl_dst_CJukesCantorMx(int K, char **aseq, int nseq,
579 		       ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
580 {
581   int          status;
582   ESL_DMATRIX *D = NULL;
583   ESL_DMATRIX *V = NULL;
584   int          i,j;
585 
586   if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
587   if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
588 
589   for (i = 0; i < nseq; i++)
590     {
591       D->mx[i][i] = 0.;
592       V->mx[i][i] = 0.;
593       for (j = i+1; j < nseq; j++)
594 	{
595 	  status = esl_dst_CJukesCantor(K, aseq[i], aseq[j],
596 					&(D->mx[i][j]), &(V->mx[i][j]));
597 	  if (status != eslOK)
598 	    ESL_XEXCEPTION(status, "J/C calculation failed at seqs %d,%d", i,j);
599 
600 	  D->mx[j][i] = D->mx[i][j];
601 	  V->mx[j][i] = V->mx[i][j];
602 	}
603     }
604   if (opt_D != NULL) *opt_D = D;  else esl_dmatrix_Destroy(D);
605   if (opt_V != NULL) *opt_V = V;  else esl_dmatrix_Destroy(V);
606   return eslOK;
607 
608  ERROR:
609   if (D     != NULL) esl_dmatrix_Destroy(D);
610   if (V     != NULL) esl_dmatrix_Destroy(V);
611   if (opt_D != NULL) *opt_D = NULL;
612   if (opt_V != NULL) *opt_V = NULL;
613   return status;
614 }
615 /*----------- end, distance matrices for aligned text seqs ---------*/
616 
617 
618 
619 
620 /*****************************************************************
621  * 4. Distance matrices for aligned digital sequences.
622  *****************************************************************/
623 
624 /* Function:  esl_dst_XPairIdMx()
625  * Synopsis:  NxN identity matrix for N aligned digital seqs.
626  * Incept:    SRE, Thu Apr 27 09:08:11 2006 [New York]
627  *
628  * Purpose:   Given a digitized multiple sequence alignment <ax>, consisting
629  *            of <N> aligned digital sequences in alphabet <abc>; calculate
630  *            a symmetric pairwise fractional identity matrix by $N(N-1)/2$
631  *            calls to <esl_dst_XPairId()>, and return it in <ret_S>.
632  *
633  * Args:      abc   - digital alphabet in use
634  *            ax    - aligned dsq's, [0..N-1][1..alen]
635  *            N     - number of aligned sequences
636  *            ret_S - RETURN: NxN matrix of fractional identities
637  *
638  * Returns:   <eslOK> on success, and <ret_S> contains the distance
639  *            matrix. Caller is obligated to free <S> with
640  *            <esl_dmatrix_Destroy()>.
641  *
642  * Throws:    <eslEINVAL> if a seq has a different
643  *            length than others. On failure, <ret_S> is returned <NULL>
644  *            and state of inputs is unchanged.
645  *
646  *            <eslEMEM> on allocation failure.
647  */
648 int
esl_dst_XPairIdMx(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int N,ESL_DMATRIX ** ret_S)649 esl_dst_XPairIdMx(const ESL_ALPHABET *abc,  ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S)
650 {
651   int status;
652   ESL_DMATRIX *S = NULL;
653   int i,j;
654 
655   if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }
656 
657   for (i = 0; i < N; i++)
658     {
659       S->mx[i][i] = 1.;
660       for (j = i+1; j < N; j++)
661 	{
662 	  status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL);
663 	  if (status != eslOK)
664 	    ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
665 	  S->mx[j][i] =  S->mx[i][j];
666 	}
667     }
668   if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
669   return eslOK;
670 
671  ERROR:
672   if (S     != NULL)  esl_dmatrix_Destroy(S);
673   if (ret_S != NULL) *ret_S = NULL;
674   return status;
675 }
676 
677 
678 /* Function:  esl_dst_XDiffMx()
679  * Synopsis:  NxN difference matrix for N aligned digital seqs.
680  * Incept:    SRE, Fri Apr 28 06:37:29 2006 [New York]
681  *
682  * Purpose:   Same as <esl_dst_XPairIdMx()>, but calculates fractional
683  *            difference <1-s> instead of fractional identity <s> for
684  *            each pair.
685  *
686  * Args:      abc   - digital alphabet in use
687  *            ax    - aligned dsq's, [0..N-1][1..alen]
688  *            N     - number of aligned sequences
689  *            ret_D - RETURN: NxN matrix of fractional differences
690  *
691  * Returns:   <eslOK> on success, and <ret_D> contains the difference
692  *            matrix; caller is obligated to free <D> with
693  *            <esl_dmatrix_Destroy()>.
694  *
695  * Throws:    <eslEINVAL> if a seq has a different
696  *            length than others. On failure, <ret_D> is returned <NULL>
697  *            and state of inputs is unchanged.
698  */
699 int
esl_dst_XDiffMx(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int N,ESL_DMATRIX ** ret_D)700 esl_dst_XDiffMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_D)
701 {
702   int status;
703   ESL_DMATRIX *D = NULL;
704   int i,j;
705 
706   status = esl_dst_XPairIdMx(abc, ax, N, &D);
707   if (status != eslOK) goto ERROR;
708 
709   for (i = 0; i < N; i++)
710     {
711       D->mx[i][i] = 0.;
712       for (j = i+1; j < N; j++)
713 	{
714 	  D->mx[i][j] = 1. - D->mx[i][j];
715 	  D->mx[j][i] = D->mx[i][j];
716 	}
717     }
718   if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D);
719   return eslOK;
720 
721  ERROR:
722   if (D     != NULL)  esl_dmatrix_Destroy(D);
723   if (ret_D != NULL) *ret_D = NULL;
724   return status;
725 }
726 
727 /* Function:  esl_dst_XJukesCantorMx()
728  * Synopsis:  NxN Jukes/Cantor distance matrix for N aligned digital seqs.
729  * Incept:    SRE, Thu Apr 27 08:38:08 2006 [New York City]
730  *
731  * Purpose:   Given a digitized multiple sequence alignment <ax>,
732  *            consisting of <nseq> aligned digital sequences in
733  *            bioalphabet <abc>, calculate a symmetric Jukes/Cantor
734  *            pairwise distance matrix for all sequence pairs;
735  *            optionally return the distance matrix in <ret_D> and
736  *            a matrix of the large-sample variances for those ML distance
737  *            estimates in <ret_V>.
738  *
739  *            Infinite distances (and variances) are possible. They
740  *            are represented as <HUGE_VAL> in <D> and <V>. Caller must
741  *            be prepared to deal with them as appropriate.
742  *
743  * Args:      abc    - bioalphabet for <aseq>
744  *            ax     - aligned digital sequences [0.nseq-1][1..L]
745  *            nseq   - number of aseqs
746  *            opt_D  - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
747  *            opt_V  - optRETURN: matrix of variances.
748  *
749  * Returns:   <eslOK> on success. <D> (and optionally <V>) contain the
750  *            distance matrix (and variances). Caller frees these with
751  *            <esl_dmatrix_Destroy()>.
752  *
753  * Throws:    <eslEINVAL> if any pair of sequences have differing lengths
754  *            (and thus cannot have been properly aligned).
755  *            <eslEDIVZERO> if some pair of sequences had no aligned
756  *            residues. On failure, <D> and <V> are both returned <NULL>
757  *            and state of inputs is unchanged.
758  *
759  *            <eslEMEM> on allocation failure.
760  */
761 int
esl_dst_XJukesCantorMx(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int nseq,ESL_DMATRIX ** opt_D,ESL_DMATRIX ** opt_V)762 esl_dst_XJukesCantorMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int nseq,
763 		       ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
764 {
765   ESL_DMATRIX *D = NULL;
766   ESL_DMATRIX *V = NULL;
767   int          status;
768   int          i,j;
769 
770   if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
771   if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
772 
773   for (i = 0; i < nseq; i++)
774     {
775       D->mx[i][i] = 0.;
776       V->mx[i][i] = 0.;
777       for (j = i+1; j < nseq; j++)
778 	{
779 	  status = esl_dst_XJukesCantor(abc, ax[i], ax[j],
780 					&(D->mx[i][j]), &(V->mx[i][j]));
781 	  if (status != eslOK)
782 	    ESL_XEXCEPTION(status, "J/C calculation failed at digital aseqs %d,%d", i,j);
783 
784 	  D->mx[j][i] = D->mx[i][j];
785 	  V->mx[j][i] = V->mx[i][j];
786 	}
787     }
788   if (opt_D != NULL) *opt_D = D;  else esl_dmatrix_Destroy(D);
789   if (opt_V != NULL) *opt_V = V;  else esl_dmatrix_Destroy(V);
790   return eslOK;
791 
792  ERROR:
793   if (D     != NULL) esl_dmatrix_Destroy(D);
794   if (V     != NULL) esl_dmatrix_Destroy(V);
795   if (opt_D != NULL) *opt_D = NULL;
796   if (opt_V != NULL) *opt_V = NULL;
797   return status;
798 }
799 /*------- end, distance matrices for digital alignments ---------*/
800 
801 
802 
803 /*****************************************************************
804  * 5. Average pairwise identity for multiple alignments
805  *****************************************************************/
806 
807 /* Function:  esl_dst_CAverageId()
808  * Synopsis:  Calculate avg identity for multiple alignment
809  * Incept:    SRE, Fri May 18 15:02:38 2007 [Janelia]
810  *
811  * Purpose:   Calculates the average pairwise fractional identity in
812  *            a multiple sequence alignment <as>, consisting of <N>
813  *            aligned character sequences of identical length.
814  *
815  *            If an exhaustive calculation would require more than
816  *            <max_comparisons> pairwise comparisons, then instead of
817  *            looking at all pairs, calculate the average over a
818  *            stochastic sample of <max_comparisons> random pairs.
819  *            This allows the routine to work efficiently even on very
820  *            deep MSAs.
821  *
822  *            Each fractional pairwise identity (range $[0..$ pid $..1]$
823  *            is calculated using <esl_dst_CPairId()>.
824  *
825  * Returns:   <eslOK> on success, and <*ret_id> contains the average
826  *            fractional identity.
827  *
828  * Throws:    <eslEMEM> on allocation failure.
829  *            <eslEINVAL> if any of the aligned sequence pairs aren't
830  *            of the same length.
831  *            In either case, <*ret_id> is set to 0.
832  */
833 int
esl_dst_CAverageId(char ** as,int N,int max_comparisons,double * ret_id)834 esl_dst_CAverageId(char **as, int N, int max_comparisons, double *ret_id)
835 {
836   int    status;
837   double id;
838   double sum = 0.;
839   int    i,j,n;
840 
841   if (N <= 1) { *ret_id = 1.; return eslOK; }
842   *ret_id = 0.;
843 
844   /* Is nseq small enough that we can average over all pairwise comparisons? */
845   if ((N * (N-1) / 2) <= max_comparisons)
846     {
847       for (i = 0; i < N; i++)
848 	for (j = i+1; j < N; j++)
849 	  {
850 	    if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
851 	    sum += id;
852 	  }
853       sum /= (double) (N * (N-1) / 2);
854     }
855 
856   /* If nseq is large, calculate average over a stochastic sample. */
857   else
858     {
859       ESL_RANDOMNESS *r = esl_randomness_Create(42);  /* fixed seed, suppress stochastic variation */
860       for (n = 0; n < max_comparisons; n++)
861 	{
862 	  do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
863 	  if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
864 	  sum += id;
865 	}
866       sum /= (double) max_comparisons;
867       esl_randomness_Destroy(r);
868     }
869 
870   *ret_id = sum;
871   return eslOK;
872 }
873 
874 /* Function:  esl_dst_CAverageMatch()
875  * Synopsis:  Calculate avg matches for multiple alignment
876  * Incept:    ER, Wed Oct 29 09:25:09 EDT 2014 [Janelia]
877  *
878  * Purpose:   Calculates the average pairwise fractional matches in
879  *            a multiple sequence alignment <as>, consisting of <N>
880  *            aligned character sequences of identical length.
881  *
882  *            If an exhaustive calculation would require more than
883  *            <max_comparisons> pairwise comparisons, then instead of
884  *            looking at all pairs, calculate the average over a
885  *            stochastic sample of <max_comparisons> random pairs.
886  *            This allows the routine to work efficiently even on very
887  *            deep MSAs.
888  *
889  *            Each fractional pairwise matches (range $[0..$ pid $..1]$
890  *            is calculated using <esl_dst_CPairMatch()>.
891  *
892  * Returns:   <eslOK> on success, and <*ret_match> contains the average
893  *            fractional matches.
894  *
895  * Throws:    <eslEMEM> on allocation failure.
896  *            <eslEINVAL> if any of the aligned sequence pairs aren't
897  *            of the same length.
898  *            In either case, <*ret_match> is set to 0.
899  */
900 int
esl_dst_CAverageMatch(char ** as,int N,int max_comparisons,double * ret_match)901 esl_dst_CAverageMatch(char **as, int N, int max_comparisons, double *ret_match)
902 {
903   int    status;
904   double match;
905   double sum = 0.;
906   int    i,j,n;
907 
908  if (N <= 1) { *ret_match = 1.; return eslOK; }
909   *ret_match = 0.;
910 
911   /* Is nseq small enough that we can average over all pairwise comparisons? */
912   if ((N * (N-1) / 2) <= max_comparisons)
913     {
914       for (i = 0; i < N; i++)
915 	for (j = i+1; j < N; j++)
916 	  {
917 	    if ((status = esl_dst_CPairMatch(as[i], as[j], &match, NULL, NULL)) != eslOK) return status;
918 	    sum += match;
919 	  }
920       sum /= (double) (N * (N-1) / 2);
921     }
922 
923   /* If nseq is large, calculate average over a stochastic sample. */
924   else
925     {
926       ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */
927       for (n = 0; n < max_comparisons; n++)
928 	{
929 	  do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
930 	  if ((status = esl_dst_CPairMatch(as[i], as[j], &match, NULL, NULL)) != eslOK) return status;
931 	  sum += match;
932 	}
933       sum /= (double) max_comparisons;
934       esl_randomness_Destroy(r);
935     }
936 
937   *ret_match = sum;
938   return eslOK;
939 }
940 
941 /* Function:  esl_dst_XAverageId()
942  * Synopsis:  Calculate avg identity for digital MSA
943  * Incept:    SRE, Fri May 18 15:19:14 2007 [Janelia]
944  *
945  * Purpose:   Calculates the average pairwise fractional identity in
946  *            a digital multiple sequence alignment <ax>, consisting of <N>
947  *            aligned digital sequences of identical length.
948  *
949  *            If an exhaustive calculation would require more than
950  *            <max_comparisons> pairwise comparisons, then instead of
951  *            looking at all pairs, calculate the average over a
952  *            stochastic sample of <max_comparisons> random pairs.
953  *            This allows the routine to work efficiently even on very
954  *            deep MSAs.
955  *
956  *            Each fractional pairwise identity (range $[0..$ pid $..1]$
957  *            is calculated using <esl_dst_XPairId()>.
958  *
959  * Returns:   <eslOK> on success, and <*ret_id> contains the average
960  *            fractional identity.
961  *
962  * Throws:    <eslEMEM> on allocation failure.
963  *            <eslEINVAL> if any of the aligned sequence pairs aren't
964  *            of the same length.
965  *            In either case, <*ret_id> is set to 0.
966  */
967 int
esl_dst_XAverageId(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int N,int max_comparisons,double * ret_id)968 esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_id)
969 {
970   int    status;
971   double id;
972   double sum = 0.;
973   int    i,j,n;
974 
975   if (N <= 1) { *ret_id = 1.; return eslOK; }
976   *ret_id = 0.;
977 
978   /* Is N small enough that we can average over all pairwise comparisons?
979      watch out for numerical overflow in this: Pfam N's easily overflow when squared
980    */
981   if (N <= max_comparisons &&
982       N <= sqrt(2. * max_comparisons) &&
983       (N * (N-1) / 2) <= max_comparisons)
984     {
985       for (i = 0; i < N; i++)
986 	for (j = i+1; j < N; j++)
987 	  {
988 	    if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
989 	    sum += id;
990 	  }
991       sum /= (double) (N * (N-1) / 2);
992     }
993 
994   /* If nseq is large, calculate average over a stochastic sample. */
995   else
996     {
997       ESL_RANDOMNESS *r = esl_randomness_Create(42);  /* fixed seed, suppress stochastic variation */
998       for (n = 0; n < max_comparisons; n++)
999 	{
1000 	  do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
1001 	  if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
1002 	  sum += id;
1003 	}
1004       sum /= (double) max_comparisons;
1005       esl_randomness_Destroy(r);
1006     }
1007 
1008   *ret_id = sum;
1009   return eslOK;
1010 }
1011 
1012 /* Function:  esl_dst_XAverageMatch()
1013  * Synopsis:  Calculate avg matches for digital MSA
1014  * Incept:    ER, ed Oct 29 09:29:05 EDT 2014 [Janelia]
1015  *
1016  * Purpose:   Calculates the average pairwise fractional matches in
1017  *            a digital multiple sequence alignment <ax>, consisting of <N>
1018  *            aligned digital sequences of identical length.
1019  *
1020  *            If an exhaustive calculation would require more than
1021  *            <max_comparisons> pairwise comparisons, then instead of
1022  *            looking at all pairs, calculate the average over a
1023  *            stochastic sample of <max_comparisons> random pairs.
1024  *            This allows the routine to work efficiently even on very
1025  *            deep MSAs.
1026  *
1027  *            Each fractional pairwise matches (range $[0..$ pid $..1]$
1028  *            is calculated using <esl_dst_XPairMatch()>.
1029  *
1030  * Returns:   <eslOK> on success, and <*ret_match> contains the average
1031  *            fractional identity.
1032  *
1033  * Throws:    <eslEMEM> on allocation failure.
1034  *            <eslEINVAL> if any of the aligned sequence pairs aren't
1035  *            of the same length.
1036  *            In either case, <*ret_match> is set to 0.
1037  */
1038 int
esl_dst_XAverageMatch(const ESL_ALPHABET * abc,ESL_DSQ ** ax,int N,int max_comparisons,double * ret_match)1039 esl_dst_XAverageMatch(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_match)
1040 {
1041   int    status;
1042   double match;
1043   double sum = 0.;
1044   int    i,j,n;
1045 
1046   if (N <= 1) { *ret_match = 1.; return eslOK; }
1047   *ret_match = 0.;
1048 
1049   /* Is N small enough that we can average over all pairwise comparisons?
1050      watch out for numerical overflow in this: Pfam N's easily overflow when squared
1051    */
1052   if (N <= max_comparisons &&
1053       N <= sqrt(2. * max_comparisons) &&
1054       (N * (N-1) / 2) <= max_comparisons)
1055     {
1056       for (i = 0; i < N; i++)
1057 	for (j = i+1; j < N; j++)
1058 	  {
1059 	    if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &match, NULL, NULL)) != eslOK) return status;
1060 	    sum += match;
1061 	  }
1062       sum /= (double) (N * (N-1) / 2);
1063     }
1064 
1065   /* If nseq is large, calculate average over a stochastic sample. */
1066   else
1067     {
1068       ESL_RANDOMNESS *r = esl_randomness_Create(42);  /* fixed seed, suppress stochastic variation */
1069       for (n = 0; n < max_comparisons; n++)
1070 	{
1071 	  do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
1072 	  if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &match, NULL, NULL)) != eslOK) return status;
1073 	  sum += match;
1074 	}
1075       sum /= (double) max_comparisons;
1076       esl_randomness_Destroy(r);
1077     }
1078 
1079   *ret_match = sum;
1080   return eslOK;
1081 }
1082 
1083 
1084 
1085 
1086 
1087 
1088 /*****************************************************************
1089  * 6. Private (static) functions
1090  *****************************************************************/
1091 
1092 /* jukescantor()
1093  *
1094  * The generalized Jukes/Cantor distance calculation.
1095  * Given <n1> identities and <n2> differences, for a
1096  * base alphabet size of <alphabet_size> (4 or 20);
1097  * calculate J/C distance in substitutions/site and
1098  * return it in <ret_distance>; calculate large-sample
1099  * variance and return it in <ret_variance>.
1100  *
1101  * Returns <eslEDIVZERO> if there are no data (<n1+n2=0>).
1102  */
1103 static int
jukescantor(int n1,int n2,int alphabet_size,double * opt_distance,double * opt_variance)1104 jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance)
1105 {
1106   int    status;
1107   double D, K, N;
1108   double x;
1109   double distance, variance;
1110 
1111   ESL_DASSERT1( (n1 >= 0) );
1112   ESL_DASSERT1( (n2 >= 0) );
1113   ESL_DASSERT1( (alphabet_size >= 0) );
1114 
1115   if (n1+n2 == 0) { status = eslEDIVZERO; goto ERROR; }
1116 
1117   K = (double) alphabet_size;
1118   D = (double) n2 / (double) (n1+n2);
1119   N = (double) (n1+n2);
1120 
1121   x = 1. - D * K/(K-1.);
1122   if (x <= 0.)
1123     {
1124       distance = HUGE_VAL;
1125       variance = HUGE_VAL;
1126     }
1127   else
1128     {
1129       distance =   -log(x) * K/(K-1);
1130       variance =  exp( 2.*K*distance/(K-1) ) * D * (1.-D) / N;
1131     }
1132   if (opt_distance != NULL)  *opt_distance = distance;
1133   if (opt_variance != NULL)  *opt_variance = variance;
1134   return eslOK;
1135 
1136  ERROR:
1137   if (opt_distance != NULL)  *opt_distance = HUGE_VAL;
1138   if (opt_variance != NULL)  *opt_variance = HUGE_VAL;
1139   return status;
1140 }
1141 /*--------------- end of private functions ----------------------*/
1142 
1143 
1144 /*****************************************************************
1145  * 7. Unit tests.
1146  *****************************************************************/
1147 #ifdef eslDISTANCE_TESTDRIVE
1148 
1149 /* Each unit test is given an alignment with certain known
1150  * properties:
1151  *    seqs 0,1 are identical
1152  *    seqs 0,2 are completely different
1153  *    seqs 3..N are random
1154  * The alignment may contain gaps, so don't assume that the
1155  * # of compared residues == alignment length. The alignment
1156  * contains only canonical residues, because one of our tests
1157  * is that C and X functions give the same results.
1158  */
1159 static int
utest_CPairId(char ** as,int N)1160 utest_CPairId(char **as, int N)
1161 {
1162   double pid;
1163   int    nid;
1164   int    nres;
1165   int    L;
1166   int    i,j;
1167 
1168   /* Self comparison gives identity = 1. */
1169   L = strlen(as[0]);
1170   if (esl_dst_CPairId(as[0], as[0], &pid, &nid, &nres) != eslOK) abort();
1171   if (pid  != 1.0 || nid != L || nres > L) abort();
1172 
1173   /* So does 0,1 comparison  */
1174   if (esl_dst_CPairId(as[0], as[1], &pid, &nid, &nres) != eslOK) abort();
1175   if (pid  != 1.0 || nid != L || nres > L) abort();
1176 
1177   /* 0,2 comparison gives 0.0, 0 */
1178   if (esl_dst_CPairId(as[0], as[2], &pid, &nid, &nres) != eslOK) abort();
1179   if (pid  != 0.0 || nid != 0 || nres > L) abort();
1180 
1181   /* remaining comparisons shouldn't fail */
1182   for (i = 3; i < N; i++)
1183     for (j = i; j < N; j++)
1184       {
1185 	if (esl_dst_CPairId(as[i], as[j], &pid, &nid, &nres) != eslOK) abort();
1186 	if (pid < 0. || pid > 1. || nid < 0 || nid > L || nres > L)    abort();
1187       }
1188 
1189   /* API should accept NULL for return values */
1190   if (esl_dst_CPairId(as[0], as[0], NULL, NULL, NULL) != eslOK) abort();
1191   return eslOK;
1192 }
1193 
1194 static int
utest_CJukesCantor(int K,char ** as,int N)1195 utest_CJukesCantor(int K, char **as, int N)
1196 {
1197   double d, V;
1198   int    i,j;
1199 
1200   /* Self comparison gives distance = 0. */
1201   if (esl_dst_CJukesCantor(K, as[0], as[0], &d, &V) != eslOK) abort();
1202   if (d != 0.0) abort();
1203 
1204   /* So does 0,1 comparison  */
1205   if (esl_dst_CJukesCantor(K, as[0], as[1], &d, &V) != eslOK) abort();
1206   if (d != 0.0) abort();
1207 
1208   /* 0,2 comparison gives infinite distance (HUGE_VAL) */
1209   if (esl_dst_CJukesCantor(K, as[0], as[2], &d, &V) != eslOK) abort();
1210   if (d != HUGE_VAL) abort();
1211 
1212   /* remaining comparisons shouldn't fail */
1213   for (i = 3; i < N; i++)
1214     for (j = i; j < N; j++)
1215       if (esl_dst_CJukesCantor(K, as[i], as[j], &d, &V) != eslOK) abort();
1216 
1217   /* API should accept NULL for return values */
1218   if (esl_dst_CJukesCantor(K, as[0], as[0], NULL, NULL) != eslOK) abort();
1219   return eslOK;
1220 }
1221 
1222 static int
utest_XPairId(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1223 utest_XPairId(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1224 {
1225   double pid, pid2;
1226   int    nid, nid2;
1227   int    nres, nres2;
1228   int    dL, L;
1229   int    i,j;
1230 
1231   /* Self comparison gives identity = 1. */
1232   dL = esl_abc_dsqlen(ax[0]);
1233   L  = strlen(as[0]);
1234   if (dL != L) abort();
1235   if (esl_dst_XPairId(abc, ax[0], ax[0], &pid, &nid, &nres) != eslOK) abort();
1236   if (pid  != 1.0 || nid != L || nres > dL) abort();
1237 
1238   /* So does 0,1 comparison  */
1239   if (esl_dst_XPairId(abc, ax[0], ax[1], &pid, &nid, &nres) != eslOK) abort();
1240   if (pid  != 1.0 || nid != L || nres > L) abort();
1241 
1242   /* 0,2 comparison gives 0.0, 0 */
1243   if (esl_dst_XPairId(abc, ax[0], ax[2], &pid, &nid, &nres) != eslOK) abort();
1244   if (pid  != 0.0 || nid != 0 || nres > L) abort();
1245 
1246   /* remaining comparisons shouldn't fail, and should be identical to text mode */
1247   for (i = 3; i < N; i++)
1248     for (j = i; j < N; j++)
1249       {
1250 	if (esl_dst_XPairId(abc, ax[i], ax[j], &pid, &nid, &nres) != eslOK) abort();
1251 	if (esl_dst_CPairId(as[i], as[j], &pid2, &nid2, &nres2)   != eslOK) abort();
1252 	if (pid < 0. || pid > 1. || nid < 0 || nid > L || nres > L)         abort();
1253 	if (pid != pid2 || nid != nid2 || nres != nres2)                    abort();
1254       }
1255 
1256   /* API should accept NULL for return values */
1257   if (esl_dst_XPairId(abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) abort();
1258   return eslOK;
1259 
1260 }
1261 
1262 static int
utest_XJukesCantor(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1263 utest_XJukesCantor(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1264 {
1265   double d, V;
1266   int    i,j;
1267 
1268   /* Self comparison gives distance = 0. */
1269   if (esl_dst_XJukesCantor(abc, ax[0], ax[0], &d, &V) != eslOK) abort();
1270   if (d != 0.0) abort();
1271 
1272   /* So does 0,1 comparison  */
1273   if (esl_dst_XJukesCantor(abc, ax[0], ax[1], &d, &V) != eslOK) abort();
1274   if (d != 0.0) abort();
1275 
1276   /* 0,2 comparison gives infinite distance (HUGE_VAL) */
1277   if (esl_dst_XJukesCantor(abc, ax[0], ax[2], &d, &V) != eslOK) abort();
1278   if (d != HUGE_VAL) abort();
1279 
1280   /* remaining comparisons shouldn't fail */
1281   for (i = 3; i < N; i++)
1282     for (j = i; j < N; j++)
1283       if (esl_dst_XJukesCantor(abc, ax[i], ax[j], &d, &V) != eslOK) abort();
1284 
1285   /* API should accept NULL for return values */
1286   if (esl_dst_XJukesCantor(abc, ax[0], ax[0], NULL, NULL) != eslOK) abort();
1287   return eslOK;
1288 }
1289 
1290 static int
utest_CPairIdMx(char ** as,int N)1291 utest_CPairIdMx(char **as, int N)
1292 {
1293   ESL_DMATRIX *S;
1294   int          i,j;
1295   double       pid;
1296 
1297   if (esl_dst_CPairIdMx(as, N, &S) != eslOK) abort();
1298 
1299   for (i = 0; i < N; i++)
1300     if (S->mx[i][i] != 1.0) abort();
1301 
1302   pid = 0.;
1303   for (i = 3; i < N; i++)
1304     for (j = i+1; j < N; j++)
1305       pid += S->mx[i][j];
1306   pid /= (double) ((N-3) * (N-4) / 2); /* first 3 don't count */
1307   if (pid < 0.15 || pid > 0.35) abort(); /* should be 0.25 */
1308 
1309   esl_dmatrix_Destroy(S);
1310   return eslOK;
1311 }
1312 
1313 static int
utest_CDiffMx(char ** as,int N)1314 utest_CDiffMx(char **as, int N)
1315 {
1316   ESL_DMATRIX *D;
1317   int          i,j;
1318   double       diff;
1319 
1320   if (esl_dst_CDiffMx(as, N, &D) != eslOK) abort();
1321 
1322   for (i = 0; i < N; i++)
1323     if (D->mx[i][i] != 0.0) abort();
1324 
1325   diff = 0.;
1326   for (i = 3; i < N; i++)
1327     for (j = i+1; j < N; j++)
1328       diff += D->mx[i][j];
1329   diff /= (double) ((N-3) * (N-4) / 2);	/* first 3 don't count */
1330   if (diff < 0.65 || diff > 0.85) abort(); /* should be 0.75 */
1331 
1332   esl_dmatrix_Destroy(D);
1333   return eslOK;
1334 }
1335 
1336 static int
utest_CJukesCantorMx(int K,char ** as,int N)1337 utest_CJukesCantorMx(int K, char **as, int N)
1338 {
1339   ESL_DMATRIX *D, *V;
1340   /* just a crash test */
1341   if (esl_dst_CJukesCantorMx(K, as, N, &D, &V) != eslOK) abort();
1342   esl_dmatrix_Destroy(D);
1343   esl_dmatrix_Destroy(V);
1344   return eslOK;
1345 }
1346 
1347 static int
utest_XPairIdMx(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1348 utest_XPairIdMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1349 {
1350   ESL_DMATRIX *S, *S2;
1351   int i, j;
1352 
1353   if (esl_dst_XPairIdMx(abc, ax, N, &S) != eslOK) abort();
1354   if (esl_dst_CPairIdMx(as, N, &S2)     != eslOK) abort();
1355 
1356   for (i = 0; i < N; i++)
1357     for (j = i; j < N; j++)
1358       if (fabs(S->mx[i][j] - S2->mx[j][i]) > 0.01) abort();
1359 
1360   esl_dmatrix_Destroy(S);
1361   esl_dmatrix_Destroy(S2);
1362   return eslOK;
1363 }
1364 
1365 static int
utest_XDiffMx(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1366 utest_XDiffMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1367 {
1368   ESL_DMATRIX *D, *D2;
1369   int i, j;
1370 
1371   if (esl_dst_XDiffMx(abc, ax, N, &D) != eslOK) abort();
1372   if (esl_dst_CDiffMx(as, N, &D2)     != eslOK) abort();
1373 
1374   for (i = 0; i < N; i++)
1375     for (j = i; j < N; j++)
1376       if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort();
1377 
1378   esl_dmatrix_Destroy(D);
1379   esl_dmatrix_Destroy(D2);
1380   return eslOK;
1381 }
1382 
1383 static int
utest_XJukesCantorMx(ESL_ALPHABET * abc,char ** as,ESL_DSQ ** ax,int N)1384 utest_XJukesCantorMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
1385 {
1386   ESL_DMATRIX *D, *D2, *V, *V2;
1387   int i, j;
1388 
1389   if (esl_dst_XJukesCantorMx(abc, ax, N, &D, &V)      != eslOK) abort();
1390   if (esl_dst_CJukesCantorMx(abc->K, as, N, &D2, &V2) != eslOK) abort();
1391 
1392   for (i = 0; i < N; i++)
1393     for (j = i; j < N; j++)
1394       {
1395 	if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort();
1396 	if (fabs(V->mx[i][j] - V2->mx[j][i]) > 0.01) abort();
1397       }
1398 
1399   esl_dmatrix_Destroy(D);
1400   esl_dmatrix_Destroy(D2);
1401   esl_dmatrix_Destroy(V);
1402   esl_dmatrix_Destroy(V2);
1403   return eslOK;
1404 }
1405 #endif /* eslDISTANCE_TESTDRIVE */
1406 /*------------------ end of unit tests --------------------------*/
1407 
1408 
1409 
1410 /*****************************************************************
1411  * 8. Test driver.
1412  *****************************************************************/
1413 
1414 #ifdef eslDISTANCE_TESTDRIVE
1415 #include "easel.h"
1416 #include "esl_alphabet.h"
1417 #include "esl_arr2.h"
1418 #include "esl_dmatrix.h"
1419 #include "esl_getopts.h"
1420 #include "esl_random.h"
1421 #include "esl_randomseq.h"
1422 
1423 #include "esl_distance.h"
1424 
1425 static ESL_OPTIONS options[] = {
1426   /* name        type       def   env  range toggles reqs incomp help                       docgroup*/
1427   { "-h",     eslARG_NONE,  FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",            0},
1428   { "-N",     eslARG_INT,    "10", NULL,"n>3", NULL, NULL, NULL, "number of iid seqs in alignment",0},
1429   { "-L",     eslARG_INT,    "50", NULL,"n>0", NULL, NULL, NULL, "length of seqs in alignment",    0},
1430   { "--seed", eslARG_INT,    "42", NULL,"n>=0",NULL, NULL, NULL, "random # seed",                  0},
1431   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1432 };
1433 static char usage[] = "Usage: ./testdrive-distance [-options]";
1434 
1435 int
main(int argc,char ** argv)1436 main(int argc, char **argv)
1437 {
1438   ESL_GETOPTS   *go = NULL;
1439   ESL_RANDOMNESS *r = NULL;
1440   char  **as = NULL;		/* aligned character seqs (random, iid) */
1441   int     N,L;			/* # of seqs, and their aligned lengths */
1442   int seed;
1443   int i,j;
1444   int status;
1445   double p[4];			/// ACGT probabilities
1446   ESL_DSQ      **ax = NULL;	// digitized alignment
1447   ESL_ALPHABET *abc = NULL;
1448 
1449   /* Process command line
1450    */
1451   go = esl_getopts_Create(options);
1452   esl_opt_ProcessCmdline(go, argc, argv);
1453   esl_opt_VerifyConfig(go);
1454   if (esl_opt_GetBoolean(go, "-h") == TRUE) {
1455     puts(usage);
1456     puts("\n  where options are:");
1457     esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
1458     return 0;
1459   }
1460   L    = esl_opt_GetInteger(go, "-L");
1461   N    = esl_opt_GetInteger(go, "-N");
1462   seed = esl_opt_GetInteger(go, "--seed");
1463   if (esl_opt_ArgNumber(go) != 0) {
1464     puts("Incorrect number of command line arguments.");
1465     puts(usage);
1466     return 1;
1467   }
1468   esl_getopts_Destroy(go);
1469 
1470   /* Create a random DNA alignment;
1471    * force it to obey the conventions of the unit tests:
1472    *   0,1 are identical
1473    *   0,2 are completely dissimilar
1474    */
1475   r   = esl_randomness_Create(seed);
1476   for (i = 0; i < 4; i++) p[i] = 0.25;
1477   ESL_ALLOC(as, sizeof(char *) * N);
1478   for (i = 0; i < N; i++)
1479     ESL_ALLOC(as[i], sizeof(char) * (L+1));
1480   esl_rsq_IID(r, "ACGT", p, 4, L, as[0]);
1481   strcpy(as[1], as[0]);
1482   esl_rsq_IID(r, "ACGT", p, 4, L, as[2]);
1483   for (j = 0; j < L; j++)
1484     while (as[2][j] == as[0][j])
1485       as[2][j] = "ACGT"[esl_rnd_Roll(r, 4)];
1486   for (i = 3; i < N; i++)
1487     esl_rsq_IID(r, "ACGT", p, 4, L, as[i]);
1488 
1489   abc = esl_alphabet_Create(eslDNA);
1490   ESL_ALLOC(ax, sizeof(ESL_DSQ *) * N);
1491   for (i = 0; i < N; i++)
1492     esl_abc_CreateDsq(abc, as[i], &(ax[i]));
1493 
1494   /* Unit tests
1495    */
1496   if (utest_CPairId(as, N)               != eslOK) return eslFAIL;
1497   if (utest_CJukesCantor(4, as, N)       != eslOK) return eslFAIL;
1498   if (utest_XPairId(abc, as, ax, N)      != eslOK) return eslFAIL;
1499   if (utest_XJukesCantor(abc, as, ax, N) != eslOK) return eslFAIL;
1500   if (utest_CPairIdMx(as, N)             != eslOK) return eslFAIL;
1501   if (utest_CDiffMx(as, N)               != eslOK) return eslFAIL;
1502   if (utest_CJukesCantorMx(4, as, N)     != eslOK) return eslFAIL;
1503   if (utest_XPairIdMx(abc, as, ax, N)       != eslOK) return eslFAIL;
1504   if (utest_XDiffMx(abc, as, ax, N)         != eslOK) return eslFAIL;
1505   if (utest_XJukesCantorMx(abc, as, ax, N)  != eslOK) return eslFAIL;
1506 
1507 
1508   esl_randomness_Destroy(r);
1509   esl_alphabet_Destroy(abc);
1510   esl_arr2_Destroy((void **) as, N);
1511   esl_arr2_Destroy((void **) ax, N);
1512   return eslOK;
1513 
1514  ERROR:
1515   return eslFAIL;
1516 }
1517 #endif /*eslDISTANCE_TESTDRIVE*/
1518 
1519 
1520 
1521 
1522 /*****************************************************************
1523  * 9. Example.
1524  *****************************************************************/
1525 
1526 #ifdef eslDISTANCE_EXAMPLE
1527 /*::cexcerpt::distance_example::begin::*/
1528 /* gcc -g -Wall -o example -I. -DeslDISTANCE_EXAMPLE esl_distance.c\
1529        esl_dmatrix.c esl_msa.c easel.c -lm
1530    ./example <msa file>
1531  */
1532 #include "easel.h"
1533 #include "esl_distance.h"
1534 #include "esl_dmatrix.h"
1535 #include "esl_msa.h"
1536 
main(int argc,char ** argv)1537 int main(int argc, char **argv)
1538 {
1539   ESL_MSAFILE  *afp;
1540   ESL_MSA      *msa;
1541   ESL_DMATRIX  *P;
1542   int           i,j;
1543   double        min, avg, max;
1544   int           status;
1545 
1546   if ((status = esl_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
1547     esl_msafile_OpenFailure(afp, status);
1548   if ((status = esl_msafile_Read(afp, &msa)) != eslOK)
1549     esl_msafile_ReadFailure(afp, status);
1550 
1551   esl_dst_CPairIdMx(msa->aseq, msa->nseq, &P);
1552 
1553   min = 1.0;
1554   max = 0.0;
1555   avg = 0.0;
1556   for (i = 0; i < msa->nseq; i++)
1557     for (j = i+1; j < msa->nseq; j++)
1558       {
1559 	avg += P->mx[i][j];
1560 	if (P->mx[i][j] < min) min = P->mx[i][j];
1561 	if (P->mx[i][j] > max) max = P->mx[i][j];
1562       }
1563   avg /= (double) (msa->nseq * (msa->nseq-1) / 2);
1564 
1565   printf("Average pairwise %% id:  %.1f%%\n", avg * 100.);
1566   printf("Minimum pairwise %% id:  %.1f%%\n", min * 100.);
1567   printf("Maximum pairwise %% id:  %.1f%%\n", max * 100.);
1568 
1569   esl_dmatrix_Destroy(P);
1570   esl_msa_Destroy(msa);
1571   esl_msafile_Close(afp);
1572   return 0;
1573 }
1574 /*::cexcerpt::distance_example::end::*/
1575 #endif /*eslDISTANCE_EXAMPLE*/
1576 
1577 
1578 
1579