1 /* Clustering sequences in an MSA by % identity.
2  *
3  * Table of contents:
4  *    1. Single linkage clustering an MSA by %id
5  *    2. Internal functions, interface to the clustering API
6  *    3. Some internal functions needed for regression tests
7  *    4. Unit tests
8  *    5. Test driver
9  *    6. Example
10  *
11  * (Wondering why isn't this just part of the cluster or MSA modules?
12  * esl_cluster itself is a core module, dependent only on easel. MSA
13  * clustering involves at least the distance, cluster, and msa
14  * modules. We're better off separating its functionality away into a
15  * more highly derived module.)
16  */
17 #include "esl_config.h"
18 
19 #include "easel.h"
20 #include "esl_alphabet.h"
21 #include "esl_cluster.h"
22 #include "esl_distance.h"
23 #include "esl_msa.h"
24 
25 #include "esl_msacluster.h"
26 
27 /* These functions are going to get defined in an internal regression
28  * testing section further below:
29  */
30 #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
31 #include <ctype.h>
32 static double squid_distance(char *s1, char *s2);
33 static double squid_xdistance(ESL_ALPHABET *a, ESL_DSQ *x1, ESL_DSQ *x2);
34 #endif
35 
36 /* These functions will define linkage between a pair of text or
37  *  digital aseq's:
38  */
39 static int msacluster_clinkage(const void *v1, const void *v2, const void *p, int *ret_link);
40 static int msacluster_xlinkage(const void *v1, const void *v2, const void *p, int *ret_link);
41 
42 /* In digital mode, we'll need to pass the clustering routine two parameters -
43  * %id threshold and alphabet ptr - so make a structure that bundles them.
44  */
45 struct msa_param_s {
46   double        maxid;
47   ESL_ALPHABET *abc;
48 };
49 
50 
51 /*****************************************************************
52  * 1. Single linkage clustering an MSA by %id
53  *****************************************************************/
54 
55 /* Function:  esl_msacluster_SingleLinkage()
56  * Synopsis:  Single linkage clustering by percent identity.
57  * Incept:    SRE, Sun Nov  5 10:11:45 2006 [Janelia]
58  *
59  * Purpose:   Perform single link clustering of the sequences in
60  *            multiple alignment <msa>. Any pair of sequences with
61  *            percent identity $\geq$ <maxid> are linked (using
62  *            the definition from the \eslmod{distance} module).
63  *
64  *            The resulting clustering is optionally returned in one
65  *            or more of <opt_c>, <opt_nin>, and <opt_nc>.  The
66  *            <opt_c[0..nseq-1]> array assigns a cluster index
67  *            <(0..nc-1)> to each sequence. For example, <c[4] = 1>
68  *            means that sequence 4 is assigned to cluster 1.  The
69  *            <opt_nin[0..nc-1]> array is the number of sequences
70  *            in each cluster. <opt_nc> is the number of clusters.
71  *
72  *            Importantly, this algorithm runs in $O(N)$ memory, and
73  *            produces one discrete clustering. Compare to
74  *            <esl_tree_SingleLinkage()>, which requires an $O(N^2)$
75  *            adjacency matrix, and produces a hierarchical clustering
76  *            tree.
77  *
78  *            The algorithm is worst case $O(LN^2)$ time, for $N$
79  *            sequences of length $L$. However, the worst case is no
80  *            links at all, and this is unusual. More typically, time
81  *            scales as about $LN \log N$. The best case scales as
82  *            $LN$, when there is just one cluster in a completely
83  *            connected graph.
84  *
85  * Args:      msa     - multiple alignment to cluster
86  *            maxid   - pairwise identity threshold: cluster if $\geq$ <maxid>
87  *            opt_c   - optRETURN: cluster assignments for each sequence, [0..nseq-1]
88  *            opt_nin - optRETURN: number of seqs in each cluster, [0..nc-1]
89  *            opt_nc  - optRETURN: number of clusters
90  *
91  * Returns:   <eslOK> on success; the <opt_c[0..nseq-1]> array contains
92  *            cluster indices <0..nc-1> assigned to each sequence; the
93  *            <opt_nin[0..nc-1]> array contains the number of seqs in
94  *            each cluster; and <opt_nc> contains the number of
95  *            clusters. The <opt_c> array and <opt_nin> arrays will be
96  *            allocated here, if non-<NULL>, and must be free'd by the
97  *            caller. The input <msa> is unmodified.
98  *
99  *            The caller may pass <NULL> for either <opt_c> or
100  *            <opt_nc> if it is only interested in one of the two
101  *            results.
102  *
103  * Throws:    <eslEMEM> on allocation failure, and <eslEINVAL> if a pairwise
104  *            comparison is invalid (which means the MSA is corrupted, so it
105  *            shouldn't happen). In either case, <opt_c> and <opt_nin> are set to <NULL>
106  *            and <opt_nc> is set to 0, and the <msa> is unmodified.
107  */
108 int
esl_msacluster_SingleLinkage(const ESL_MSA * msa,double maxid,int ** opt_c,int ** opt_nin,int * opt_nc)109 esl_msacluster_SingleLinkage(const ESL_MSA *msa, double maxid,
110 			     int **opt_c, int **opt_nin, int *opt_nc)
111 
112 {
113   int   status;
114   int  *workspace  = NULL;
115   int  *assignment = NULL;
116   int  *nin        = NULL;
117   int   nc;
118   int   i;
119   struct msa_param_s param;
120 
121   /* Allocations */
122   ESL_ALLOC(workspace,  sizeof(int) * msa->nseq * 2);
123   ESL_ALLOC(assignment, sizeof(int) * msa->nseq);
124 
125   /* call to SLC API: */
126   if (! (msa->flags & eslMSA_DIGITAL))
127     status = esl_cluster_SingleLinkage((void *) msa->aseq, (size_t) msa->nseq, sizeof(char *),
128 				       msacluster_clinkage, (void *) &maxid,
129 				       workspace, assignment, &nc);
130   else {
131     param.maxid = maxid;
132     param.abc   = msa->abc;
133     status = esl_cluster_SingleLinkage((void *) msa->ax, (size_t) msa->nseq, sizeof(ESL_DSQ *),
134 				       msacluster_xlinkage, (void *) &param,
135 				       workspace, assignment, &nc);
136   }
137   if (status != eslOK) goto ERROR;
138 
139 
140   if (opt_nin != NULL)
141     {
142       ESL_ALLOC(nin, sizeof(int) * nc);
143       for (i = 0; i < nc; i++) nin[i] = 0;
144       for (i = 0; i < msa->nseq; i++)
145 	nin[assignment[i]]++;
146       *opt_nin = nin;
147     }
148 
149   /* cleanup and return */
150   free(workspace);
151   if (opt_c  != NULL) *opt_c  = assignment; else free(assignment);
152   if (opt_nc != NULL) *opt_nc = nc;
153   return eslOK;
154 
155  ERROR:
156   if (workspace  != NULL) free(workspace);
157   if (assignment != NULL) free(assignment);
158   if (nin        != NULL) free(nin);
159   if (opt_c  != NULL) *opt_c  = NULL;
160   if (opt_nc != NULL) *opt_nc = 0;
161   return status;
162 }
163 
164 
165 
166 
167 
168 /*****************************************************************
169  * 2. Internal functions, interface to the clustering API
170  *****************************************************************/
171 
172 /* Definition of %id linkage in text-mode aligned seqs (>= maxid): */
173 static int
msacluster_clinkage(const void * v1,const void * v2,const void * p,int * ret_link)174 msacluster_clinkage(const void *v1, const void *v2, const void *p, int *ret_link)
175 {
176   char  *as1   = *(char **) v1;
177   char  *as2   = *(char **) v2;
178   double maxid = *(double *) p;
179   double pid;
180   int    status = eslOK;
181 
182 #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
183   pid = 1. - squid_distance(as1, as2);
184 #else
185   if ((status = esl_dst_CPairId(as1, as2, &pid, NULL, NULL)) != eslOK) return status;
186 #endif
187 
188   *ret_link = (pid >= maxid ? TRUE : FALSE);
189   return status;
190 }
191 
192 /* Definition of % id linkage in digital aligned seqs (>= maxid) */
193 static int
msacluster_xlinkage(const void * v1,const void * v2,const void * p,int * ret_link)194 msacluster_xlinkage(const void *v1, const void *v2, const void *p, int *ret_link)
195 {
196   ESL_DSQ *ax1              = *(ESL_DSQ **) v1;
197   ESL_DSQ *ax2              = *(ESL_DSQ **) v2;
198   struct msa_param_s *param = (struct msa_param_s *) p;
199   double   pid;
200   int      status = eslOK;
201 
202 #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
203   pid = 1. - squid_xdistance(param->abc, ax1, ax2);
204 #else
205   if ( (status = esl_dst_XPairId(param->abc, ax1, ax2, &pid, NULL, NULL)) != eslOK) return status;
206 #endif
207 
208   *ret_link = (pid >= param->maxid ? TRUE : FALSE);
209   return status;
210 }
211 
212 
213 /*****************************************************************
214  * 3. Some internal functions needed for regression tests
215  *****************************************************************/
216 
217 /* When regression testing against squid, we have to replace
218  * Easel's distance calculations with a simpler, (even) less robust
219  * calculation that squid did.
220  */
221 #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
222 static double
squid_distance(char * s1,char * s2)223 squid_distance(char *s1, char *s2)
224 {
225   int diff  = 0;
226   int valid = 0;
227 
228   for (; *s1 != '\0'; s1++, s2++)
229     {
230       if (!isalpha(*s1) || !isalpha(*s2)) continue;
231       if (*s1 != *s2) diff++;
232       valid++;
233     }
234   return (valid > 0 ? ((double) diff / (double) valid) : 0.0);
235 }
236 static double
squid_xdistance(ESL_ALPHABET * a,ESL_DSQ * x1,ESL_DSQ * x2)237 squid_xdistance(ESL_ALPHABET *a, ESL_DSQ *x1, ESL_DSQ *x2)
238 {
239   int diff  = 0;
240   int valid = 0;
241 
242   for (; *x1 != eslDSQ_SENTINEL; x1++, x2++)
243     {
244       if (esl_abc_XIsGap(a, *x1) || esl_abc_XIsGap(a, *x2)) continue;
245       if (*x1 != *x2) diff++;
246       valid++;
247     }
248   return (valid > 0 ? ((double) diff / (double) valid) : 0.0);
249 }
250 #endif /* eslMSACLUSTER_REGRESSION || eslMSAWEIGHT_REGRESSION */
251 
252 
253 /*****************************************************************
254  * 4. Unit tests
255  *****************************************************************/
256 #ifdef eslMSACLUSTER_TESTDRIVE
257 #include "esl_getopts.h"
258 
259 static void
utest_SingleLinkage(ESL_GETOPTS * go,const ESL_MSA * msa,double maxid,int expected_nc,int last_assignment)260 utest_SingleLinkage(ESL_GETOPTS *go, const ESL_MSA *msa, double maxid, int expected_nc, int last_assignment)
261 {
262   char *msg        = "utest_SingleLinkage() failed";
263   int  *assignment = NULL;
264   int  *nin        = NULL;
265   int   nc;
266 
267   if (esl_msacluster_SingleLinkage(msa, maxid, &assignment, &nin, &nc) != eslOK) esl_fatal(msg);
268   if (nc != expected_nc)                                                   esl_fatal(msg);
269   if (assignment[msa->nseq-1] != last_assignment)                          esl_fatal(msg);
270   free(assignment);
271   free(nin);
272 }
273 #endif /*eslMSACLUSTER_TESTDRIVE*/
274 
275 /*****************************************************************
276  * 5. Test driver
277  *****************************************************************/
278 #ifdef eslMSACLUSTER_TESTDRIVE
279 /* gcc -g -Wall -o msacluster_utest -I. -L. -DeslMSACLUSTER_TESTDRIVE esl_msacluster.c -leasel -lm
280  */
281 #include "esl_config.h"
282 
283 #include <stdio.h>
284 #include <math.h>
285 
286 #include "easel.h"
287 #include "esl_alphabet.h"
288 #include "esl_getopts.h"
289 #include "esl_msa.h"
290 #include "esl_msacluster.h"
291 #include "esl_msafile.h"
292 
293 static ESL_OPTIONS options[] = {
294   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
295   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
296   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
297 };
298 static char usage[]  = "[-options]";
299 static char banner[] = "test driver for msacluster module";
300 
301 int
main(int argc,char ** argv)302 main(int argc, char **argv)
303 {
304   ESL_GETOPTS    *go      = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
305   ESL_ALPHABET   *abc     = esl_alphabet_Create(eslAMINO);
306   ESL_MSA        *msa     = esl_msa_CreateFromString("\
307 # STOCKHOLM 1.0\n\
308 \n\
309 seq0  AAAAAAAAAA\n\
310 seq1  AAAAAAAAAA\n\
311 seq2  AAAAAAAAAC\n\
312 seq3  AAAAAAAADD\n\
313 seq4  AAAAAAAEEE\n\
314 seq5  AAAAAAFFFF\n\
315 seq6  AAAAAGGGGG\n\
316 seq7  AAAAHHHHHH\n\
317 seq8  AAAIIIIIII\n\
318 seq9  AAKKKKKKKK\n\
319 seq10 ALLLLLLLLL\n\
320 seq11 MMMMMMMMMM\n\
321 //",   eslMSAFILE_STOCKHOLM);
322 
323 
324   utest_SingleLinkage(go, msa, 1.0, 11, 10);    /* at 100% id, only seq0/seq1 cluster */
325   utest_SingleLinkage(go, msa, 0.5,  6,  5);    /* at 50% id, seq0-seq6 cluster       */
326   utest_SingleLinkage(go, msa, 0.0,  1,  0);    /* at 0% id, everything clusters      */
327 
328   /* Do the same tests, but now with a digital MSA */
329   esl_msa_Digitize(abc, msa, NULL);
330   utest_SingleLinkage(go, msa, 1.0, 11, 10);    /* at 100% id, only seq0/seq1 cluster */
331   utest_SingleLinkage(go, msa, 0.5,  6,  5);    /* at 50% id, seq0-seq6 cluster       */
332   utest_SingleLinkage(go, msa, 0.0,  1,  0);    /* at 0% id, everything clusters      */
333 
334   esl_msa_Destroy(msa);
335   esl_alphabet_Destroy(abc);
336   esl_getopts_Destroy(go);
337   return 0;
338 }
339 #endif /* eslMSACLUSTER_TESTDRIVE*/
340 
341 
342 
343 
344 /*****************************************************************
345  * 6. Example
346  *****************************************************************/
347 
348 #ifdef eslMSACLUSTER_EXAMPLE
349 /*::cexcerpt::msacluster_example::begin::*/
350 /*
351    gcc -g -Wall -o msacluster_example -I. -L. -DeslMSACLUSTER_EXAMPLE esl_msacluster.c -leasel -lm
352    ./msacluster_example <MSA file>
353  */
354 #include <stdio.h>
355 #include "easel.h"
356 #include "esl_msa.h"
357 #include "esl_msacluster.h"
358 #include "esl_msafile.h"
359 
360 int
main(int argc,char ** argv)361 main(int argc, char **argv)
362 {
363   char        *filename   = argv[1];
364   int          fmt        = eslMSAFILE_UNKNOWN;
365   ESL_ALPHABET *abc       = NULL;
366   ESL_MSAFILE  *afp       = NULL;
367   ESL_MSA      *msa       = NULL;
368   double       maxid      = 0.62; /* cluster at 62% identity: the BLOSUM62 rule */
369   int         *assignment = NULL;
370   int         *nin        = NULL;
371   int          nclusters;
372   int          c, i;
373   int          status;
374 
375   /* Open; guess alphabet; set to digital mode */
376   if ((status = esl_msafile_Open(&abc, filename, NULL, fmt, NULL, &afp)) != eslOK)
377     esl_msafile_OpenFailure(afp, status);
378 
379   /* read one alignment */
380   if ((status = esl_msafile_Read(afp, &msa)) != eslOK)
381     esl_msafile_ReadFailure(afp, status);
382 
383   /* do the clustering */
384   esl_msacluster_SingleLinkage(msa, maxid, &assignment, &nin, &nclusters);
385 
386   printf("%d clusters at threshold of %f fractional identity\n", nclusters, maxid);
387   for (c = 0; c < nclusters; c++) {
388     printf("cluster %d:\n", c);
389     for (i = 0; i < msa->nseq; i++) if (assignment[i] == c) printf("  %s\n", msa->sqname[i]);
390     printf("(%d sequences)\n\n", nin[c]);
391   }
392 
393   esl_msa_Destroy(msa);
394   esl_msafile_Close(afp);
395   free(assignment);
396   free(nin);
397   return 0;
398 }
399 /*::cexcerpt::msacluster_example::end::*/
400 #endif /*eslMSACLUSTER_EXAMPLE*/
401 /*------------------------ end of example -----------------------*/
402