1 /* Clustering sequences in an MSA by % identity.
2 *
3 * Table of contents:
4 * 1. Single linkage clustering an MSA by %id
5 * 2. Internal functions, interface to the clustering API
6 * 3. Some internal functions needed for regression tests
7 * 4. Unit tests
8 * 5. Test driver
9 * 6. Example
10 *
11 * (Wondering why isn't this just part of the cluster or MSA modules?
12 * esl_cluster itself is a core module, dependent only on easel. MSA
13 * clustering involves at least the distance, cluster, and msa
14 * modules. We're better off separating its functionality away into a
15 * more highly derived module.)
16 */
17 #include "esl_config.h"
18
19 #include "easel.h"
20 #include "esl_alphabet.h"
21 #include "esl_cluster.h"
22 #include "esl_distance.h"
23 #include "esl_msa.h"
24
25 #include "esl_msacluster.h"
26
27 /* These functions are going to get defined in an internal regression
28 * testing section further below:
29 */
30 #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
31 #include <ctype.h>
32 static double squid_distance(char *s1, char *s2);
33 static double squid_xdistance(ESL_ALPHABET *a, ESL_DSQ *x1, ESL_DSQ *x2);
34 #endif
35
36 /* These functions will define linkage between a pair of text or
37 * digital aseq's:
38 */
39 static int msacluster_clinkage(const void *v1, const void *v2, const void *p, int *ret_link);
40 static int msacluster_xlinkage(const void *v1, const void *v2, const void *p, int *ret_link);
41
42 /* In digital mode, we'll need to pass the clustering routine two parameters -
43 * %id threshold and alphabet ptr - so make a structure that bundles them.
44 */
45 struct msa_param_s {
46 double maxid;
47 ESL_ALPHABET *abc;
48 };
49
50
51 /*****************************************************************
52 * 1. Single linkage clustering an MSA by %id
53 *****************************************************************/
54
55 /* Function: esl_msacluster_SingleLinkage()
56 * Synopsis: Single linkage clustering by percent identity.
57 * Incept: SRE, Sun Nov 5 10:11:45 2006 [Janelia]
58 *
59 * Purpose: Perform single link clustering of the sequences in
60 * multiple alignment <msa>. Any pair of sequences with
61 * percent identity $\geq$ <maxid> are linked (using
62 * the definition from the \eslmod{distance} module).
63 *
64 * The resulting clustering is optionally returned in one
65 * or more of <opt_c>, <opt_nin>, and <opt_nc>. The
66 * <opt_c[0..nseq-1]> array assigns a cluster index
67 * <(0..nc-1)> to each sequence. For example, <c[4] = 1>
68 * means that sequence 4 is assigned to cluster 1. The
69 * <opt_nin[0..nc-1]> array is the number of sequences
70 * in each cluster. <opt_nc> is the number of clusters.
71 *
72 * Importantly, this algorithm runs in $O(N)$ memory, and
73 * produces one discrete clustering. Compare to
74 * <esl_tree_SingleLinkage()>, which requires an $O(N^2)$
75 * adjacency matrix, and produces a hierarchical clustering
76 * tree.
77 *
78 * The algorithm is worst case $O(LN^2)$ time, for $N$
79 * sequences of length $L$. However, the worst case is no
80 * links at all, and this is unusual. More typically, time
81 * scales as about $LN \log N$. The best case scales as
82 * $LN$, when there is just one cluster in a completely
83 * connected graph.
84 *
85 * Args: msa - multiple alignment to cluster
86 * maxid - pairwise identity threshold: cluster if $\geq$ <maxid>
87 * opt_c - optRETURN: cluster assignments for each sequence, [0..nseq-1]
88 * opt_nin - optRETURN: number of seqs in each cluster, [0..nc-1]
89 * opt_nc - optRETURN: number of clusters
90 *
91 * Returns: <eslOK> on success; the <opt_c[0..nseq-1]> array contains
92 * cluster indices <0..nc-1> assigned to each sequence; the
93 * <opt_nin[0..nc-1]> array contains the number of seqs in
94 * each cluster; and <opt_nc> contains the number of
95 * clusters. The <opt_c> array and <opt_nin> arrays will be
96 * allocated here, if non-<NULL>, and must be free'd by the
97 * caller. The input <msa> is unmodified.
98 *
99 * The caller may pass <NULL> for either <opt_c> or
100 * <opt_nc> if it is only interested in one of the two
101 * results.
102 *
103 * Throws: <eslEMEM> on allocation failure, and <eslEINVAL> if a pairwise
104 * comparison is invalid (which means the MSA is corrupted, so it
105 * shouldn't happen). In either case, <opt_c> and <opt_nin> are set to <NULL>
106 * and <opt_nc> is set to 0, and the <msa> is unmodified.
107 */
108 int
esl_msacluster_SingleLinkage(const ESL_MSA * msa,double maxid,int ** opt_c,int ** opt_nin,int * opt_nc)109 esl_msacluster_SingleLinkage(const ESL_MSA *msa, double maxid,
110 int **opt_c, int **opt_nin, int *opt_nc)
111
112 {
113 int status;
114 int *workspace = NULL;
115 int *assignment = NULL;
116 int *nin = NULL;
117 int nc;
118 int i;
119 struct msa_param_s param;
120
121 /* Allocations */
122 ESL_ALLOC(workspace, sizeof(int) * msa->nseq * 2);
123 ESL_ALLOC(assignment, sizeof(int) * msa->nseq);
124
125 /* call to SLC API: */
126 if (! (msa->flags & eslMSA_DIGITAL))
127 status = esl_cluster_SingleLinkage((void *) msa->aseq, (size_t) msa->nseq, sizeof(char *),
128 msacluster_clinkage, (void *) &maxid,
129 workspace, assignment, &nc);
130 else {
131 param.maxid = maxid;
132 param.abc = msa->abc;
133 status = esl_cluster_SingleLinkage((void *) msa->ax, (size_t) msa->nseq, sizeof(ESL_DSQ *),
134 msacluster_xlinkage, (void *) ¶m,
135 workspace, assignment, &nc);
136 }
137 if (status != eslOK) goto ERROR;
138
139
140 if (opt_nin != NULL)
141 {
142 ESL_ALLOC(nin, sizeof(int) * nc);
143 for (i = 0; i < nc; i++) nin[i] = 0;
144 for (i = 0; i < msa->nseq; i++)
145 nin[assignment[i]]++;
146 *opt_nin = nin;
147 }
148
149 /* cleanup and return */
150 free(workspace);
151 if (opt_c != NULL) *opt_c = assignment; else free(assignment);
152 if (opt_nc != NULL) *opt_nc = nc;
153 return eslOK;
154
155 ERROR:
156 if (workspace != NULL) free(workspace);
157 if (assignment != NULL) free(assignment);
158 if (nin != NULL) free(nin);
159 if (opt_c != NULL) *opt_c = NULL;
160 if (opt_nc != NULL) *opt_nc = 0;
161 return status;
162 }
163
164
165
166
167
168 /*****************************************************************
169 * 2. Internal functions, interface to the clustering API
170 *****************************************************************/
171
172 /* Definition of %id linkage in text-mode aligned seqs (>= maxid): */
173 static int
msacluster_clinkage(const void * v1,const void * v2,const void * p,int * ret_link)174 msacluster_clinkage(const void *v1, const void *v2, const void *p, int *ret_link)
175 {
176 char *as1 = *(char **) v1;
177 char *as2 = *(char **) v2;
178 double maxid = *(double *) p;
179 double pid;
180 int status = eslOK;
181
182 #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
183 pid = 1. - squid_distance(as1, as2);
184 #else
185 if ((status = esl_dst_CPairId(as1, as2, &pid, NULL, NULL)) != eslOK) return status;
186 #endif
187
188 *ret_link = (pid >= maxid ? TRUE : FALSE);
189 return status;
190 }
191
192 /* Definition of % id linkage in digital aligned seqs (>= maxid) */
193 static int
msacluster_xlinkage(const void * v1,const void * v2,const void * p,int * ret_link)194 msacluster_xlinkage(const void *v1, const void *v2, const void *p, int *ret_link)
195 {
196 ESL_DSQ *ax1 = *(ESL_DSQ **) v1;
197 ESL_DSQ *ax2 = *(ESL_DSQ **) v2;
198 struct msa_param_s *param = (struct msa_param_s *) p;
199 double pid;
200 int status = eslOK;
201
202 #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
203 pid = 1. - squid_xdistance(param->abc, ax1, ax2);
204 #else
205 if ( (status = esl_dst_XPairId(param->abc, ax1, ax2, &pid, NULL, NULL)) != eslOK) return status;
206 #endif
207
208 *ret_link = (pid >= param->maxid ? TRUE : FALSE);
209 return status;
210 }
211
212
213 /*****************************************************************
214 * 3. Some internal functions needed for regression tests
215 *****************************************************************/
216
217 /* When regression testing against squid, we have to replace
218 * Easel's distance calculations with a simpler, (even) less robust
219 * calculation that squid did.
220 */
221 #if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
222 static double
squid_distance(char * s1,char * s2)223 squid_distance(char *s1, char *s2)
224 {
225 int diff = 0;
226 int valid = 0;
227
228 for (; *s1 != '\0'; s1++, s2++)
229 {
230 if (!isalpha(*s1) || !isalpha(*s2)) continue;
231 if (*s1 != *s2) diff++;
232 valid++;
233 }
234 return (valid > 0 ? ((double) diff / (double) valid) : 0.0);
235 }
236 static double
squid_xdistance(ESL_ALPHABET * a,ESL_DSQ * x1,ESL_DSQ * x2)237 squid_xdistance(ESL_ALPHABET *a, ESL_DSQ *x1, ESL_DSQ *x2)
238 {
239 int diff = 0;
240 int valid = 0;
241
242 for (; *x1 != eslDSQ_SENTINEL; x1++, x2++)
243 {
244 if (esl_abc_XIsGap(a, *x1) || esl_abc_XIsGap(a, *x2)) continue;
245 if (*x1 != *x2) diff++;
246 valid++;
247 }
248 return (valid > 0 ? ((double) diff / (double) valid) : 0.0);
249 }
250 #endif /* eslMSACLUSTER_REGRESSION || eslMSAWEIGHT_REGRESSION */
251
252
253 /*****************************************************************
254 * 4. Unit tests
255 *****************************************************************/
256 #ifdef eslMSACLUSTER_TESTDRIVE
257 #include "esl_getopts.h"
258
259 static void
utest_SingleLinkage(ESL_GETOPTS * go,const ESL_MSA * msa,double maxid,int expected_nc,int last_assignment)260 utest_SingleLinkage(ESL_GETOPTS *go, const ESL_MSA *msa, double maxid, int expected_nc, int last_assignment)
261 {
262 char *msg = "utest_SingleLinkage() failed";
263 int *assignment = NULL;
264 int *nin = NULL;
265 int nc;
266
267 if (esl_msacluster_SingleLinkage(msa, maxid, &assignment, &nin, &nc) != eslOK) esl_fatal(msg);
268 if (nc != expected_nc) esl_fatal(msg);
269 if (assignment[msa->nseq-1] != last_assignment) esl_fatal(msg);
270 free(assignment);
271 free(nin);
272 }
273 #endif /*eslMSACLUSTER_TESTDRIVE*/
274
275 /*****************************************************************
276 * 5. Test driver
277 *****************************************************************/
278 #ifdef eslMSACLUSTER_TESTDRIVE
279 /* gcc -g -Wall -o msacluster_utest -I. -L. -DeslMSACLUSTER_TESTDRIVE esl_msacluster.c -leasel -lm
280 */
281 #include "esl_config.h"
282
283 #include <stdio.h>
284 #include <math.h>
285
286 #include "easel.h"
287 #include "esl_alphabet.h"
288 #include "esl_getopts.h"
289 #include "esl_msa.h"
290 #include "esl_msacluster.h"
291 #include "esl_msafile.h"
292
293 static ESL_OPTIONS options[] = {
294 /* name type default env range toggles reqs incomp help docgroup*/
295 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
296 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
297 };
298 static char usage[] = "[-options]";
299 static char banner[] = "test driver for msacluster module";
300
301 int
main(int argc,char ** argv)302 main(int argc, char **argv)
303 {
304 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
305 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
306 ESL_MSA *msa = esl_msa_CreateFromString("\
307 # STOCKHOLM 1.0\n\
308 \n\
309 seq0 AAAAAAAAAA\n\
310 seq1 AAAAAAAAAA\n\
311 seq2 AAAAAAAAAC\n\
312 seq3 AAAAAAAADD\n\
313 seq4 AAAAAAAEEE\n\
314 seq5 AAAAAAFFFF\n\
315 seq6 AAAAAGGGGG\n\
316 seq7 AAAAHHHHHH\n\
317 seq8 AAAIIIIIII\n\
318 seq9 AAKKKKKKKK\n\
319 seq10 ALLLLLLLLL\n\
320 seq11 MMMMMMMMMM\n\
321 //", eslMSAFILE_STOCKHOLM);
322
323
324 utest_SingleLinkage(go, msa, 1.0, 11, 10); /* at 100% id, only seq0/seq1 cluster */
325 utest_SingleLinkage(go, msa, 0.5, 6, 5); /* at 50% id, seq0-seq6 cluster */
326 utest_SingleLinkage(go, msa, 0.0, 1, 0); /* at 0% id, everything clusters */
327
328 /* Do the same tests, but now with a digital MSA */
329 esl_msa_Digitize(abc, msa, NULL);
330 utest_SingleLinkage(go, msa, 1.0, 11, 10); /* at 100% id, only seq0/seq1 cluster */
331 utest_SingleLinkage(go, msa, 0.5, 6, 5); /* at 50% id, seq0-seq6 cluster */
332 utest_SingleLinkage(go, msa, 0.0, 1, 0); /* at 0% id, everything clusters */
333
334 esl_msa_Destroy(msa);
335 esl_alphabet_Destroy(abc);
336 esl_getopts_Destroy(go);
337 return 0;
338 }
339 #endif /* eslMSACLUSTER_TESTDRIVE*/
340
341
342
343
344 /*****************************************************************
345 * 6. Example
346 *****************************************************************/
347
348 #ifdef eslMSACLUSTER_EXAMPLE
349 /*::cexcerpt::msacluster_example::begin::*/
350 /*
351 gcc -g -Wall -o msacluster_example -I. -L. -DeslMSACLUSTER_EXAMPLE esl_msacluster.c -leasel -lm
352 ./msacluster_example <MSA file>
353 */
354 #include <stdio.h>
355 #include "easel.h"
356 #include "esl_msa.h"
357 #include "esl_msacluster.h"
358 #include "esl_msafile.h"
359
360 int
main(int argc,char ** argv)361 main(int argc, char **argv)
362 {
363 char *filename = argv[1];
364 int fmt = eslMSAFILE_UNKNOWN;
365 ESL_ALPHABET *abc = NULL;
366 ESL_MSAFILE *afp = NULL;
367 ESL_MSA *msa = NULL;
368 double maxid = 0.62; /* cluster at 62% identity: the BLOSUM62 rule */
369 int *assignment = NULL;
370 int *nin = NULL;
371 int nclusters;
372 int c, i;
373 int status;
374
375 /* Open; guess alphabet; set to digital mode */
376 if ((status = esl_msafile_Open(&abc, filename, NULL, fmt, NULL, &afp)) != eslOK)
377 esl_msafile_OpenFailure(afp, status);
378
379 /* read one alignment */
380 if ((status = esl_msafile_Read(afp, &msa)) != eslOK)
381 esl_msafile_ReadFailure(afp, status);
382
383 /* do the clustering */
384 esl_msacluster_SingleLinkage(msa, maxid, &assignment, &nin, &nclusters);
385
386 printf("%d clusters at threshold of %f fractional identity\n", nclusters, maxid);
387 for (c = 0; c < nclusters; c++) {
388 printf("cluster %d:\n", c);
389 for (i = 0; i < msa->nseq; i++) if (assignment[i] == c) printf(" %s\n", msa->sqname[i]);
390 printf("(%d sequences)\n\n", nin[c]);
391 }
392
393 esl_msa_Destroy(msa);
394 esl_msafile_Close(afp);
395 free(assignment);
396 free(nin);
397 return 0;
398 }
399 /*::cexcerpt::msacluster_example::end::*/
400 #endif /*eslMSACLUSTER_EXAMPLE*/
401 /*------------------------ end of example -----------------------*/
402