1 /* Shuffling, bootstrapping, permuting alignments, by column or row.
2 *
3 * Table of contents:
4 * 1. Shuffling or resampling columns ("horizontal" shuffling)
5 * 2. Shuffling residues within columns ("vertical" shuffling)
6 * 3. Permuting sequence order (i.e. by row)
7 * 4. Shuffling pairwise (QRNA) alignments.
8 * 5. Example
9 */
10 #include "esl_config.h"
11
12 #include <string.h>
13
14 #include "easel.h"
15 #include "esl_alphabet.h"
16 #include "esl_msa.h"
17 #include "esl_random.h"
18
19 #include "esl_msashuffle.h"
20
21 /*****************************************************************
22 * 1. Shuffling or resampling columns ("horizontal" shuffling)
23 *****************************************************************/
24
25 /* Function: esl_msashuffle_Shuffle()
26 * Synopsis: Shuffle an alignment's columns.
27 *
28 * Purpose: Returns a column-shuffled version of <msa> in <shuf>,
29 * using random generator <r>. Shuffling by columns
30 * preserves the \% identity of the original
31 * alignment. <msa> and <shuf> can be identical, to shuffle
32 * in place.
33 *
34 * The caller sets up the rest of the data (everything but
35 * the alignment itself) in <shuf> the way it wants,
36 * including sequence names, MSA name, and other
37 * annotation. The easy thing to do is to make <shuf>
38 * a copy of <msa>: the caller might create <shuf> by
39 * a call to <esl_msa_Clone()>.
40 *
41 * The alignments <msa> and <shuf> can both be in digital
42 * mode, or can both be in text mode; you cannot mix
43 * digital and text modes.
44 *
45 * Returns: <eslOK> on success.
46 *
47 * Throws: <eslEINVAL> if <msa>,<shuf> aren't in the same mode (digital vs. text).
48 */
49 int
esl_msashuffle_Shuffle(ESL_RANDOMNESS * r,ESL_MSA * msa,ESL_MSA * shuf)50 esl_msashuffle_Shuffle(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *shuf)
51 {
52 int i, pos, alen;
53
54 if (! (msa->flags & eslMSA_DIGITAL))
55 {
56 char c;
57 if (shuf->flags & eslMSA_DIGITAL) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in text mode if <msa> is");
58 if (msa != shuf) {
59 for (i = 0; i < msa->nseq; i++)
60 strcpy(shuf->aseq[i], msa->aseq[i]);
61 }
62
63 for (i = 0; i < msa->nseq; i++)
64 shuf->aseq[i][msa->alen] = '\0';
65
66 for (alen = msa->alen; alen > 1; alen--)
67 {
68 pos = esl_rnd_Roll(r, alen);
69 for (i = 0; i < msa->nseq; i++)
70 {
71 c = shuf->aseq[i][pos];
72 shuf->aseq[i][pos] = shuf->aseq[i][alen-1];
73 shuf->aseq[i][alen-1] = c;
74 }
75 }
76 }
77 else
78 {
79 ESL_DSQ x;
80 if (! (shuf->flags & eslMSA_DIGITAL)) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in digital mode if <msa> is");
81
82 if (msa != shuf) {
83 for (i = 0; i < msa->nseq; i++)
84 memcpy(shuf->ax[i], msa->ax[i], (msa->alen + 2) * sizeof(ESL_DSQ));
85 }
86
87 for (i = 0; i < msa->nseq; i++)
88 shuf->ax[i][msa->alen+1] = eslDSQ_SENTINEL;
89
90 for (alen = msa->alen; alen > 1; alen--)
91 {
92 pos = esl_rnd_Roll(r, alen) + 1;
93 for (i = 0; i < msa->nseq; i++)
94 {
95 x = shuf->ax[i][pos];
96 shuf->ax[i][pos] = shuf->ax[i][alen];
97 shuf->ax[i][alen] = x;
98 }
99 }
100 }
101
102 return eslOK;
103 }
104
105 /* Function: esl_msashuffle_Bootstrap()
106 * Synopsis: Bootstrap sample an MSA.
107 * Incept: SRE, Tue Jan 22 11:05:07 2008 [Janelia]
108 *
109 * Purpose: Takes a bootstrap sample of <msa> (sample <alen> columns,
110 * with replacement) and puts it in <bootsample>, using
111 * random generator <r>.
112 *
113 * The caller provides allocated space for <bootsample>.
114 * It must be different space than <msa>; you cannot take
115 * a bootstrap sample "in place". The caller sets up the
116 * rest of the data in <bootsample> (everything but the
117 * alignment itself) the way it wants, including sequence
118 * names, MSA name, and other annotation. The easy thing to
119 * do is to initialize <bootsample> by cloning <msa>.
120 *
121 * The alignments <msa> and <bootsample> can both be in digital
122 * mode, or can both be in text mode; you cannot mix
123 * digital and text modes.
124 *
125 * Returns: <eslOK> on success, and the alignment in <bootsample> is
126 * set to be a bootstrap resample of the alignment in <msa>.
127 *
128 * Throws: <eslEINVAL> if <msa>,<bootsample> aren't in the same mode
129 * (digital vs. text).
130 */
131 int
esl_msashuffle_Bootstrap(ESL_RANDOMNESS * r,ESL_MSA * msa,ESL_MSA * bootsample)132 esl_msashuffle_Bootstrap(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *bootsample)
133 {
134 int i, pos, col;
135
136 /* contract checks */
137 if ( (msa->flags & eslMSA_DIGITAL) && ! (bootsample->flags & eslMSA_DIGITAL))
138 ESL_EXCEPTION(eslEINVAL, "<msa> and <bootsample> must both be in digital or text mode");
139 if (! (msa->flags & eslMSA_DIGITAL) && (bootsample->flags & eslMSA_DIGITAL))
140 ESL_EXCEPTION(eslEINVAL, "<msa> and <bootsample> must both be in digital or text mode");
141
142 if (! (msa->flags & eslMSA_DIGITAL))
143 {
144 for (pos = 0; pos < msa->alen; pos++)
145 {
146 col = esl_rnd_Roll(r, msa->alen);
147 for (i = 0; i < msa->nseq; i++)
148 bootsample->aseq[i][pos] = msa->aseq[i][col];
149 }
150
151 for (i = 0; i < msa->nseq; i++)
152 bootsample->aseq[i][msa->alen] = '\0';
153 }
154 else
155 {
156 for (i = 0; i < msa->nseq; i++)
157 bootsample->ax[i][0] = eslDSQ_SENTINEL;
158
159 for (pos = 1; pos <= msa->alen; pos++)
160 {
161 col = esl_rnd_Roll(r, msa->alen) + 1;
162 for (i = 0; i < msa->nseq; i++)
163 bootsample->ax[i][pos] = msa->ax[i][col];
164 }
165
166 for (i = 0; i < msa->nseq; i++)
167 bootsample->ax[i][msa->alen+1] = eslDSQ_SENTINEL;
168 }
169
170 return eslOK;
171 }
172
173
174 /*****************************************************************
175 * 2. Shuffling residues within columns ("vertical" shuffling)
176 *****************************************************************/
177
178 /* Function: esl_msashuffle_VShuffle()
179 * Synopsis: Shuffle <msa> residues within independent columns
180 * Incept: SRE, Tue 10 Jul 2018 [World Cup, France v. Belgium]
181 *
182 * Purpose: Shuffle the residues in each column of <msa>
183 * independently, using random generator <rng>.
184 * Return the shuffled alignment in <shuf>, space
185 * allocated by the caller. <msa> and <shuf> can
186 * be identical to shuffle <msa> in place.
187 *
188 * Caller is responsible for the metadata in <shuf> (name,
189 * etc.; everything but the alignment itself). Easiest
190 * thing for caller to do is to <esl_msa_Clone()> the <msa>
191 * to create <shuf>, then shuffle it.
192 *
193 * <msa> and <shuf> must be in digital mode.
194 *
195 * Args: rng - random number generator
196 * msa - input multiple alignment to shuffle
197 * shuf - RESULT: shuffled <msa>
198 *
199 * Returns: <eslOK> on success, and <shuf> contains the shuffled
200 * alignment.
201 *
202 * Throws: <eslEMEM> on allocation error. Now <shuf> is untouched.
203 */
204 int
esl_msashuffle_VShuffle(ESL_RANDOMNESS * rng,const ESL_MSA * msa,ESL_MSA * shuf)205 esl_msashuffle_VShuffle(ESL_RANDOMNESS *rng, const ESL_MSA *msa, ESL_MSA *shuf)
206 {
207 ESL_DASSERT1 (( msa->flags & eslMSA_DIGITAL ));
208 ESL_DASSERT1 (( shuf->flags & eslMSA_DIGITAL ));
209 ESL_DSQ *csq = NULL;
210 int idx, apos;
211 int nres; // number of non-gap residues in this column
212 int status;
213
214
215 ESL_ALLOC(csq, sizeof(ESL_DSQ) * (msa->nseq+2)); // +2 because we hack <csq> column to look like a digital sequence, suitable for esl_rsq*
216 csq[0] = eslDSQ_SENTINEL;
217
218 for (apos = 1; apos <= msa->alen; apos++)
219 {
220 /* transpose each column from [idx][apos] to an array (residues only) we can shuffle */
221 for (idx = 0, nres = 0; idx < msa->nseq; idx++)
222 if (! esl_abc_XIsGap(msa->abc, msa->ax[idx][apos]))
223 csq[++nres] = msa->ax[idx][apos]; // (again, the prepend ++nres here is because we make <csq> look like a digital seq, starts at 1
224 csq[nres+1] = eslDSQ_SENTINEL;
225
226 /* shuffle it (remember, it's only the residues (and *,~), not the gaps */
227 esl_rsq_XShuffle(rng, csq, nres, csq);
228
229 /* put it back in <shuf> */
230 for (idx = 0, nres = 0; idx < msa->nseq; idx++)
231 if (! esl_abc_XIsGap(msa->abc, msa->ax[idx][apos]))
232 shuf->ax[idx][apos] = csq[++nres]; // (again the ++, so we start at 1)
233 }
234 free(csq);
235 return eslOK;
236
237 ERROR:
238 return status;
239 }
240
241
242
243 /*****************************************************************
244 * 3. Permuting the sequence order
245 *****************************************************************/
246
247 /* Function: esl_msashuffle_PermuteSequenceOrder()
248 * Synopsis: Permutes the order of the sequences.
249 *
250 * Purpose: Randomly permute the order of the sequences in <msa>,
251 * and any associated sequence annotation, in place.
252 *
253 * Returns: <eslOK> on success.
254 *
255 * Throws: (no abnormal error conditions)
256 */
257 int
esl_msashuffle_PermuteSequenceOrder(ESL_RANDOMNESS * r,ESL_MSA * msa)258 esl_msashuffle_PermuteSequenceOrder(ESL_RANDOMNESS *r, ESL_MSA *msa)
259 {
260 void *tmp;
261 double tmpwgt;
262 int64_t tmplen;
263 int N, i, tag;
264
265 for (N = msa->nseq; N > 1; N--)
266 {
267 i = esl_rnd_Roll(r, N); /* idx = 0..N-1 */
268
269 if ( ! (msa->flags & eslMSA_DIGITAL)) { tmp = msa->aseq[i]; msa->aseq[i] = msa->aseq[N-1]; msa->aseq[N-1] = tmp; }
270 else { tmp = msa->ax[i]; msa->ax[i] = msa->ax[N-1]; msa->ax[N-1] = tmp; }
271 tmp = msa->sqname[i]; msa->sqname[i] = msa->sqname[N-1]; msa->sqname[N-1] = tmp;
272 tmpwgt = msa->wgt[i]; msa->wgt[i] = msa->wgt[N-1]; msa->wgt[N-1] = tmpwgt;
273
274 if (msa->sqacc) { tmp = msa->sqacc[i]; msa->sqacc[i] = msa->sqacc[N-1]; msa->sqacc[N-1] = tmp; }
275 if (msa->sqdesc) { tmp = msa->sqdesc[i]; msa->sqdesc[i] = msa->sqdesc[N-1]; msa->sqdesc[N-1] = tmp; }
276 if (msa->ss) { tmp = msa->ss[i]; msa->ss[i] = msa->ss[N-1]; msa->ss[N-1] = tmp; }
277 if (msa->sa) { tmp = msa->sa[i]; msa->sa[i] = msa->sa[N-1]; msa->sa[N-1] = tmp; }
278 if (msa->pp) { tmp = msa->pp[i]; msa->pp[i] = msa->pp[N-1]; msa->pp[N-1] = tmp; }
279 if (msa->sqlen) { tmplen = msa->sqlen[i]; msa->sqlen[i] = msa->sqlen[N-1]; msa->sqlen[N-1] = tmplen; }
280 if (msa->sslen) { tmplen = msa->sslen[i]; msa->sslen[i] = msa->sslen[N-1]; msa->sslen[N-1] = tmplen; }
281 if (msa->salen) { tmplen = msa->salen[i]; msa->salen[i] = msa->salen[N-1]; msa->salen[N-1] = tmplen; }
282 if (msa->pplen) { tmplen = msa->pplen[i]; msa->pplen[i] = msa->pplen[N-1]; msa->pplen[N-1] = tmplen; }
283
284 for (tag = 0; tag < msa->ngs; tag++) if (msa->gs[tag]) { tmp = msa->gs[tag][i]; msa->gs[tag][i] = msa->gs[tag][N-1]; msa->gs[tag][N-1] = tmp; }
285 for (tag = 0; tag < msa->ngr; tag++) if (msa->gr[tag]) { tmp = msa->gr[tag][i]; msa->gr[tag][i] = msa->gr[tag][N-1]; msa->gr[tag][N-1] = tmp; }
286 }
287
288 /* if <msa> has a keyhash that maps seqname => seqidx, we'll need to rebuild it. */
289 if (msa->index)
290 {
291 esl_keyhash_Reuse(msa->index);
292 for (i = 0; i < msa->nseq; i++)
293 esl_keyhash_Store(msa->index, msa->sqname[i], -1, NULL);
294 }
295
296 return eslOK;
297 }
298
299
300 /*****************************************************************
301 * 4. Shuffling pairwise (QRNA) alignments
302 *****************************************************************/
303
304 /* Function: esl_msashuffle_XQRNA()
305 * Synopsis: Gap-preserving column shuffle of a digital pairwise alignment.
306 * Incept: SRE, Tue Jan 22 09:09:52 2008 [Market Street Cafe, Leesburg]
307 *
308 * Purpose: Shuffle a digital pairwise alignment <x>,<y> while
309 * preserving the position of gaps, where both sequences are
310 * in digital alphabet <abc>, using the random number
311 * generator <r>. Return the shuffled alignment in <xs>,
312 * <ys>. Caller provides allocated space for <xs> and <ys>
313 * for at least the same length of <x>,<y>.
314 *
315 * Works by doing three separate
316 * shuffles, of (1) columns with residues in both
317 * <x> and <y>, (2) columns with residue in <x> and gap in <y>,
318 * and (3) columns with gap in <x> and residue in <y>.
319 *
320 * <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
321 * an alignment "in place", destroying the original
322 * alignment, just call <esl_msashuffle_XQRNA(r, abc, x,y,x,y)>.
323 *
324 * Returns: <eslOK> on success, and the shuffled alignment is
325 * returned in <xs>, <ys>.
326 *
327 * Throws: <eslEMEM> on allocation failure.
328 */
329 int
esl_msashuffle_XQRNA(ESL_RANDOMNESS * r,ESL_ALPHABET * abc,ESL_DSQ * x,ESL_DSQ * y,ESL_DSQ * xs,ESL_DSQ * ys)330 esl_msashuffle_XQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, ESL_DSQ *x, ESL_DSQ *y, ESL_DSQ *xs, ESL_DSQ *ys)
331 {
332 int L;
333 int *xycol = NULL;
334 int *xcol = NULL;
335 int *ycol = NULL;
336 int nxy, nx, ny;
337 int i;
338 int pos, c;
339 char xsym, ysym;
340 int status;
341
342 L = esl_abc_dsqlen(x);
343 if (esl_abc_dsqlen(y) != L) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");
344
345 if (xs != x) esl_abc_dsqcpy(x, L, xs);
346 if (ys != y) esl_abc_dsqcpy(y, L, ys);
347
348 /* First, construct three arrays containing lists of the column positions
349 * of the three types of columns. (If a column contains gaps in both x and y,
350 * we've already simply copied it to the shuffled sequence.)
351 */
352 ESL_ALLOC(xycol, sizeof(int) * L);
353 ESL_ALLOC(xcol, sizeof(int) * L);
354 ESL_ALLOC(ycol, sizeof(int) * L);
355 nxy = nx = ny = 0;
356
357 for (i = 1; i <= L; i++)
358 {
359 if ( esl_abc_XIsGap(abc, x[i]) && esl_abc_XIsGap(abc, y[i])) { continue; }
360 else if (! esl_abc_XIsGap(abc, x[i]) && ! esl_abc_XIsGap(abc, y[i])) { xycol[nxy] = i; nxy++; }
361 else if ( esl_abc_XIsGap(abc, x[i])) { ycol[ny] = i; ny++; }
362 else if ( esl_abc_XIsGap(abc, y[i])) { xcol[nx] = i; nx++; }
363 }
364
365 /* Second, shuffle the sequences indirectly, via shuffling these arrays.
366 * Yow, careful with those indices, and with order of the statements...
367 */
368 for (; nxy > 1; nxy--) {
369 pos = esl_rnd_Roll(r, nxy);
370 xsym = xs[xycol[pos]]; ysym = ys[xycol[pos]]; c = xycol[pos];
371 xs[xycol[pos]] = xs[xycol[nxy-1]]; ys[xycol[pos]] = ys[xycol[nxy-1]]; xycol[pos] = xycol[nxy-1];
372 xs[xycol[nxy-1]] = xsym; ys[xycol[nxy-1]] = ysym; xycol[pos] = c;
373 }
374 for (; nx > 1; nx--) {
375 pos = esl_rnd_Roll(r, nx);
376 xsym = xs[xcol[pos]]; ysym = ys[xcol[pos]]; c = xcol[pos];
377 xs[xcol[pos]] = xs[xcol[nx-1]]; ys[xcol[pos]] = ys[xcol[nx-1]]; xcol[pos] = xcol[nx-1];
378 xs[xcol[nx-1]] = xsym; ys[xcol[nx-1]] = ysym; xcol[nx-1] = c;
379 }
380 for (; ny > 1; ny--) {
381 pos = esl_rnd_Roll(r, ny);
382 xsym = xs[ycol[pos]]; ysym = ys[ycol[pos]]; c = ycol[pos];
383 xs[ycol[pos]] = xs[ycol[ny-1]]; ys[ycol[pos]] = ys[ycol[ny-1]]; ycol[pos] = ycol[ny-1];
384 xs[ycol[ny-1]] = xsym; ys[ycol[ny-1]] = ysym; ycol[ny-1] = c;
385 }
386
387 free(xycol); free(xcol); free(ycol);
388 return eslOK;
389
390 ERROR:
391 if (xycol != NULL) free(xycol);
392 if (xcol != NULL) free(xcol);
393 if (ycol != NULL) free(ycol);
394 return status;
395 }
396
397 /* Function: esl_msashuffle_CQRNA()
398 * Synopsis: Gap-preserving column shuffle of a pairwise alignment.
399 * Incept: SRE, Tue Jan 22 08:45:34 2008 [Market Street Cafe, Leesburg]
400 *
401 * Purpose: Shuffle a pairwise alignment <x>,<y> while preserving the
402 * position of gaps, using the random number generator <r>.
403 * Return the shuffled alignment in <xs>,
404 * <ys>. Caller provides allocated space for <xs> and <ys>.
405 *
406 * An alphabet <abc> must also be provided, solely for the
407 * definition of gap characters. Because Easel's default
408 * alphabets (DNA, RNA, and protein) all use the same
409 * definition of gap characters <-_.>, you can actually
410 * provide any alphabet here, and get the same results.
411 * (This may save having to determine the alphabet of input
412 * sequences.)
413 *
414 * Works by doing three separate
415 * shuffles, of (1) columns with residues in both
416 * <x> and <y>, (2) columns with residue in <x> and gap in <y>,
417 * and (3) columns with gap in <x> and residue in <y>.
418 *
419 * <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
420 * an alignment "in place", destroying the original
421 * alignment, just call <esl_msashuffle_CQRNA(r, abc, x,y,x,y)>.
422 *
423 * Returns: <eslOK> on success, and the shuffled alignment is
424 * returned in <xs>, <ys>.
425 *
426 * Throws: <eslEMEM> on allocation failure.
427 */
428 int
esl_msashuffle_CQRNA(ESL_RANDOMNESS * r,ESL_ALPHABET * abc,char * x,char * y,char * xs,char * ys)429 esl_msashuffle_CQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, char *x, char *y, char *xs, char *ys)
430 {
431 int L;
432 int *xycol = NULL;
433 int *xcol = NULL;
434 int *ycol = NULL;
435 int nxy, nx, ny;
436 int i;
437 int pos, c;
438 char xsym, ysym;
439 int status;
440
441 if (xs != x) strcpy(xs, x);
442 if (ys != y) strcpy(ys, y);
443
444 /* First, construct three arrays containing lists of the column positions
445 * of the three types of columns. (If a column contains gaps in both x and y,
446 * we've already simply copied it to the shuffled sequence.)
447 */
448 L = strlen(x);
449 if (strlen(y) != L) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");
450 ESL_ALLOC(xycol, sizeof(int) * L);
451 ESL_ALLOC(xcol, sizeof(int) * L);
452 ESL_ALLOC(ycol, sizeof(int) * L);
453 nxy = nx = ny = 0;
454
455 for (i = 0; i < L; i++)
456 {
457 if ( esl_abc_CIsGap(abc, x[i]) && esl_abc_CIsGap(abc, y[i])) { continue; }
458 else if (! esl_abc_CIsGap(abc, x[i]) && ! esl_abc_CIsGap(abc, y[i])) { xycol[nxy] = i; nxy++; }
459 else if ( esl_abc_CIsGap(abc, x[i])) { ycol[ny] = i; ny++; }
460 else if ( esl_abc_CIsGap(abc, y[i])) { xcol[nx] = i; nx++; }
461 }
462
463 /* Second, shuffle the sequences indirectly, via shuffling these arrays.
464 * Yow, careful with those indices, and with order of the statements...
465 */
466 for (; nxy > 1; nxy--) {
467 pos = esl_rnd_Roll(r, nxy);
468 xsym = xs[xycol[pos]]; ysym = ys[xycol[pos]]; c = xycol[pos];
469 xs[xycol[pos]] = xs[xycol[nxy-1]]; ys[xycol[pos]] = ys[xycol[nxy-1]]; xycol[pos] = xycol[nxy-1];
470 xs[xycol[nxy-1]] = xsym; ys[xycol[nxy-1]] = ysym; xycol[pos] = c;
471 }
472 for (; nx > 1; nx--) {
473 pos = esl_rnd_Roll(r, nx);
474 xsym = xs[xcol[pos]]; ysym = ys[xcol[pos]]; c = xcol[pos];
475 xs[xcol[pos]] = xs[xcol[nx-1]]; ys[xcol[pos]] = ys[xcol[nx-1]]; xcol[pos] = xcol[nx-1];
476 xs[xcol[nx-1]] = xsym; ys[xcol[nx-1]] = ysym; xcol[nx-1] = c;
477 }
478 for (; ny > 1; ny--) {
479 pos = esl_rnd_Roll(r, ny);
480 xsym = xs[ycol[pos]]; ysym = ys[ycol[pos]]; c = ycol[pos];
481 xs[ycol[pos]] = xs[ycol[ny-1]]; ys[ycol[pos]] = ys[ycol[ny-1]]; ycol[pos] = ycol[ny-1];
482 xs[ycol[ny-1]] = xsym; ys[ycol[ny-1]] = ysym; ycol[ny-1] = c;
483 }
484
485 free(xycol); free(xcol); free(ycol);
486 return eslOK;
487
488 ERROR:
489 if (xycol != NULL) free(xycol);
490 if (xcol != NULL) free(xcol);
491 if (ycol != NULL) free(ycol);
492 return status;
493 }
494
495
496
497
498 /*****************************************************************
499 * 5. Example.
500 *****************************************************************/
501 #ifdef eslMSASHUFFLE_EXAMPLE
502 #include <stdio.h>
503 #include "easel.h"
504 #include "esl_alphabet.h"
505 #include "esl_getopts.h"
506 #include "esl_msa.h"
507 #include "esl_msafile.h"
508 #include "esl_random.h"
509
510 static ESL_OPTIONS options[] = {
511 /* name type default env range toggles reqs incomp help docgroup*/
512 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
513 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use DNA alphabet", 0 },
514 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use RNA alphabet", 0 },
515 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use protein alphabet", 0 },
516 { "--text", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
517 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
518 };
519 static char usage[] = "[-options] <msafile>";
520 static char banner[] = "example of multiple alignment shuffling/permuting";
521
522 int
main(int argc,char ** argv)523 main(int argc, char **argv)
524 {
525 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
526 ESL_RANDOMNESS *rng = esl_randomness_Create(0);
527 char *msafile = esl_opt_GetArg(go, 1);
528 int fmt = eslMSAFILE_UNKNOWN;
529 ESL_ALPHABET *abc = NULL;
530 ESL_MSAFILE *afp = NULL;
531 ESL_MSA *msa = NULL;
532 int textmode = esl_opt_GetBoolean(go, "--text");
533 int nali = 0;
534 int status;
535
536 /* If you know the alphabet you want, create it - you'll pass it to esl_msafile_Open() */
537 if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
538 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
539 else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
540
541 /* Open in text or digital mode.
542 * To let the Open() function autoguess the format, you pass <infmt=eslMSAFILE_UNKNOWN>.
543 * To let it autoguess the alphabet, you set <abc=NULL> and pass <&abc>.
544 * To open in text mode instead of digital, you pass <NULL> for the alphabet argument.
545 * esl_msafile_OpenFailure() is a convenience, printing various diagnostics of any
546 * open failure to <stderr>. You can of course handle your own diagnostics instead.
547 */
548 if (textmode) status = esl_msafile_Open(NULL, msafile, NULL, fmt, NULL, &afp);
549 else status = esl_msafile_Open(&abc, msafile, NULL, fmt, NULL, &afp);
550 if (status != eslOK) esl_msafile_OpenFailure(afp, status);
551
552 fmt = afp->format;
553
554 while ((status = esl_msafile_Read(afp, &msa)) == eslOK)
555 {
556 /* if digital MSA: msa->ax[idx=0..nseq-1][acol=1..alen] is the alignment data;
557 * if text MSA: msa->aseq[idx=0..nseq-1][acol=0..alen-1] */
558 nali++;
559
560 /* permute it */
561 esl_msashuffle_PermuteSequenceOrder(rng, msa);
562
563 esl_msafile_Write(stdout, msa, fmt);
564 esl_msa_Destroy(msa);
565 }
566 if (nali == 0 || status != eslEOF) esl_msafile_ReadFailure(afp, status); /* a convenience, like esl_msafile_OpenFailure() */
567
568 esl_alphabet_Destroy(abc);
569 esl_msafile_Close(afp);
570 esl_randomness_Destroy(rng);
571 esl_getopts_Destroy(go);
572 exit(0);
573 }
574 #endif
575
576