1 /* Shuffling, bootstrapping, permuting alignments, by column or row.
2  *
3  * Table of contents:
4  *    1. Shuffling or resampling columns ("horizontal" shuffling)
5  *    2. Shuffling residues within columns ("vertical" shuffling)
6  *    3. Permuting sequence order (i.e. by row)
7  *    4. Shuffling pairwise (QRNA) alignments.
8  *    5. Example
9  */
10 #include "esl_config.h"
11 
12 #include <string.h>
13 
14 #include "easel.h"
15 #include "esl_alphabet.h"
16 #include "esl_msa.h"
17 #include "esl_random.h"
18 
19 #include "esl_msashuffle.h"
20 
21 /*****************************************************************
22  * 1. Shuffling or resampling columns ("horizontal" shuffling)
23  *****************************************************************/
24 
25 /* Function:  esl_msashuffle_Shuffle()
26  * Synopsis:  Shuffle an alignment's columns.
27  *
28  * Purpose:   Returns a column-shuffled version of <msa> in <shuf>,
29  *            using random generator <r>. Shuffling by columns
30  *            preserves the \% identity of the original
31  *            alignment. <msa> and <shuf> can be identical, to shuffle
32  *            in place.
33  *
34  *            The caller sets up the rest of the data (everything but
35  *            the alignment itself) in <shuf> the way it wants,
36  *            including sequence names, MSA name, and other
37  *            annotation. The easy thing to do is to make <shuf>
38  *            a copy of <msa>: the caller might create <shuf> by
39  *            a call to <esl_msa_Clone()>.
40  *
41  *            The alignments <msa> and <shuf> can both be in digital
42  *            mode, or can both be in text mode; you cannot mix
43  *            digital and text modes.
44  *
45  * Returns:   <eslOK> on success.
46  *
47  * Throws:    <eslEINVAL> if <msa>,<shuf> aren't in the same mode (digital vs. text).
48  */
49 int
esl_msashuffle_Shuffle(ESL_RANDOMNESS * r,ESL_MSA * msa,ESL_MSA * shuf)50 esl_msashuffle_Shuffle(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *shuf)
51 {
52   int i, pos, alen;
53 
54   if (! (msa->flags & eslMSA_DIGITAL))
55     {
56       char c;
57       if (shuf->flags & eslMSA_DIGITAL) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in text mode if <msa> is");
58       if (msa != shuf) {
59 	for (i = 0; i < msa->nseq; i++)
60 	  strcpy(shuf->aseq[i], msa->aseq[i]);
61       }
62 
63       for (i = 0; i < msa->nseq; i++)
64 	shuf->aseq[i][msa->alen] = '\0';
65 
66       for (alen = msa->alen; alen > 1; alen--)
67 	{
68 	  pos = esl_rnd_Roll(r, alen);
69 	  for (i = 0; i < msa->nseq; i++)
70 	    {
71 	      c                     = shuf->aseq[i][pos];
72 	      shuf->aseq[i][pos]    = shuf->aseq[i][alen-1];
73 	      shuf->aseq[i][alen-1] = c;
74 	    }
75 	}
76     }
77   else
78     {
79       ESL_DSQ x;
80       if (! (shuf->flags & eslMSA_DIGITAL)) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in digital mode if <msa> is");
81 
82       if (msa != shuf) {
83 	for (i = 0; i < msa->nseq; i++)
84 	  memcpy(shuf->ax[i], msa->ax[i], (msa->alen + 2) * sizeof(ESL_DSQ));
85       }
86 
87       for (i = 0; i < msa->nseq; i++)
88 	shuf->ax[i][msa->alen+1] = eslDSQ_SENTINEL;
89 
90       for (alen = msa->alen; alen > 1; alen--)
91 	{
92 	  pos = esl_rnd_Roll(r, alen) + 1;
93 	  for (i = 0; i < msa->nseq; i++)
94 	    {
95 	      x                 = shuf->ax[i][pos];
96 	      shuf->ax[i][pos]  = shuf->ax[i][alen];
97 	      shuf->ax[i][alen] = x;
98 	    }
99 	}
100     }
101 
102   return eslOK;
103 }
104 
105 /* Function:  esl_msashuffle_Bootstrap()
106  * Synopsis:  Bootstrap sample an MSA.
107  * Incept:    SRE, Tue Jan 22 11:05:07 2008 [Janelia]
108  *
109  * Purpose:   Takes a bootstrap sample of <msa> (sample <alen> columns,
110  *            with replacement) and puts it in <bootsample>, using
111  *            random generator <r>.
112  *
113  *            The caller provides allocated space for <bootsample>.
114  *            It must be different space than <msa>; you cannot take
115  *            a bootstrap sample "in place". The caller sets up the
116  *            rest of the data in <bootsample> (everything but the
117  *            alignment itself) the way it wants, including sequence
118  *            names, MSA name, and other annotation. The easy thing to
119  *            do is to initialize <bootsample> by cloning <msa>.
120  *
121  *            The alignments <msa> and <bootsample> can both be in digital
122  *            mode, or can both be in text mode; you cannot mix
123  *            digital and text modes.
124  *
125  * Returns:   <eslOK> on success, and the alignment in <bootsample> is
126  *            set to be a bootstrap resample of the alignment in <msa>.
127  *
128  * Throws:    <eslEINVAL> if <msa>,<bootsample> aren't in the same mode
129  *            (digital vs. text).
130  */
131 int
esl_msashuffle_Bootstrap(ESL_RANDOMNESS * r,ESL_MSA * msa,ESL_MSA * bootsample)132 esl_msashuffle_Bootstrap(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *bootsample)
133 {
134   int i, pos, col;
135 
136   /* contract checks */
137   if (  (msa->flags & eslMSA_DIGITAL) && ! (bootsample->flags & eslMSA_DIGITAL))
138     ESL_EXCEPTION(eslEINVAL, "<msa> and <bootsample> must both be in digital or text mode");
139   if (! (msa->flags & eslMSA_DIGITAL) &&   (bootsample->flags & eslMSA_DIGITAL))
140     ESL_EXCEPTION(eslEINVAL, "<msa> and <bootsample> must both be in digital or text mode");
141 
142   if (! (msa->flags & eslMSA_DIGITAL))
143     {
144       for (pos = 0; pos < msa->alen; pos++)
145 	{
146 	  col = esl_rnd_Roll(r, msa->alen);
147 	  for (i = 0; i < msa->nseq; i++)
148 	    bootsample->aseq[i][pos] = msa->aseq[i][col];
149 	}
150 
151       for (i = 0; i < msa->nseq; i++)
152 	bootsample->aseq[i][msa->alen] = '\0';
153     }
154   else
155     {
156       for (i = 0; i < msa->nseq; i++)
157 	bootsample->ax[i][0] = eslDSQ_SENTINEL;
158 
159       for (pos = 1; pos <= msa->alen; pos++)
160 	{
161 	  col = esl_rnd_Roll(r, msa->alen) + 1;
162 	  for (i = 0; i < msa->nseq; i++)
163 	    bootsample->ax[i][pos] = msa->ax[i][col];
164 	}
165 
166       for (i = 0; i < msa->nseq; i++)
167 	bootsample->ax[i][msa->alen+1] = eslDSQ_SENTINEL;
168     }
169 
170   return eslOK;
171 }
172 
173 
174 /*****************************************************************
175  * 2. Shuffling residues within columns ("vertical" shuffling)
176  *****************************************************************/
177 
178 /* Function:  esl_msashuffle_VShuffle()
179  * Synopsis:  Shuffle <msa> residues within independent columns
180  * Incept:    SRE, Tue 10 Jul 2018 [World Cup, France v. Belgium]
181  *
182  * Purpose:   Shuffle the residues in each column of <msa>
183  *            independently, using random generator <rng>.
184  *            Return the shuffled alignment in <shuf>, space
185  *            allocated by the caller. <msa> and <shuf> can
186  *            be identical to shuffle <msa> in place.
187  *
188  *            Caller is responsible for the metadata in <shuf> (name,
189  *            etc.; everything but the alignment itself).  Easiest
190  *            thing for caller to do is to <esl_msa_Clone()> the <msa>
191  *            to create <shuf>, then shuffle it.
192  *
193  *            <msa> and <shuf> must be in digital mode.
194  *
195  * Args:      rng  - random number generator
196  *            msa  - input multiple alignment to shuffle
197  *            shuf - RESULT: shuffled <msa>
198  *
199  * Returns:   <eslOK> on success, and <shuf> contains the shuffled
200  *            alignment.
201  *
202  * Throws:    <eslEMEM> on allocation error. Now <shuf> is untouched.
203  */
204 int
esl_msashuffle_VShuffle(ESL_RANDOMNESS * rng,const ESL_MSA * msa,ESL_MSA * shuf)205 esl_msashuffle_VShuffle(ESL_RANDOMNESS *rng, const ESL_MSA *msa, ESL_MSA *shuf)
206 {
207   ESL_DASSERT1 ((  msa->flags  & eslMSA_DIGITAL ));
208   ESL_DASSERT1 ((  shuf->flags & eslMSA_DIGITAL ));
209   ESL_DSQ *csq = NULL;
210   int      idx, apos;
211   int      nres;      // number of non-gap residues in this column
212   int      status;
213 
214 
215   ESL_ALLOC(csq, sizeof(ESL_DSQ) * (msa->nseq+2));  // +2 because we hack <csq> column to look like a digital sequence, suitable for esl_rsq*
216   csq[0] = eslDSQ_SENTINEL;
217 
218   for (apos = 1; apos <= msa->alen; apos++)
219     {
220       /* transpose each column from [idx][apos] to an array (residues only) we can shuffle */
221       for (idx = 0, nres = 0; idx < msa->nseq; idx++)
222 	if (! esl_abc_XIsGap(msa->abc, msa->ax[idx][apos]))
223 	  csq[++nres] = msa->ax[idx][apos];  // (again, the prepend ++nres here is because we make <csq> look like a digital seq, starts at 1
224       csq[nres+1] = eslDSQ_SENTINEL;
225 
226       /* shuffle it (remember, it's only the residues (and *,~), not the gaps */
227       esl_rsq_XShuffle(rng, csq, nres, csq);
228 
229       /* put it back in <shuf> */
230       for (idx = 0, nres = 0; idx < msa->nseq; idx++)
231 	if (! esl_abc_XIsGap(msa->abc, msa->ax[idx][apos]))
232 	  shuf->ax[idx][apos] = csq[++nres];   // (again the ++, so we start at 1)
233     }
234   free(csq);
235   return eslOK;
236 
237  ERROR:
238   return status;
239 }
240 
241 
242 
243 /*****************************************************************
244  * 3. Permuting the sequence order
245  *****************************************************************/
246 
247 /* Function:  esl_msashuffle_PermuteSequenceOrder()
248  * Synopsis:  Permutes the order of the sequences.
249  *
250  * Purpose:   Randomly permute the order of the sequences in <msa>,
251  *            and any associated sequence annotation, in place.
252  *
253  * Returns:   <eslOK> on success.
254  *
255  * Throws:    (no abnormal error conditions)
256  */
257 int
esl_msashuffle_PermuteSequenceOrder(ESL_RANDOMNESS * r,ESL_MSA * msa)258 esl_msashuffle_PermuteSequenceOrder(ESL_RANDOMNESS *r, ESL_MSA *msa)
259 {
260   void   *tmp;
261   double  tmpwgt;
262   int64_t tmplen;
263   int     N, i, tag;
264 
265   for (N = msa->nseq; N > 1; N--)
266     {
267       i = esl_rnd_Roll(r, N);	/* idx = 0..N-1 */
268 
269       if ( ! (msa->flags & eslMSA_DIGITAL)) { tmp = msa->aseq[i]; msa->aseq[i] = msa->aseq[N-1]; msa->aseq[N-1] = tmp; }
270       else 	                            { tmp = msa->ax[i];   msa->ax[i]   = msa->ax[N-1];   msa->ax[N-1]   = tmp; }
271       tmp    = msa->sqname[i]; msa->sqname[i] = msa->sqname[N-1]; msa->sqname[N-1] = tmp;
272       tmpwgt = msa->wgt[i];    msa->wgt[i]    = msa->wgt[N-1];    msa->wgt[N-1]    = tmpwgt;
273 
274       if (msa->sqacc)  { tmp    = msa->sqacc[i];  msa->sqacc[i]  = msa->sqacc[N-1];  msa->sqacc[N-1]  = tmp;    }
275       if (msa->sqdesc) { tmp    = msa->sqdesc[i]; msa->sqdesc[i] = msa->sqdesc[N-1]; msa->sqdesc[N-1] = tmp;    }
276       if (msa->ss)     { tmp    = msa->ss[i];     msa->ss[i]     = msa->ss[N-1];     msa->ss[N-1]     = tmp;    }
277       if (msa->sa)     { tmp    = msa->sa[i];     msa->sa[i]     = msa->sa[N-1];     msa->sa[N-1]     = tmp;    }
278       if (msa->pp)     { tmp    = msa->pp[i];     msa->pp[i]     = msa->pp[N-1];     msa->pp[N-1]     = tmp;    }
279       if (msa->sqlen)  { tmplen = msa->sqlen[i];  msa->sqlen[i]  = msa->sqlen[N-1];  msa->sqlen[N-1]  = tmplen; }
280       if (msa->sslen)  { tmplen = msa->sslen[i];  msa->sslen[i]  = msa->sslen[N-1];  msa->sslen[N-1]  = tmplen; }
281       if (msa->salen)  { tmplen = msa->salen[i];  msa->salen[i]  = msa->salen[N-1];  msa->salen[N-1]  = tmplen; }
282       if (msa->pplen)  { tmplen = msa->pplen[i];  msa->pplen[i]  = msa->pplen[N-1];  msa->pplen[N-1]  = tmplen; }
283 
284       for (tag = 0; tag < msa->ngs; tag++) if (msa->gs[tag]) { tmp = msa->gs[tag][i]; msa->gs[tag][i] = msa->gs[tag][N-1]; msa->gs[tag][N-1] = tmp; }
285       for (tag = 0; tag < msa->ngr; tag++) if (msa->gr[tag]) { tmp = msa->gr[tag][i]; msa->gr[tag][i] = msa->gr[tag][N-1]; msa->gr[tag][N-1] = tmp; }
286     }
287 
288   /* if <msa> has a keyhash that maps seqname => seqidx, we'll need to rebuild it. */
289   if (msa->index)
290     {
291       esl_keyhash_Reuse(msa->index);
292       for (i = 0; i < msa->nseq; i++)
293 	esl_keyhash_Store(msa->index, msa->sqname[i], -1, NULL);
294     }
295 
296   return eslOK;
297 }
298 
299 
300 /*****************************************************************
301  * 4. Shuffling pairwise (QRNA) alignments
302  *****************************************************************/
303 
304 /* Function: esl_msashuffle_XQRNA()
305  * Synopsis: Gap-preserving column shuffle of a digital pairwise alignment.
306  * Incept:   SRE, Tue Jan 22 09:09:52 2008 [Market Street Cafe, Leesburg]
307  *
308  * Purpose:  Shuffle a digital pairwise alignment <x>,<y> while
309  *           preserving the position of gaps, where both sequences are
310  *           in digital alphabet <abc>, using the random number
311  *           generator <r>. Return the shuffled alignment in <xs>,
312  *           <ys>. Caller provides allocated space for <xs> and <ys>
313  *           for at least the same length of <x>,<y>.
314  *
315  *           Works by doing three separate
316  *           shuffles, of (1) columns with residues in both
317  *           <x> and <y>, (2) columns with residue in <x> and gap in <y>,
318  *           and (3) columns with gap in <x> and residue in <y>.
319  *
320  *           <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
321  *           an alignment "in place", destroying the original
322  *           alignment, just call <esl_msashuffle_XQRNA(r, abc, x,y,x,y)>.
323  *
324  * Returns:  <eslOK> on success, and the shuffled alignment is
325  *           returned in <xs>, <ys>.
326  *
327  * Throws:   <eslEMEM> on allocation failure.
328  */
329 int
esl_msashuffle_XQRNA(ESL_RANDOMNESS * r,ESL_ALPHABET * abc,ESL_DSQ * x,ESL_DSQ * y,ESL_DSQ * xs,ESL_DSQ * ys)330 esl_msashuffle_XQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, ESL_DSQ *x, ESL_DSQ *y, ESL_DSQ *xs, ESL_DSQ *ys)
331 {
332   int  L;
333   int *xycol = NULL;
334   int *xcol  = NULL;
335   int *ycol  = NULL;
336   int  nxy, nx, ny;
337   int  i;
338   int  pos, c;
339   char xsym, ysym;
340   int  status;
341 
342   L = esl_abc_dsqlen(x);
343   if (esl_abc_dsqlen(y) != L) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");
344 
345   if (xs != x) esl_abc_dsqcpy(x, L, xs);
346   if (ys != y) esl_abc_dsqcpy(y, L, ys);
347 
348   /* First, construct three arrays containing lists of the column positions
349    * of the three types of columns. (If a column contains gaps in both x and y,
350    * we've already simply copied it to the shuffled sequence.)
351    */
352   ESL_ALLOC(xycol, sizeof(int) * L);
353   ESL_ALLOC(xcol,  sizeof(int) * L);
354   ESL_ALLOC(ycol,  sizeof(int) * L);
355   nxy = nx = ny = 0;
356 
357   for (i = 1; i <= L; i++)
358     {
359       if      (  esl_abc_XIsGap(abc, x[i]) &&   esl_abc_XIsGap(abc, y[i])) { continue; }
360       else if (! esl_abc_XIsGap(abc, x[i]) && ! esl_abc_XIsGap(abc, y[i])) { xycol[nxy] = i; nxy++; }
361       else if (  esl_abc_XIsGap(abc, x[i]))                                { ycol[ny] = i;   ny++;  }
362       else if (  esl_abc_XIsGap(abc, y[i]))                                { xcol[nx] = i;   nx++;  }
363     }
364 
365   /* Second, shuffle the sequences indirectly, via shuffling these arrays.
366    * Yow, careful with those indices, and with order of the statements...
367    */
368   for (; nxy > 1; nxy--) {
369     pos              = esl_rnd_Roll(r, nxy);
370     xsym             = xs[xycol[pos]];   ysym             = ys[xycol[pos]];    c            = xycol[pos];
371     xs[xycol[pos]]   = xs[xycol[nxy-1]]; ys[xycol[pos]]   = ys[xycol[nxy-1]];  xycol[pos]   = xycol[nxy-1];
372     xs[xycol[nxy-1]] = xsym;             ys[xycol[nxy-1]] = ysym;              xycol[pos]   = c;
373   }
374   for (; nx > 1; nx--) {
375     pos            = esl_rnd_Roll(r, nx);
376     xsym           = xs[xcol[pos]];  ysym           = ys[xcol[pos]];  c          = xcol[pos];
377     xs[xcol[pos]]  = xs[xcol[nx-1]]; ys[xcol[pos]]  = ys[xcol[nx-1]]; xcol[pos]  = xcol[nx-1];
378     xs[xcol[nx-1]] = xsym;           ys[xcol[nx-1]] = ysym;           xcol[nx-1] = c;
379   }
380   for (; ny > 1; ny--) {
381     pos            = esl_rnd_Roll(r, ny);
382     xsym           = xs[ycol[pos]];  ysym           = ys[ycol[pos]];  c          = ycol[pos];
383     xs[ycol[pos]]  = xs[ycol[ny-1]]; ys[ycol[pos]]  = ys[ycol[ny-1]]; ycol[pos]  = ycol[ny-1];
384     xs[ycol[ny-1]] = xsym;           ys[ycol[ny-1]] = ysym;           ycol[ny-1] = c;
385   }
386 
387   free(xycol); free(xcol); free(ycol);
388   return eslOK;
389 
390  ERROR:
391   if (xycol != NULL) free(xycol);
392   if (xcol  != NULL) free(xcol);
393   if (ycol  != NULL) free(ycol);
394   return status;
395 }
396 
397 /* Function: esl_msashuffle_CQRNA()
398  * Synopsis: Gap-preserving column shuffle of a pairwise alignment.
399  * Incept:   SRE, Tue Jan 22 08:45:34 2008 [Market Street Cafe, Leesburg]
400  *
401  * Purpose:  Shuffle a pairwise alignment <x>,<y> while preserving the
402  *           position of gaps, using the random number generator <r>.
403  *           Return the shuffled alignment in <xs>,
404  *           <ys>. Caller provides allocated space for <xs> and <ys>.
405  *
406  *           An alphabet <abc> must also be provided, solely for the
407  *           definition of gap characters. Because Easel's default
408  *           alphabets (DNA, RNA, and protein) all use the same
409  *           definition of gap characters <-_.>, you can actually
410  *           provide any alphabet here, and get the same results.
411  *           (This may save having to determine the alphabet of input
412  *           sequences.)
413  *
414  *           Works by doing three separate
415  *           shuffles, of (1) columns with residues in both
416  *           <x> and <y>, (2) columns with residue in <x> and gap in <y>,
417  *           and (3) columns with gap in <x> and residue in <y>.
418  *
419  *           <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
420  *           an alignment "in place", destroying the original
421  *           alignment, just call <esl_msashuffle_CQRNA(r, abc, x,y,x,y)>.
422  *
423  * Returns:  <eslOK> on success, and the shuffled alignment is
424  *           returned in <xs>, <ys>.
425  *
426  * Throws:   <eslEMEM> on allocation failure.
427  */
428 int
esl_msashuffle_CQRNA(ESL_RANDOMNESS * r,ESL_ALPHABET * abc,char * x,char * y,char * xs,char * ys)429 esl_msashuffle_CQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, char *x, char *y, char *xs, char *ys)
430 {
431   int  L;
432   int *xycol = NULL;
433   int *xcol  = NULL;
434   int *ycol  = NULL;
435   int  nxy, nx, ny;
436   int  i;
437   int  pos, c;
438   char xsym, ysym;
439   int  status;
440 
441   if (xs != x) strcpy(xs, x);
442   if (ys != y) strcpy(ys, y);
443 
444   /* First, construct three arrays containing lists of the column positions
445    * of the three types of columns. (If a column contains gaps in both x and y,
446    * we've already simply copied it to the shuffled sequence.)
447    */
448   L = strlen(x);
449   if (strlen(y) != L) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");
450   ESL_ALLOC(xycol, sizeof(int) * L);
451   ESL_ALLOC(xcol,  sizeof(int) * L);
452   ESL_ALLOC(ycol,  sizeof(int) * L);
453   nxy = nx = ny = 0;
454 
455   for (i = 0; i < L; i++)
456     {
457       if      (  esl_abc_CIsGap(abc, x[i]) &&   esl_abc_CIsGap(abc, y[i])) { continue; }
458       else if (! esl_abc_CIsGap(abc, x[i]) && ! esl_abc_CIsGap(abc, y[i])) { xycol[nxy] = i; nxy++; }
459       else if (  esl_abc_CIsGap(abc, x[i]))                                { ycol[ny] = i;   ny++;  }
460       else if (  esl_abc_CIsGap(abc, y[i]))                                { xcol[nx] = i;   nx++;  }
461     }
462 
463   /* Second, shuffle the sequences indirectly, via shuffling these arrays.
464    * Yow, careful with those indices, and with order of the statements...
465    */
466   for (; nxy > 1; nxy--) {
467     pos              = esl_rnd_Roll(r, nxy);
468     xsym             = xs[xycol[pos]];   ysym             = ys[xycol[pos]];    c            = xycol[pos];
469     xs[xycol[pos]]   = xs[xycol[nxy-1]]; ys[xycol[pos]]   = ys[xycol[nxy-1]];  xycol[pos]   = xycol[nxy-1];
470     xs[xycol[nxy-1]] = xsym;             ys[xycol[nxy-1]] = ysym;              xycol[pos]   = c;
471   }
472   for (; nx > 1; nx--) {
473     pos            = esl_rnd_Roll(r, nx);
474     xsym           = xs[xcol[pos]];  ysym           = ys[xcol[pos]];  c          = xcol[pos];
475     xs[xcol[pos]]  = xs[xcol[nx-1]]; ys[xcol[pos]]  = ys[xcol[nx-1]]; xcol[pos]  = xcol[nx-1];
476     xs[xcol[nx-1]] = xsym;           ys[xcol[nx-1]] = ysym;           xcol[nx-1] = c;
477   }
478   for (; ny > 1; ny--) {
479     pos            = esl_rnd_Roll(r, ny);
480     xsym           = xs[ycol[pos]];  ysym           = ys[ycol[pos]];  c          = ycol[pos];
481     xs[ycol[pos]]  = xs[ycol[ny-1]]; ys[ycol[pos]]  = ys[ycol[ny-1]]; ycol[pos]  = ycol[ny-1];
482     xs[ycol[ny-1]] = xsym;           ys[ycol[ny-1]] = ysym;           ycol[ny-1] = c;
483   }
484 
485   free(xycol); free(xcol); free(ycol);
486   return eslOK;
487 
488  ERROR:
489   if (xycol != NULL) free(xycol);
490   if (xcol  != NULL) free(xcol);
491   if (ycol  != NULL) free(ycol);
492   return status;
493 }
494 
495 
496 
497 
498 /*****************************************************************
499  * 5. Example.
500  *****************************************************************/
501 #ifdef eslMSASHUFFLE_EXAMPLE
502 #include <stdio.h>
503 #include "easel.h"
504 #include "esl_alphabet.h"
505 #include "esl_getopts.h"
506 #include "esl_msa.h"
507 #include "esl_msafile.h"
508 #include "esl_random.h"
509 
510 static ESL_OPTIONS options[] = {
511   /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
512   { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",        0 },
513   { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use DNA alphabet",                            0 },
514   { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use RNA alphabet",                            0 },
515   { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use protein alphabet",                        0 },
516   { "--text",      eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use text mode: no digital alphabet",          0 },
517   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
518 };
519 static char usage[]  = "[-options] <msafile>";
520 static char banner[] = "example of multiple alignment shuffling/permuting";
521 
522 int
main(int argc,char ** argv)523 main(int argc, char **argv)
524 {
525   ESL_GETOPTS    *go        = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
526   ESL_RANDOMNESS *rng       = esl_randomness_Create(0);
527   char           *msafile   = esl_opt_GetArg(go, 1);
528   int             fmt       = eslMSAFILE_UNKNOWN;
529   ESL_ALPHABET   *abc       = NULL;
530   ESL_MSAFILE    *afp       = NULL;
531   ESL_MSA        *msa       = NULL;
532   int             textmode  = esl_opt_GetBoolean(go, "--text");
533   int             nali      = 0;
534   int             status;
535 
536   /* If you know the alphabet you want, create it - you'll pass it to esl_msafile_Open() */
537   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
538   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
539   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
540 
541   /* Open in text or digital mode.
542    *   To let the Open() function autoguess the format, you pass <infmt=eslMSAFILE_UNKNOWN>.
543    *   To let it autoguess the alphabet, you set <abc=NULL> and pass <&abc>.
544    *   To open in text mode instead of digital, you pass <NULL> for the alphabet argument.
545    * esl_msafile_OpenFailure() is a convenience, printing various diagnostics of any
546    * open failure to <stderr>. You can of course handle your own diagnostics instead.
547    */
548   if (textmode) status = esl_msafile_Open(NULL, msafile, NULL, fmt, NULL, &afp);
549   else          status = esl_msafile_Open(&abc, msafile, NULL, fmt, NULL, &afp);
550   if (status != eslOK)   esl_msafile_OpenFailure(afp, status);
551 
552   fmt = afp->format;
553 
554   while ((status = esl_msafile_Read(afp, &msa)) == eslOK)
555     {
556       /* if digital MSA: msa->ax[idx=0..nseq-1][acol=1..alen] is the alignment data;
557        * if text MSA:  msa->aseq[idx=0..nseq-1][acol=0..alen-1] */
558       nali++;
559 
560       /* permute it */
561       esl_msashuffle_PermuteSequenceOrder(rng, msa);
562 
563       esl_msafile_Write(stdout, msa, fmt);
564       esl_msa_Destroy(msa);
565     }
566   if (nali == 0 || status != eslEOF) esl_msafile_ReadFailure(afp, status); /* a convenience, like esl_msafile_OpenFailure() */
567 
568   esl_alphabet_Destroy(abc);
569   esl_msafile_Close(afp);
570   esl_randomness_Destroy(rng);
571   esl_getopts_Destroy(go);
572   exit(0);
573 }
574 #endif
575 
576