1 /* Generating, shuffling, and randomizing sequences.
2  *
3  * Contents:
4  *   1. Generating simple random character strings.
5  *   2. Generating iid sequences.
6  *   3. Shuffling sequences.
7  *   4. Randomizing sequences.
8  *   5. Generating iid sequences (digital mode).
9  *   6. Shuffling sequences (digital mode).
10  *   7. Randomizing sequences (digital mode).
11  *   8. Statistics drivers.
12  *   9. Unit tests.
13  *  10. Test driver.
14  *  11. Example.
15  */
16 #include "esl_config.h"
17 
18 #include <stdlib.h>
19 #include <string.h>
20 #include <ctype.h>
21 #include <math.h>
22 #include <time.h>
23 
24 #include "easel.h"
25 #include "esl_alphabet.h"
26 #include "esl_arr2.h"
27 #include "esl_random.h"
28 #include "esl_randomseq.h"
29 
30 
31 /*****************************************************************
32  * 1. Generating simple random character strings.
33  *****************************************************************/
34 
35 /* Function:  esl_rsq_Sample()
36  * Synopsis:  Generate a random character string.
37  *
38  * Purpose:   Sample a random character string of length <L>,
39  *            consisting of characters in the set defined by
40  *            an integer flag <allowed_chars>, using
41  *            random number generator <rng>.
42  *
43  *            Return the new NUL-terminated string in <*ret_s>.  This
44  *            may either be a new allocation, or in pre-allocated
45  *            storage provided by the caller. If caller passes
46  *            <*ret_s> as <NULL>, new space is allocated, and the
47  *            caller is responsible for freeing it. That is:
48  *              \begin{cchunk}
49  *                 char *s  = NULL;
50  *                 esl_rsq_Sample(..., &s);
51  *                 free(s);
52  *               \end{cchunk}
53  *
54  *            If caller passes a non-<NULL> <*ret_s>, it is assumed to
55  *            be a preallocated space of at least <L+1> characters,
56  *            and caller is (of course) responsible for freeing
57  *            it. That is:
58  *                \begin{cchunk}
59  *                   char *s = malloc(L+1);
60  *                   esl_rsq_Sample(...,L, &s);
61  *                   free(s);
62  *                \end{cchunk}
63  *
64  *            Allowed values of the flag <allowed_char_flag> mirror
65  *            the standard C99 character set functions in <ctype.h>:
66  *
67  *            | <eslRSQ_SAMPLE_ALNUM>  |  isalnum()  | isalpha() or isdigit() |
68  *            | <eslRSQ_SAMPLE_ALPHA>  |  isalpha()  | islower() or isupper() |
69  *            | <eslRSQ_SAMPLE_LOWER>  |  islower()  | [a-z] |
70  *            | <eslRSQ_SAMPLE_UPPER>  |  isupper()  | [A-Z] |
71  *            | <eslRSQ_SAMPLE_DIGIT>  |  isdigit()  | [0-9] |
72  *            | <eslRSQ_SAMPLE_XDIGIT> |  isxdigit() | [0-9] or [a-f] or [A-F] |
73  *            | <eslRSQ_SAMPLE_CNTRL>  |  iscntrl()  | ASCII control characters |
74  *            | <eslRSQ_SAMPLE_GRAPH>  |  isgraph()  | any printing char except space |
75  *            | <eslRSQ_SAMPLE_SPACE>  |  isspace()  | space, and other whitespace such as tab, newline |
76  *            | <eslRSQ_SAMPLE_BLANK>  |  isblank()  | space or tab |
77  *            | <eslRSQ_SAMPLE_PRINT>  |  isprint()  | any printing char including space |
78  *            | <eslRSQ_SAMPLE_PUNCT>  |  ispunct()  | punctuation |
79  *
80  *            Note that with <eslRSQ_SAMPLE_CNTRL>, your string
81  *            may sample NUL control characters (<0>), in addition to
82  *            the string-terminating one at <(*ret_s)[L]>, so <strlen(*ret_s)>
83  *            may not equal <L> in this case.
84  *
85  *            These values are exclusive: you use one and only one of
86  *            them as <allowed_chars>, you can't logically OR or NOT
87  *            them together.
88  *
89  * Args:      rng           - ESL_RANDOMNESS object, the random number generator
90  *            allowed_chars - allowed character set flag: eslRSQ_SAMPLE_*
91  *            L             - length of string to sample
92  *            ret_s         - RETURN: the NUL-terminated string
93  *
94  * Returns:   <eslOK> on success; <*ret_s> is the sampled string, which was
95  *            newly allocated here, and caller becomes responsible for free'ing.
96  *
97  * Throws:    <eslEMEM> on allocation error; <eslEINVAL> if caller
98  *            passes an invalid value of <allowed_chars>. Now <*ret_s>
99  *            is <NULL>.
100  */
101 int
esl_rsq_Sample(ESL_RANDOMNESS * rng,int allowed_chars,int L,char ** ret_s)102 esl_rsq_Sample(ESL_RANDOMNESS *rng, int allowed_chars, int L, char **ret_s)
103 {
104   char *s = *ret_s;  // if s == NULL, we will allocate here. Else, we're using caller-provided allocation
105   int   n = 0;
106   char  c[127];
107   int   x,i;
108   int   status;
109 
110   /* We can't portably make assumptions about char codes (EBCDIC,
111    * ASCII...); and we don't want to write a bunch of fiddly overhead
112    * initializing static tables. So, quickly and portably build an
113    * array c[0..n-1] of characters we will sample uniformly from.
114    * RNG sampling is fairly compute-intensive anyway, so this time
115    * should disappear in the noise of that.
116    */
117   switch (allowed_chars) {
118   case eslRSQ_SAMPLE_ALNUM:  for (x = 0; x < 128; x++) if (isalnum(x))  c[n++] = x; break;
119   case eslRSQ_SAMPLE_ALPHA:  for (x = 0; x < 128; x++) if (isalpha(x))  c[n++] = x; break;
120   case eslRSQ_SAMPLE_LOWER:  for (x = 0; x < 128; x++) if (islower(x))  c[n++] = x; break;
121   case eslRSQ_SAMPLE_UPPER:  for (x = 0; x < 128; x++) if (isupper(x))  c[n++] = x; break;
122   case eslRSQ_SAMPLE_DIGIT:  for (x = 0; x < 128; x++) if (isdigit(x))  c[n++] = x; break;
123   case eslRSQ_SAMPLE_XDIGIT: for (x = 0; x < 128; x++) if (isxdigit(x)) c[n++] = x; break;
124   case eslRSQ_SAMPLE_CNTRL:  for (x = 0; x < 128; x++) if (iscntrl(x))  c[n++] = x; break;
125   case eslRSQ_SAMPLE_GRAPH:  for (x = 0; x < 128; x++) if (isgraph(x))  c[n++] = x; break;
126   case eslRSQ_SAMPLE_SPACE:  for (x = 0; x < 128; x++) if (isspace(x))  c[n++] = x; break;
127   case eslRSQ_SAMPLE_BLANK:  for (x = 0; x < 128; x++) if (isblank(x))  c[n++] = x; break;
128   case eslRSQ_SAMPLE_PRINT:  for (x = 0; x < 128; x++) if (isprint(x))  c[n++] = x; break;
129   case eslRSQ_SAMPLE_PUNCT:  for (x = 0; x < 128; x++) if (ispunct(x))  c[n++] = x; break;
130   default: ESL_XEXCEPTION(eslEINVAL, "bad flag; wanted something like eslRSQ_SAMPLE_ALPHA");
131   }
132 
133   if (!s) ESL_ALLOC(s, sizeof(char) * (L+1)); /* +\0 */
134 
135   for (i = 0; i < L; i++)
136     s[i] = c[ esl_rnd_Roll(rng, n) ];
137   s[L] = '\0';
138 
139   *ret_s = s;   // if using caller-provided space, this is a no-op, passing back the same *ret_s we got.
140   return eslOK;
141 
142  ERROR:
143   if (! *ret_s && s) free(s);  // if we allocated s here, clean up.
144   return status;
145 }
146 
147 
148 /*****************************************************************
149  *# 1. Generating iid sequences.
150  *****************************************************************/
151 
152 /* Function: esl_rsq_IID()
153  * Synopsis: Generate an iid random text sequence.
154  * Incept:   SRE, Thu Aug  5 09:03:03 2004 [St. Louis]
155  *
156  * Purpose:  Generate a <NUL>-terminated i.i.d. symbol string of length <L>,
157  *           $0..L-1$, and leave it in <s>. The symbol alphabet is given
158  *           as a string <alphabet> of <K> total symbols, and the iid
159  *           probability of each residue is given in <p>. The caller
160  *           must provide an <s> that is allocated for at least
161  *           <(L+1)*sizeof(char)>, room for <L> residues and the <NUL> terminator.
162  *
163  *           <esl_rsq_fIID()> does the same, but for a floating point
164  *           probability vector <p>, rather than a double precision
165  *           vector.
166  *
167  * Args:     r         - ESL_RANDOMNESS object
168  *           alphabet  - e.g. "ACGT"
169  *           p         - probability distribution [0..n-1]
170  *           K         - number of symbols in alphabet
171  *           L         - length of generated sequence
172  *           s         - the generated sequence.
173  *                       Caller allocated, >= (L+1) * sizeof(char).
174  *
175  * Return:   <eslOK> on success.
176  */
177 int
esl_rsq_IID(ESL_RANDOMNESS * r,const char * alphabet,const double * p,int K,int L,char * s)178 esl_rsq_IID(ESL_RANDOMNESS *r, const char *alphabet, const double *p, int K, int L, char *s)
179 {
180   int   x;
181 
182   for (x = 0; x < L; x++)
183     s[x] = alphabet[esl_rnd_DChoose(r,p,K)];
184   s[x] = '\0';
185   return eslOK;
186 }
187 int
esl_rsq_fIID(ESL_RANDOMNESS * r,const char * alphabet,const float * p,int K,int L,char * s)188 esl_rsq_fIID(ESL_RANDOMNESS *r, const char *alphabet, const float *p, int K, int L, char *s)
189 {
190   int   x;
191 
192   for (x = 0; x < L; x++)
193     s[x] = alphabet[esl_rnd_FChoose(r,p,K)];
194   s[x] = '\0';
195   return eslOK;
196 }
197 /*------------ end, generating iid sequences --------------------*/
198 
199 
200 /*****************************************************************
201  *# 2. Shuffling sequences.
202  *****************************************************************/
203 
204 /* Function:  esl_rsq_CShuffle()
205  * Synopsis:  Shuffle a text sequence.
206  * Incept:    SRE, Fri Feb 23 08:17:50 2007 [Casa de Gatos]
207  *
208  * Purpose:   Returns a shuffled version of <s> in <shuffled>, given
209  *            a source of randomness <r>.
210  *
211  *            Caller provides allocated storage for <shuffled>, for at
212  *            least the same length as <s>.
213  *
214  *            <shuffled> may also point to the same storage as <s>,
215  *            in which case <s> is shuffled in place.
216  *
217  * Returns:   <eslOK> on success.
218  */
219 int
esl_rsq_CShuffle(ESL_RANDOMNESS * r,const char * s,char * shuffled)220 esl_rsq_CShuffle(ESL_RANDOMNESS *r, const char  *s, char *shuffled)
221 {
222   int  L, i;
223   char c;
224 
225   L = strlen(s);
226   if (shuffled != s) strcpy(shuffled, s);
227   while (L > 1) {
228     i             = esl_rnd_Roll(r, L);
229     c             = shuffled[i];
230     shuffled[i]   = shuffled[L-1];
231     shuffled[L-1] = c;
232     L--;
233   }
234   return eslOK;
235 }
236 
237 /* Function:  esl_rsq_CShuffleDP()
238  * Synopsis:  Shuffle a text sequence, preserving diresidue composition.
239  * Incept:    SRE, Fri Feb 23 08:56:03 2007 [Casa de Gatos]
240  *
241  * Purpose:   Given string <s>, and a source of randomness <r>,
242  *            returns shuffled version in <shuffled>. The shuffle
243  *            is a "doublet-preserving" (DP) shuffle which
244  *            shuffles a sequence while exactly preserving both mono-
245  *            and di-symbol composition.
246  *
247  *            <s> may only consist of alphabetic characters [a-zA-Z].
248  *            The shuffle is done case-insensitively. The shuffled
249  *            string result is all upper case.
250  *
251  *            Caller provides storage in <shuffled> of at least the
252  *            same length as <s>.
253  *
254  *            <shuffled> may also point to the same storage as <s>,
255  *            in which case <s> is shuffled in place.
256  *
257  *            The algorithm does an internal allocation of a
258  *            substantial amount of temporary storage, on the order of
259  *            <26 * strlen(s)>, so an allocation failure is possible
260  *            if <s> is long enough.
261  *
262  *            The algorithm is a search for a random Eulerian walk on
263  *            a directed multigraph \citep{AltschulErickson85}.
264  *
265  *            If <s> is of length 2 or less, this is a no-op, and
266  *            <shuffled> is a copy of <s>.
267  *
268  * Returns:   <eslOK> on success.
269  *
270  * Throws:    <eslEINVAL> if <s> contains nonalphabetic characters.
271  *            <eslEMEM> on allocation failure.
272  */
273 int
esl_rsq_CShuffleDP(ESL_RANDOMNESS * r,const char * s,char * shuffled)274 esl_rsq_CShuffleDP(ESL_RANDOMNESS *r, const char *s, char *shuffled)
275 {
276   int    status;          /* Easel return status code */
277   int    len;	          /* length of s */
278   int    pos;	          /* a position in s or shuffled */
279   int    x,y;             /* indices of two characters */
280   char **E  = NULL;       /* edge lists: E[0] is the edge list from vertex A */
281   int   *nE = NULL;       /* lengths of edge lists */
282   int   *iE = NULL;       /* positions in edge lists */
283   int    n;	          /* tmp: remaining length of an edge list to be shuffled */
284   char   sf;              /* last character in shuffled */
285   char   Z[26];           /* connectivity in last edge graph Z */
286   int    keep_connecting; /* flag used in Z connectivity algorithm */
287   int    is_eulerian;	  /* flag used for when we've got a good Z */
288 
289   /* First, verify that the string is entirely alphabetic. */
290   len = strlen(s);
291   for (pos = 0; pos < len; pos++)
292     if (! isalpha((int) s[pos]))
293       ESL_EXCEPTION(eslEINVAL, "String contains nonalphabetic characters");
294 
295   /* The edge case of len <= 2 */
296   if (len <= 2)
297     {
298       if (s != shuffled) strcpy(shuffled, s);
299       return eslOK;
300     }
301 
302   /* Allocations. */
303   ESL_ALLOC(E,  sizeof(char *) * 26);   for (x = 0; x < 26; x++) E[x] = NULL;
304   ESL_ALLOC(nE, sizeof(int)    * 26);   for (x = 0; x < 26; x++) nE[x] = 0;
305   ESL_ALLOC(iE, sizeof(int)    * 26);   for (x = 0; x < 26; x++) iE[x] = 0;
306   for (x = 0; x < 26; x++)
307     ESL_ALLOC(E[x], sizeof(char) * (len-1));
308 
309   /* "(1) Construct the doublet graph G and edge ordering E
310    *      corresponding to S."
311    *
312    * Note that these also imply the graph G; and note,
313    * for any list x with nE[x] = 0, vertex x is not part
314    * of G.
315    */
316   x = toupper((int) s[0]) - 'A';
317   for (pos = 1; pos < len; pos++)
318     {
319       y = toupper((int) s[pos]) - 'A';
320       E[x][nE[x]] = y;
321       nE[x]++;
322       x = y;
323     }
324 
325   /* Now we have to find a random Eulerian edge ordering. */
326   sf = toupper((int) s[len-1]) - 'A';
327   is_eulerian = 0;
328   while (! is_eulerian)
329     {
330       /* "(2) For each vertex s in G except s_f, randomly select
331        *      one edge from the s edge list of E(S) to be the
332        *      last edge of the s list in a new edge ordering."
333        *
334        * select random edges and move them to the end of each
335        * edge list.
336        */
337       for (x = 0; x < 26; x++)
338 	{
339 	  if (nE[x] == 0 || x == sf) continue;
340 	  pos           = esl_rnd_Roll(r, nE[x]);
341 	  ESL_SWAP(E[x][pos], E[x][nE[x]-1], char);
342 	}
343 
344       /* "(3) From this last set of edges, construct the last-edge
345        *      graph Z and determine whether or not all of its
346        *      vertices are connected to s_f."
347        *
348        * a probably stupid algorithm for looking at the
349        * connectivity in Z: iteratively sweep through the
350        * edges in Z, and build up an array (confusing called Z[x])
351        * whose elements are 1 if x is connected to sf, else 0.
352        */
353       for (x = 0; x < 26; x++) Z[x] = 0;
354       Z[(int) sf] = keep_connecting = 1;
355 
356       while (keep_connecting) {
357 	keep_connecting = 0;
358 	for (x = 0; x < 26; x++) {
359 	  if (nE[x] == 0) continue;
360 	  y = E[x][nE[x]-1];            /* xy is an edge in Z */
361 	  if (Z[x] == 0 && Z[y] == 1) {  /* x is connected to sf in Z */
362 	    Z[x] = 1;
363 	    keep_connecting = 1;
364 	  }
365 	}
366       }
367 
368       /* if any vertex in Z is tagged with a 0, it's
369        * not connected to sf, and we won't have a Eulerian
370        * walk.
371        */
372       is_eulerian = 1;
373       for (x = 0; x < 26; x++) {
374 	if (nE[x] == 0 || x == sf) continue;
375 	if (Z[x] == 0) {
376 	  is_eulerian = 0;
377 	  break;
378 	}
379       }
380 
381       /* "(4) If any vertex is not connected in Z to s_f, the
382        *      new edge ordering will not be Eulerian, so return to
383        *      (2). If all vertices are connected in Z to s_f,
384        *      the new edge ordering will be Eulerian, so
385        *      continue to (5)."
386        *
387        * e.g. note infinite loop while is_eulerian is FALSE.
388        */
389     }
390 
391   /* "(5) For each vertex s in G, randomly permute the remaining
392    *      edges of the s edge list of E(S) to generate the s
393    *      edge list of the new edge ordering E(S')."
394    *
395    * Essentially a StrShuffle() on the remaining nE[x]-1 elements
396    * of each edge list; unfortunately our edge lists are arrays,
397    * not strings, so we can't just call out to StrShuffle().
398    */
399   for (x = 0; x < 26; x++)
400     for (n = nE[x] - 1; n > 1; n--)
401       {
402 	pos       = esl_rnd_Roll(r, n);
403 	ESL_SWAP(E[x][pos], E[x][n-1], char);
404       }
405 
406   /* "(6) Construct sequence S', a random DP permutation of
407    *      S, from E(S') as follows. Start at the s_1 edge list.
408    *      At each s_i edge list, add s_i to S', delete the
409    *      first edge s_i,s_j of the edge list, and move to
410    *      the s_j edge list. Continue this process until
411    *      all edge lists are exhausted."
412    */
413   pos = 0;
414   x = toupper((int) s[0]) - 'A';
415   while (1)
416     {
417       shuffled[pos++] = 'A'+ x; /* add s_i to S' */
418 
419       y = E[x][iE[x]];
420       iE[x]++;			/* "delete" s_i,s_j from edge list */
421 
422       x = y;			/* move to s_j edge list. */
423 
424       if (iE[x] == nE[x])
425 	break;			/* the edge list is exhausted. */
426     }
427   shuffled[pos++] = 'A' + sf;
428   shuffled[pos]   = '\0';
429 
430   /* Reality checks.
431    */
432   if (x   != sf)  ESL_XEXCEPTION(eslEINCONCEIVABLE, "hey, you didn't end on s_f.");
433   if (pos != len) ESL_XEXCEPTION(eslEINCONCEIVABLE, "hey, pos (%d) != len (%d).", pos, len);
434 
435   /* Free and return.
436    */
437   esl_arr2_Destroy((void **) E, 26);
438   free(nE);
439   free(iE);
440   return eslOK;
441 
442  ERROR:
443   esl_arr2_Destroy((void **) E, 26);
444   if (nE != NULL) free(nE);
445   if (iE != NULL) free(iE);
446   return status;
447 }
448 
449 
450 /* Function:  esl_rsq_CShuffleKmers()
451  * Synopsis:  Shuffle k-mers in a text sequence.
452  * Incept:    SRE, Tue Nov 17 16:55:57 2009 [NHGRI retreat, Gettysburg]
453  *
454  * Purpose:   Consider a text sequence <s> as a string of nonoverlapping
455  *            k-mers of length <K>. Shuffle the k-mers, given a random
456  *            number generator <r>. Put the shuffled sequence in
457  *            <shuffled>.
458  *
459  *            If the length of <s> is not evenly divisible by <K>, the
460  *            remaining residues are left (unshuffled) as a prefix to
461  *            the shuffled k-mers.
462  *
463  *            For example, shuffling ABCDEFGHIJK as k=3-mers might
464  *            result in ABFIJKFGHCDE.
465  *
466  *            Caller provides allocated storage for <shuffled>,
467  *            for at least the same length as <s>.
468  *
469  *            <shuffled> may also point to the same storage as <s>,
470  *            in which case <s> is shuffled in place.
471  *
472  *            There is almost no formally justifiable reason why you'd
473  *            use this shuffle -- it's not like it preserves any
474  *            particularly well-defined statistical properties of the
475  *            sequence -- but it's a quick and dirty way to sort of
476  *            maybe possibly preserve some higher-than-monomer
477  *            statistics.
478  *
479  * Args:      r        - an <ESL_RANDOMNESS> random generator
480  *            s        - sequence to shuffle
481  *            K        - size of k-mers to break <s> into
482  *            shuffled - RESULT: the shuffled sequence
483  *
484  * Returns:   <eslOK> on success.
485  *
486  * Throws:    <eslEMEM> on allocation error.
487  */
488 int
esl_rsq_CShuffleKmers(ESL_RANDOMNESS * r,const char * s,int K,char * shuffled)489 esl_rsq_CShuffleKmers(ESL_RANDOMNESS *r, const char *s, int K, char *shuffled)
490 {
491   int   L = strlen(s);
492   int   W = L / K;		/* number of kmers "words" excluding leftover prefix */
493   int   P = L % K;		/* leftover residues in prefix */
494   int   i;
495   char *swap = NULL;
496   int   status;
497 
498   if (shuffled != s) strcpy(shuffled, s);
499   ESL_ALLOC(swap, sizeof(char) * K);
500   while (W > 1)
501     {	/* use memmove, not strncpy or memcpy, because i==W-1 creates an overlap case */
502       i = esl_rnd_Roll(r, W);	                                                 /* pick a word          */
503       memmove(swap,                   shuffled + P + i*K,     K * sizeof(char)); /* copy it to tmp space */
504       memmove(shuffled + P + i*K,     shuffled + P + (W-1)*K, K * sizeof(char)); /* move word W-1 to i   */
505       memmove(shuffled + P + (W-1)*K, swap,                   K * sizeof(char)); /* move word i to W-1   */
506       W--;
507     }
508   free(swap);
509   return eslOK;
510 
511  ERROR:
512   free(swap);
513   return status;
514 }
515 
516 
517 /* Function:  esl_rsq_CReverse()
518  * Synopsis:  Reverse a string.
519  * Incept:    SRE, Sat Feb 24 10:06:34 2007 [Casa de Gatos]
520  *
521  * Purpose:   Returns a reversed version of <s> in <rev>.
522  *
523  *            There are no restrictions on the symbols that <s>
524  *            might contain.
525  *
526  *            Caller provides storage in <rev> for at least
527  *            <(strlen(s)+1)*sizeof(char)>.
528  *
529  *            <s> and <rev> can point to the same storage, in which
530  *            case <s> is reversed in place.
531  *
532  * Returns:   <eslOK> on success.
533  */
534 int
esl_rsq_CReverse(const char * s,char * rev)535 esl_rsq_CReverse(const char *s, char *rev)
536 {
537   int  L, i;
538   char c;
539 
540   L = strlen(s);
541   for (i = 0; i < L/2; i++)
542     {				/* swap ends */
543       c          = s[L-i-1];
544       rev[L-i-1] = s[i];
545       rev[i]     = c;
546     }
547   if (L%2) { rev[i] = s[i]; } /* don't forget middle residue in odd-length s */
548   rev[L] = '\0';
549   return eslOK;
550 }
551 
552 /* Function: esl_rsq_CShuffleWindows()
553  * Synopsis: Shuffle local windows of a text string.
554  * Incept:   SRE, Sat Feb 24 10:17:59 2007 [Casa de Gatos]
555  *
556  * Purpose:  Given string <s>, shuffle residues in nonoverlapping
557  *           windows of width <w>, and put the result in <shuffled>.
558  *           See [Pearson88].
559  *
560  *           <s> and <shuffled> can be identical to shuffle in place.
561  *
562  *           Caller provides storage in <shuffled> for at least
563  *           <(strlen(s)+1)*sizeof(char)>.
564  *
565  * Args:     s        - string to shuffle in windows
566  *           w        - window size (typically 10 or 20)
567  *           shuffled - allocated space for window-shuffled result.
568  *
569  * Return:   <eslOK> on success.
570  */
571 int
esl_rsq_CShuffleWindows(ESL_RANDOMNESS * r,const char * s,int w,char * shuffled)572 esl_rsq_CShuffleWindows(ESL_RANDOMNESS *r, const char *s, int w, char *shuffled)
573 {
574   int  L;
575   char c;
576   int  i, j, k;
577 
578   L = strlen(s);
579   if (shuffled != s) strcpy(shuffled, s);
580   for (i = 0; i < L; i += w)
581     for (j = ESL_MIN(L-1, i+w-1); j > i; j--)
582       {
583 	k             = i + esl_rnd_Roll(r, j-i);
584 	c             = shuffled[k];  /* semantics of a j,k swap, because we might be shuffling in-place */
585 	shuffled[k]   = shuffled[j];
586 	shuffled[j]   = c;
587       }
588   return eslOK;
589 }
590 /*------------------ end, shuffling sequences -------------------*/
591 
592 
593 
594 /*****************************************************************
595  *# 3. Randomizing sequences
596  *****************************************************************/
597 
598 /* Function:  esl_rsq_CMarkov0()
599  * Synopsis:  Generate new text string of same 0th order Markov properties.
600  * Incept:    SRE, Sat Feb 24 08:47:43 2007 [Casa de Gatos]
601  *
602  * Purpose:   Makes a random string <markoved> with the same length and
603  *            0-th order Markov properties as <s>, given randomness
604  *            source <r>.
605  *
606  *            <s> and <markoved> can be point to the same storage, in which
607  *            case <s> is randomized in place, destroying the original
608  *            string.
609  *
610  *            <s> must consist only of alphabetic characters [a-zA-Z].
611  *            Statistics are collected case-insensitively over 26 possible
612  *            residues. The random string is generated all upper case.
613  *
614  * Args:      s         - input string
615  *            markoved  - randomly generated string
616  *                        (storage allocated by caller, at least strlen(s)+1)
617  *
618  * Returns:   <eslOK> on success.
619  *
620  * Throws:    <eslEINVAL> if <s> contains nonalphabetic characters.
621  */
622 int
esl_rsq_CMarkov0(ESL_RANDOMNESS * r,const char * s,char * markoved)623 esl_rsq_CMarkov0(ESL_RANDOMNESS *r, const char *s, char *markoved)
624 {
625   int    L;
626   int    i;
627   double p[26];		/* initially counts, then probabilities */
628   int    x;
629 
630   /* First, verify that the string is entirely alphabetic. */
631   L = strlen(s);
632   for (i = 0; i < L; i++)
633     if (! isalpha((int) s[i]))
634       ESL_EXCEPTION(eslEINVAL, "String contains nonalphabetic characters");
635 
636   /* Collect zeroth order counts and convert to frequencies.
637    */
638   for (x = 0; x < 26; x++) p[x] = 0.;
639   for (i = 0; i < L; i++)
640     p[(int)(toupper((int) s[i]) - 'A')] += 1.0;
641   if (L > 0)
642     for (x = 0; x < 26; x++) p[x] /= (double) L;
643 
644   /* Generate a random string using those p's. */
645   for (i = 0; i < L; i++)
646     markoved[i] = esl_rnd_DChoose(r, p, 26) + 'A';
647   markoved[i] = '\0';
648 
649   return eslOK;
650 }
651 
652 /* Function:  esl_rsq_CMarkov1()
653  * Synopsis:  Generate new text string of same 1st order Markov properties.
654  * Incept:    SRE, Sat Feb 24 09:21:46 2007 [Casa de Gatos]
655  *
656  * Purpose:   Makes a random string <markoved> with the same length and
657  *            1st order (di-residue) Markov properties as <s>, given
658  *            randomness source <r>.
659  *
660  *            <s> and <markoved> can be point to the same storage, in which
661  *            case <s> is randomized in place, destroying the original
662  *            string.
663  *
664  *            <s> must consist only of alphabetic characters [a-zA-Z].
665  *            Statistics are collected case-insensitively over 26 possible
666  *            residues. The random string is generated all upper case.
667  *
668  *            If <s> is of length 2 or less, this is a no-op, and
669  *            <markoved> is a copy of <s>.
670  *
671  * Args:      s         - input string
672  *            markoved  - new randomly generated string
673  *                        (storage allocated by caller, at least strlen(s)+1)
674  *
675  * Returns:   <eslOK> on success.
676  *
677  * Throws:    <eslEINVAL> if <s> contains nonalphabetic characters.
678  */
679 int
esl_rsq_CMarkov1(ESL_RANDOMNESS * r,const char * s,char * markoved)680 esl_rsq_CMarkov1(ESL_RANDOMNESS *r, const char *s, char *markoved)
681 {
682   int    L;
683   int    i;
684   int    x,y;
685   int    i0;			/* initial symbol */
686   double p[26][26];		/* conditional probabilities p[x][y] = P(y | x) */
687   double p0[26];		/* marginal probabilities P(x), just for initial residue. */
688 
689   /* First, verify that the string is entirely alphabetic. */
690   L = strlen(s);
691   for (i = 0; i < L; i++)
692     if (! isalpha((int) s[i]))
693      ESL_EXCEPTION(eslEINVAL, "String contains nonalphabetic characters");
694 
695   /* The edge case of len <= 2 */
696   if (L <= 2)
697     {
698       if (s != markoved) strcpy(markoved, s);
699       return eslOK;
700     }
701 
702   /* Collect first order counts and convert to frequencies. */
703   for (x = 0; x < 26; x++)
704     for (y = 0; y < 26; y++)
705       p[x][y] = 0.;
706 
707   i0 = x = toupper((int) s[0]) - 'A';
708   for (i = 1; i < L; i++)
709     {
710       y = toupper((int) s[i]) - 'A';
711       p[x][y] += 1.0;
712       x = y;
713     }
714   p[x][i0] += 1.0; 		/* "circularized": avoids a bug; see markov1_bug utest */
715 
716   for (x = 0; x < 26; x++)
717     {
718       p0[x] = 0.;
719       for (y = 0; y < 26; y++)
720 	p0[x] += p[x][y];	/* now p0[x] = marginal counts of x, inclusive of 1st residue */
721 
722       for (y = 0; y < 26; y++)
723 	p[x][y] = (p0[x] > 0. ? p[x][y] / p0[x] : 0.); /* now p[x][y] = P(y | x) */
724 
725       p0[x] /= (double) L;	/* now p0[x] = marginal P(x) */
726     }
727 
728   /* Generate a random string using those p's. */
729   x = esl_rnd_DChoose(r, p0, 26);
730   markoved[0] = x + 'A';
731   for (i = 1; i < L; i++)
732     {
733       y           = esl_rnd_DChoose(r, p[x], 26);
734       markoved[i] = y + 'A';
735       x           = y;
736     }
737   markoved[L] = '\0';
738 
739   return eslOK;
740 }
741 /*----------------- end, randomizing sequences ------------------*/
742 
743 
744 
745 /*****************************************************************
746  *# 4. Generating iid sequences (digital mode).
747  *****************************************************************/
748 
749 /* Function: esl_rsq_xIID()
750  * Synopsis: Generate an iid random digital sequence.
751  * Incept:   SRE, Sat Feb 17 16:39:01 2007 [Casa de Gatos]
752  *
753  * Purpose:  Generate an i.i.d. digital sequence of length <L> (1..L) and
754  *           leave it in <dsq>. The i.i.d. probability of each residue is
755  *           given in the probability vector <p>, and the number of
756  *           possible residues (the alphabet size) is given by <K>.
757  *           (Only the alphabet size <K> is needed here, as opposed to
758  *           a digital <ESL_ALPHABET>, but the caller presumably
759  *           has a digital alphabet.) The caller must provide a <dsq>
760  *           allocated for at least <L+2> residues of type <ESL_DSQ>,
761  *           room for <L> residues and leading/trailing digital sentinel bytes.
762  *
763  *           <esl_rsq_xfIID()> does the same, but for a
764  *           single-precision float vector <p> rather than a
765  *           double-precision vector <p>.
766  *
767  *           As a special case, if <p> is <NULL>, sample residues uniformly.
768  *
769  * Args:     r         - ESL_RANDOMNESS object
770  *           p         - probability distribution [0..n-1] (or NULL for uniform)
771  *           K         - number of symbols in alphabet
772  *           L         - length of generated sequence
773  *           ret_s     - RETURN: the generated sequence.
774  *                       (Caller-allocated, >= (L+2)*ESL_DSQ)
775  *
776  * Return:   <eslOK> on success.
777  */
778 int
esl_rsq_xIID(ESL_RANDOMNESS * r,const double * p,int K,int L,ESL_DSQ * dsq)779 esl_rsq_xIID(ESL_RANDOMNESS *r, const double *p, int K, int L, ESL_DSQ *dsq)
780 {
781   int   x;
782 
783   dsq[0] = dsq[L+1] = eslDSQ_SENTINEL;
784   for (x = 1; x <= L; x++)
785     dsq[x] = p ? esl_rnd_DChoose(r,p,K) : esl_rnd_Roll(r,K);
786   return eslOK;
787 }
788 int
esl_rsq_xfIID(ESL_RANDOMNESS * r,const float * p,int K,int L,ESL_DSQ * dsq)789 esl_rsq_xfIID(ESL_RANDOMNESS *r, const float *p, int K, int L, ESL_DSQ *dsq)
790 {
791   int   x;
792 
793   dsq[0] = dsq[L+1] = eslDSQ_SENTINEL;
794   for (x = 1; x <= L; x++)
795     dsq[x] = p ? esl_rnd_FChoose(r,p,K) : esl_rnd_Roll(r,K);
796   return eslOK;
797 }
798 
799 
800 /* Function:  esl_rsq_SampleDirty()
801  * Synopsis:  Sample a digital sequence with noncanonicals, optionally gaps.
802  * Incept:    SRE, Wed Feb 17 10:57:28 2016 [H1/76]
803  *
804  * Purpose:   Using random number generator <rng>, use probability
805  *            vector <p> to sample an iid digital sequence in alphabet
806  *            <abc> of length <L>. Store it in <dsq>.
807  *
808  *            The <dsq> space, allocated by the caller, has room for
809  *            at least <L+2> residues, counting the digital
810  *            sentinels.
811  *
812  *            Probability vector <p> has <Kp> terms, and sums to 1.0
813  *            over them. The probabilities in <p> for residues <K>,
814  *            <Kp-2>, and <Kp-1> (gap, nonresidue, missing) are
815  *            typically zero, to generate a standard unaligned digital
816  *            sequence with degenerate residues. To sample a random
817  *            "alignment", <p[K]> is nonzero.
818  *
819  *            If <p> is <NULL>, then we sample a probability vector
820  *            according to the following rules, which generates
821  *            *ungapped* dirtied random sequences:
822  *               1. Sample pc, the probability of canonical
823  *                  vs. noncanonical residues, uniformly on [0,1).
824  *               2. Sample a p[] uniformly for canonical residues
825  *                  <0..K-1>, and renormalize by multiplying by pc.
826  *                  Sample a different p[] uniformly for noncanonical
827  *                  residues <K+1..Kp-3>, and renormalize by (1-pc).
828  *               3. p[] = 0 for gap residue K, nonresidue Kp-2,
829  *                  missing residue Kp-1.
830  *            This usage is mainly intended to make it easy to
831  *            sample dirty edge cases for automated tests.
832  *
833  * Args:      rng  :  random number generator
834  *            abc  :  digital alphabet
835  *            p    :  OPTIONAL: p[0..Kp-1] probability vector, or NULL
836  *            L    :  length of digital sequence to sample
837  *            dsq  :  resulting digital seq sample, caller-provided space
838  *
839  * Returns:   <eslOK> on success
840  *
841  * Throws:    <eslEMEM> on allocation failure.
842  */
843 int
esl_rsq_SampleDirty(ESL_RANDOMNESS * rng,ESL_ALPHABET * abc,double ** byp_p,int L,ESL_DSQ * dsq)844 esl_rsq_SampleDirty(ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, double **byp_p, int L, ESL_DSQ *dsq)
845 {
846   double *p = NULL;
847   int     i;
848   int     status;
849 
850   /* If p isn't provided, sample one. */
851   if ( esl_byp_IsProvided(byp_p))
852     p = *byp_p;
853   else
854     {
855       double pc = esl_random(rng); /* [0,1) */
856       int    x;
857 
858       ESL_ALLOC(p, sizeof(double) * abc->Kp);
859 
860       esl_rnd_Dirichlet(rng, NULL /* i.e. uniform */, abc->K, p);
861       esl_rnd_Dirichlet(rng, NULL, (abc->Kp - abc->K - 3), (p + abc->K +1));  /* K+1..Kp-3 range of alphabet */
862       for (x = 0;        x <  abc->K;    x++) p[x] = p[x] * pc;
863       for (x = abc->K+1; x <= abc->Kp-3; x++) p[x] = p[x] * (1.-pc);
864       p[abc->K]    = 0.;
865       p[abc->Kp-2] = 0.;
866       p[abc->Kp-1] = 0.;
867     }
868 
869   dsq[0]   = eslDSQ_SENTINEL;
870   for (i = 1; i <= L; i++)
871     dsq[i] = esl_rnd_DChoose(rng, p, abc->Kp);
872   dsq[L+1] = eslDSQ_SENTINEL;
873 
874   if      (esl_byp_IsReturned(byp_p)) *byp_p = p;
875   else if (esl_byp_IsInternal(byp_p)) free(p);
876   return eslOK;
877 
878  ERROR:
879   if (! esl_byp_IsProvided(byp_p) && p) free(p);
880   if (  esl_byp_IsReturned(byp_p))     *byp_p = NULL;
881   return status;
882 }
883 /*--------------------- end, digital generation ---------------- */
884 
885 
886 
887 /*****************************************************************
888  *# 5. Shuffling sequences (digital mode)
889  *****************************************************************/
890 
891 /* Function:  esl_rsq_XShuffle()
892  * Synopsis:  Shuffle a digital sequence.
893  * Incept:    SRE, Fri Feb 23 08:24:20 2007 [Casa de Gatos]
894  *
895  * Purpose:   Given a digital sequence <dsq> of length <L> residues,
896  *            shuffle it, and leave the shuffled version in <shuffled>.
897  *
898  *            Caller provides allocated storage for <shuffled> for at
899  *            least the same length as <dsq>.
900  *
901  *            <shuffled> may also point to the same storage as <dsq>,
902  *            in which case <dsq> is shuffled in place.
903  *
904  * Returns:   <eslOK> on success.
905  */
906 int
esl_rsq_XShuffle(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,ESL_DSQ * shuffled)907 esl_rsq_XShuffle(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, ESL_DSQ *shuffled)
908 {
909   int     i;
910   ESL_DSQ x;
911 
912   if (dsq != shuffled) esl_abc_dsqcpy(dsq, L, shuffled);
913   while (L > 1) {
914     i           = 1 + esl_rnd_Roll(r, L);
915     x           = shuffled[i];
916     shuffled[i] = shuffled[L];
917     shuffled[L] = x;
918     L--;
919   }
920   return eslOK;
921 }
922 
923 /* Function:  esl_rsq_XShuffleDP()
924  * Synopsis:  Shuffle a digital sequence, preserving diresidue composition.
925  * Incept:    SRE, Fri Feb 23 09:23:47 2007 [Casa de Gatos]
926  *
927  * Purpose:   Same as <esl_rsq_CShuffleDP()>, except for a digital
928  *            sequence <dsq> of length <L>, encoded in a digital alphabet
929  *            of <K> residues.
930  *
931  *            <dsq> may only consist of residue codes <0..K-1>; if it
932  *            contains gaps, degeneracies, or missing data, pass the alphabet's
933  *            <Kp> size, not its canonical <K>.
934  *
935  *            If <L> $\leq 2$, this is a no-op; <shuffled> is a copy of <dsq>.
936  *
937  * Returns:   <eslOK> on success.
938  *
939  * Throws:    <eslEINVAL> if <s> contains digital residue codes
940  *            outside the range <0..K-1>.
941  *            <eslEMEM> on allocation failure.
942  */
943 int
esl_rsq_XShuffleDP(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int K,ESL_DSQ * shuffled)944 esl_rsq_XShuffleDP(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int K, ESL_DSQ *shuffled)
945 {
946   int     status;           /* Easel return status code */
947   int     i;	            /* a position in dsq or shuffled */
948   ESL_DSQ x,y;              /* indices of two characters */
949   ESL_DSQ **E  = NULL;      /* edge lists: E[0] is the edge list from vertex A */
950   int     *nE  = NULL;      /* lengths of edge lists */
951   int     *iE  = NULL;      /* positions in edge lists */
952   ESL_DSQ *Z   = NULL;      /* connectivity in last edge graph Z */
953   int      n;	            /* tmp: remaining length of an edge list to be shuffled */
954   ESL_DSQ  sf;              /* last character in shuffled */
955 
956   int      keep_connecting; /* flag used in Z connectivity algorithm */
957   int      is_eulerian;	    /* flag used for when we've got a good Z */
958 
959   /* First, verify that we can deal with all the residues in dsq. */
960   for (i = 1; i <= L; i++)
961     if (dsq[i] >= K)
962       ESL_EXCEPTION(eslEINVAL, "dsq contains unexpected residue codes");
963 
964   /* The edge case of L <= 2 */
965   if (L <= 2)
966     {
967       if (dsq != shuffled) memcpy(shuffled, dsq, sizeof(ESL_DSQ) * (L+2));
968       return eslOK;
969     }
970 
971   /* Allocations. */
972   ESL_ALLOC(nE, sizeof(int)       * K);  for (x = 0; x < K; x++) nE[x] = 0;
973   ESL_ALLOC(E,  sizeof(ESL_DSQ *) * K);  for (x = 0; x < K; x++) E[x]  = NULL;
974   ESL_ALLOC(iE, sizeof(int)       * K);  for (x = 0; x < K; x++) iE[x] = 0;
975   ESL_ALLOC(Z,  sizeof(ESL_DSQ)   * K);
976   for (x = 0; x < K; x++)
977     ESL_ALLOC(E[x], sizeof(ESL_DSQ) * (L-1));
978 
979   /* "(1) Construct the doublet graph G and edge ordering E... */
980   x = dsq[1];
981   for (i = 2; i <= L; i++) {
982     E[x][nE[x]] = dsq[i];
983     nE[x]++;
984     x = dsq[i];
985   }
986 
987   /* Now we have to find a random Eulerian edge ordering. */
988   sf = dsq[L];
989   is_eulerian = 0;
990   while (! is_eulerian)
991     {
992       for (x = 0; x < K; x++) {
993 	if (nE[x] == 0 || x == sf) continue;
994 	i           = esl_rnd_Roll(r, nE[x]);
995 	ESL_SWAP(E[x][i], E[x][nE[x]-1], ESL_DSQ);
996       }
997 
998       for (x = 0; x < K; x++) Z[x] = 0;
999       Z[(int) sf] = keep_connecting = 1;
1000       while (keep_connecting) {
1001 	keep_connecting = 0;
1002 	for (x = 0; x < K; x++) {
1003 	  if (nE[x] == 0) continue;
1004 	  y = E[x][nE[x]-1];            /* xy is an edge in Z */
1005 	  if (Z[x] == 0 && Z[y] == 1) {  /* x is connected to sf in Z */
1006 	    Z[x] = 1;
1007 	    keep_connecting = 1;
1008 	  }
1009 	}
1010       }
1011 
1012       is_eulerian = 1;
1013       for (x = 0; x < K; x++) {
1014 	if (nE[x] == 0 || x == sf) continue;
1015 	if (Z[x] == 0) {
1016 	  is_eulerian = 0;
1017 	  break;
1018 	}
1019       }
1020     }
1021 
1022   /* "(5) For each vertex s in G, randomly permute... */
1023   for (x = 0; x < K; x++)
1024     for (n = nE[x] - 1; n > 1; n--)
1025       {
1026 	i       = esl_rnd_Roll(r, n);
1027 	ESL_SWAP(E[x][i], E[x][n-1], ESL_DSQ);
1028       }
1029 
1030   /* "(6) Construct sequence S'... */
1031   i = 1;
1032   x = dsq[1];
1033   while (1) {
1034     shuffled[i++] = x;
1035     y = E[x][iE[x]++];
1036     x = y;
1037     if (iE[x] == nE[x]) break;
1038   }
1039   shuffled[i++] = sf;
1040   shuffled[i]   = eslDSQ_SENTINEL;
1041   shuffled[0]   = eslDSQ_SENTINEL;
1042 
1043   /* Reality checks. */
1044   if (x != sf)   ESL_XEXCEPTION(eslEINCONCEIVABLE, "hey, you didn't end on s_f.");
1045   if (i != L+1)  ESL_XEXCEPTION(eslEINCONCEIVABLE, "hey, i (%d) overran L+1 (%d).", i, L+1);
1046 
1047   esl_arr2_Destroy((void **) E, K);
1048   free(nE);
1049   free(iE);
1050   free(Z);
1051   return eslOK;
1052 
1053  ERROR:
1054   esl_arr2_Destroy((void **) E, K);
1055   if (nE != NULL) free(nE);
1056   if (iE != NULL) free(iE);
1057   if (Z  != NULL) free(Z);
1058   return status;
1059 }
1060 
1061 
1062 /* Function:  esl_rsq_XShuffleKmers()
1063  * Synopsis:  Shuffle k-mers in a digital sequence.
1064  *
1065  * Purpose:   Same as <esl_rsq_CShuffleKmers()>, but shuffle digital
1066  *            sequence <dsq> of length <L> into digital result <shuffled>.
1067  *
1068  * Args:      r        - an <ESL_RANDOMNESS> random generator
1069  *            dsq      - sequence to shuffle
1070  *            K        - size of k-mers to break <s> into
1071  *            shuffled - RESULT: the shuffled sequence
1072  *
1073  * Returns:   <eslOK> on success.
1074  *
1075  * Throws:    <eslEMEM> on allocation error.
1076  */
1077 int
esl_rsq_XShuffleKmers(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int K,ESL_DSQ * shuffled)1078 esl_rsq_XShuffleKmers(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int K, ESL_DSQ *shuffled)
1079 {
1080   int   W = L / K;		/* number of kmers "words" excluding leftover prefix */
1081   int   P = L % K;		/* leftover residues in prefix */
1082   int   i;
1083   char *swap = NULL;
1084   int   status;
1085 
1086   if (shuffled != dsq) esl_abc_dsqcpy(dsq, L, shuffled);
1087   ESL_ALLOC(swap, sizeof(char) * K);
1088   while (W > 1)
1089     {				/* use memmove, not memcpy, because i==W-1 is an overlap case */
1090       i = esl_rnd_Roll(r, W);	                                                 /* pick a word          */
1091       memmove(swap,                   shuffled + P + i*K,     K * sizeof(char)); /* copy it to tmp space */
1092       memmove(shuffled + P + i*K,     shuffled + P + (W-1)*K, K * sizeof(char)); /* move word W-1 to i   */
1093       memmove(shuffled + P + (W-1)*K, swap,                   K * sizeof(char)); /* move word i to W-1   */
1094       W--;
1095     }
1096   free(swap);
1097   return eslOK;
1098 
1099  ERROR:
1100   free(swap);
1101   return status;
1102 }
1103 
1104 
1105 /* Function:  esl_rsq_XReverse()
1106  * Synopsis:  Reverse a digital sequence.
1107  * Incept:    SRE, Sat Feb 24 10:13:30 2007 [Casa de Gatos]
1108  *
1109  * Purpose:   Given a digital sequence <dsq> of length <L>, return
1110  *            reversed version of it in <rev>.
1111  *
1112  *            Caller provides storage in <rev> for at least
1113  *            <(L+2)*sizeof(ESL_DSQ)>.
1114  *
1115  *            <s> and <rev> can point to the same storage, in which
1116  *            case <s> is reversed in place.
1117  *
1118  * Returns:   <eslOK> on success.
1119  */
1120 int
esl_rsq_XReverse(const ESL_DSQ * dsq,int L,ESL_DSQ * rev)1121 esl_rsq_XReverse(const ESL_DSQ *dsq, int L, ESL_DSQ *rev)
1122 {
1123   int     i;
1124   ESL_DSQ x;
1125 
1126   for (i = 1; i <= L/2; i++)
1127     {				/* swap ends */
1128       x          = dsq[L-i+1];
1129       rev[L-i+1] = dsq[i];
1130       rev[i]     = x;
1131     }
1132   if (L%2) { rev[i] = dsq[i]; } /* don't forget middle residue in odd-length dsq */
1133   rev[0]   = eslDSQ_SENTINEL;
1134   rev[L+1] = eslDSQ_SENTINEL;
1135   return eslOK;
1136 }
1137 
1138 
1139 /* Function: esl_rsq_XShuffleWindows()
1140  * Synopsis: Shuffle local windows of a digital sequence.
1141  * Incept:   SRE, Sat Feb 24 10:51:31 2007 [Casa de Gatos]
1142  *
1143  * Purpose:  Given a digital sequence <dsq> of length <L>, shuffle
1144  *           residues in nonoverlapping windows of width <w>, and put
1145  *           the result in <shuffled>.  See [Pearson88].
1146  *
1147  *           Caller provides storage in <shuffled> for at least
1148  *           <(L+2)*sizeof(ESL_DSQ)>.
1149  *
1150  *           <dsq> and <shuffled> can be identical to shuffle in place.
1151  *
1152  * Args:     dsq      - digital sequence to shuffle in windows
1153  *           L        - length of <dsq>
1154  *           w        - window size (typically 10 or 20)
1155  *           shuffled - allocated space for window-shuffled result.
1156  *
1157  * Return:   <eslOK> on success.
1158  */
1159 int
esl_rsq_XShuffleWindows(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int w,ESL_DSQ * shuffled)1160 esl_rsq_XShuffleWindows(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int w, ESL_DSQ *shuffled)
1161 {
1162   ESL_DSQ x;
1163   int  i, j, k;
1164 
1165   if (dsq != shuffled) esl_abc_dsqcpy(dsq, L, shuffled);
1166   for (i = 1; i <= L; i += w)
1167     for (j = ESL_MIN(L, i+w-1); j > i; j--)
1168       {
1169 	k           = i + esl_rnd_Roll(r, j-i+1);
1170 	x           = shuffled[k];  /* semantics of a j,k swap, because we might be shuffling in-place */
1171 	shuffled[k] = shuffled[j];
1172 	shuffled[j] = x;
1173       }
1174   return eslOK;
1175 }
1176 
1177 /*------------------- end, digital shuffling  -------------------*/
1178 
1179 
1180 
1181 /*****************************************************************
1182  *# 6. Randomizing sequences (digital mode)
1183  *****************************************************************/
1184 
1185 /* Function:  esl_rsq_XMarkov0()
1186  * Synopsis:  Generate new digital sequence of same 0th order Markov properties.
1187  * Incept:    SRE, Sat Feb 24 09:12:32 2007 [Casa de Gatos]
1188  *
1189  * Purpose:   Same as <esl_rsq_CMarkov0()>, except for a digital
1190  *            sequence <dsq> of length <L>, encoded in a digital
1191  *            alphabet of <K> residues; caller provides storage
1192  *            for the randomized sequence <markoved> for at least
1193  *            <L+2> <ESL_DSQ> residues, including the two flanking
1194  *            sentinel bytes.
1195  *
1196  *            <dsq> therefore may only consist of residue codes
1197  *            in the range <0..K-1>. If it contains gaps,
1198  *            degeneracies, or missing data, pass the alphabet's
1199  *            <Kp> size, not its canonical <K>.
1200  *
1201  * Returns:   <eslOK> on success.
1202  *
1203  * Throws:    <eslEINVAL> if <s> contains digital residue codes outside
1204  *            the range <0..K-1>.
1205  *            <eslEMEM> on allocation failure.
1206  */
1207 int
esl_rsq_XMarkov0(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int K,ESL_DSQ * markoved)1208 esl_rsq_XMarkov0(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int K, ESL_DSQ *markoved)
1209 {
1210   int     status;
1211   int     i;
1212   double *p = NULL;	/* initially counts, then probabilities */
1213   int     x;
1214 
1215   /* First, verify that the string is entirely alphabetic. */
1216   for (i = 1; i <= L; i++)
1217     if (dsq[i] >= K)
1218       ESL_XEXCEPTION(eslEINVAL, "String contains unexpected residue codes");
1219 
1220   ESL_ALLOC(p, sizeof(double) * K);
1221   for (x = 0; x < K; x++) p[x] = 0.;
1222 
1223   for (i = 1; i <= L; i++)
1224     p[(int) dsq[i]] += 1.0;
1225   if (L > 0)
1226     for (x = 0; x < K; x++) p[x] /= (double) L;
1227 
1228   for (i = 1; i <= L; i++)
1229     markoved[i] = esl_rnd_DChoose(r, p, K);
1230   markoved[0]   = eslDSQ_SENTINEL;
1231   markoved[L+1] = eslDSQ_SENTINEL;
1232 
1233   free(p);
1234   return eslOK;
1235 
1236  ERROR:
1237   if (p != NULL) free(p);
1238   return status;
1239 }
1240 
1241 
1242 
1243 /* Function:  esl_rsq_XMarkov1()
1244  * Synopsis:  Generate new digital sequence of same 1st order Markov properties.
1245  * Incept:    SRE, Sat Feb 24 09:46:09 2007 [Casa de Gatos]
1246  *
1247  * Purpose:   Same as <esl_rsq_CMarkov1()>, except for a digital
1248  *            sequence <dsq> of length <L>, encoded in a digital
1249  *            alphabet of <K> residues. Caller provides storage
1250  *            for the randomized sequence <markoved> for at least
1251  *            <L+2> <ESL_DSQ> residues, including the two flanking
1252  *            sentinel bytes.
1253  *
1254  *            <dsq> and <markoved> can be point to the same storage, in which
1255  *            case <dsq> is randomized in place, destroying the original
1256  *            string.
1257  *
1258  *            <dsq> therefore may only consist of residue codes
1259  *            in the range <0..K-1>. If it contains gaps,
1260  *            degeneracies, or missing data, pass the alphabet's
1261  *            <Kp> size, not its canonical <K>.
1262  *
1263  *            If <L> $\leq 2$, this is a no-op; <markoved> is a copy of <dsq>.
1264  *
1265  * Args:      dsq       - input digital sequence 1..L
1266  *            L         - length of dsq
1267  *            K         - residue codes in dsq are in range 0..K-1
1268  *            markoved  - new randomly generated digital sequence;
1269  *                        storage allocated by caller, at least (L+2)*ESL_DSQ;
1270  *                        may be same as dsq to randomize in place.
1271  *
1272  * Returns:   <eslOK> on success.
1273  *
1274  * Throws:    <eslEINVAL> if <s> contains digital residue codes outside
1275  *            the range <0..K-1>.
1276  *            <eslEMEM> on allocation failure.
1277  */
1278 int
esl_rsq_XMarkov1(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int K,ESL_DSQ * markoved)1279 esl_rsq_XMarkov1(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int K, ESL_DSQ *markoved)
1280 {
1281   double **p  = NULL;	/* conditional probabilities p[x][y] = P(y | x) */
1282   double  *p0 = NULL;	/* marginal probabilities P(x), just for initial residue. */
1283   int      i;
1284   ESL_DSQ  x,y;
1285   ESL_DSQ  i0;		/* initial symbol */
1286   int      status;
1287 
1288   /* validate the input string */
1289   for (i = 1; i <= L; i++)
1290     if (dsq[i] >= K)
1291       ESL_XEXCEPTION(eslEINVAL, "String contains unexpected residue codes");
1292 
1293   /* The edge case of L <= 2 */
1294   if (L <= 2)
1295     {
1296       if (dsq != markoved) memcpy(markoved, dsq, sizeof(ESL_DSQ) * (L+2));
1297       return eslOK;
1298     }
1299 
1300   /* allocations */
1301   ESL_ALLOC(p0, sizeof(double)   * K);  for (x = 0; x < K; x++) p0[x] = 0.;
1302   ESL_ALLOC(p,  sizeof(double *) * K);  for (x = 0; x < K; x++) p[x]  = NULL;
1303   for (x = 0; x < K; x++)
1304     { ESL_ALLOC(p[x], sizeof(double) * K); for (y = 0; y < K; y++) p[x][y] = 0.; }
1305 
1306   /* Collect first order counts and convert to frequencies. */
1307   i0 = x = dsq[1];
1308   for (i = 2; i <= L; i++)
1309     {
1310       y = dsq[i];
1311       p[x][y] += 1.0;
1312       x = y;
1313     }
1314   p[x][i0] += 1.0;	/* "circularized": avoids a bug; see markov1_bug utest */
1315 
1316   for (x = 0; x < K; x++)
1317     {
1318       p0[x] = 0.;
1319       for (y = 0; y < K; y++)
1320 	p0[x] += p[x][y];	/* now p0[x] = marginal counts of x, inclusive of 1st residue */
1321 
1322       for (y = 0; y < K; y++)
1323 	p[x][y] = (p0[x] > 0. ? p[x][y] / p0[x] : 0.);	/* now p[x][y] = P(y | x) */
1324 
1325       p0[x] /= (double) L;	/* now p0[x] = marginal P(x) inclusive of 1st residue */
1326     }
1327 
1328   /* Generate a random string using those p's. */
1329   markoved[1] = esl_rnd_DChoose(r, p0, K);
1330   for (i = 2; i <= L; i++)
1331     markoved[i] = esl_rnd_DChoose(r, p[markoved[i-1]], K);
1332 
1333   markoved[0]   = eslDSQ_SENTINEL;
1334   markoved[L+1] = eslDSQ_SENTINEL;
1335 
1336   esl_arr2_Destroy((void**)p, K);
1337   free(p0);
1338   return eslOK;
1339 
1340  ERROR:
1341   esl_arr2_Destroy((void**)p, K);
1342   if (p0 != NULL) free(p0);
1343   return status;
1344 }
1345 
1346 /*------------------ end, digital randomizing -------------------*/
1347 
1348 /*****************************************************************
1349  * 7. Statistics driver.
1350  *****************************************************************/
1351 
1352 /* This driver tests (and confirms) the intuition that using
1353  * a DP shuffle on short sequences may be a bad idea; short sequences
1354  * don't shuffle effectively.
1355  * xref J3/20.
1356  */
1357 #ifdef eslRANDOMSEQ_STATS
1358 /* gcc -g -Wall -o randomseq_stats -L. -I. -DeslRANDOMSEQ_STATS esl_randomseq.c -leasel -lm
1359  */
1360 #include "esl_config.h"
1361 
1362 #include <stdio.h>
1363 #include <string.h>
1364 
1365 #include "easel.h"
1366 #include "esl_alphabet.h"
1367 #include "esl_distance.h"
1368 #include "esl_getopts.h"
1369 #include "esl_random.h"
1370 #include "esl_randomseq.h"
1371 #include "esl_stats.h"
1372 #include "esl_vectorops.h"
1373 
1374 static ESL_OPTIONS options[] = {
1375   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
1376   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
1377   { "-d",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "diresidue shuffle",                                0 },
1378   { "-R",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "reverse the sequence",                             0 },
1379   { "-2",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "resample an independent sequence",                 0 },
1380   { "-N",        eslARG_INT,  "10000",  NULL, NULL,  NULL,  NULL, NULL, "number of sampled sequences per length",           0 },
1381   { "-s",        eslARG_INT,     "42",  NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                    0 },
1382   { "--minL",    eslARG_INT,      "5",  NULL, NULL,  NULL,  NULL, NULL, "xaxis minimum L",                                  0 },
1383   { "--maxL",    eslARG_INT,    "200",  NULL, NULL,  NULL,  NULL, NULL, "xaxis maximum L",                                  0 },
1384   { "--stepL",   eslARG_INT,      "5",  NULL, NULL,  NULL,  NULL, NULL, "xaxis step size",                                  0 },
1385   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1386 };
1387 static char usage[]  = "[-options]";
1388 static char banner[] = "stats driver for randomseq module";
1389 
1390 int
main(int argc,char ** argv)1391 main(int argc, char **argv)
1392 {
1393   ESL_GETOPTS    *go       = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1394   ESL_RANDOMNESS *r        = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
1395   ESL_ALPHABET   *abc      = esl_alphabet_Create(eslAMINO);
1396   int             N        = esl_opt_GetInteger(go, "-N");
1397   int             minL     = esl_opt_GetInteger(go, "--minL");
1398   int             maxL     = esl_opt_GetInteger(go, "--maxL");
1399   int             stepL    = esl_opt_GetInteger(go, "--stepL");
1400   ESL_DSQ        *dsq1     = malloc(sizeof(ESL_DSQ) * (maxL+2));
1401   ESL_DSQ        *dsq2     = malloc(sizeof(ESL_DSQ) * (maxL+2));
1402   double         *fq       = malloc(sizeof(double) * abc->K);
1403   double         *pid      = malloc(sizeof(double) * N);
1404   double          mean, var;
1405   int             L;
1406   int             i;
1407 
1408   esl_vec_DSet(fq, abc->K, 1.0 / (double) abc->K );
1409 
1410   for (L = minL; L <= maxL; L += stepL)
1411     {
1412       for (i = 0; i < N; i++)
1413 	{
1414 	  esl_rsq_xIID(r, fq, abc->K, L, dsq1);
1415 
1416 	  if      (esl_opt_GetBoolean(go, "-d")) esl_rsq_XShuffleDP(r, dsq1, L, abc->K, dsq2);
1417 	  else if (esl_opt_GetBoolean(go, "-R")) esl_rsq_XReverse(dsq1, L, dsq2);
1418 	  else if (esl_opt_GetBoolean(go, "-2")) esl_rsq_xIID(r, fq, abc->K, L, dsq2);
1419 	  else                                   esl_rsq_XShuffle(r, dsq1, L, dsq2);
1420 
1421 	  esl_dst_XPairId(abc, dsq1, dsq2, &(pid[i]), NULL, NULL);
1422 	}
1423 
1424       esl_stats_DMean(pid, N, &mean, &var);
1425       printf("%-6d %.4f %.4f\n", L, mean, sqrt(var));
1426     }
1427   printf("&\n");
1428 
1429   free(pid);
1430   free(fq);
1431   free(dsq2);
1432   free(dsq1);
1433   esl_alphabet_Destroy(abc);
1434   esl_randomness_Destroy(r);
1435   esl_getopts_Destroy(go);
1436   return 0;
1437 }
1438 #endif /*eslRANDOMSEQ_STATS*/
1439 /*-------------- end, statistics driver -------------------------*/
1440 
1441 
1442 
1443 
1444 
1445 /*****************************************************************
1446  * 8. Unit tests.
1447  *****************************************************************/
1448 #ifdef eslRANDOMSEQ_TESTDRIVE
1449 #include "esl_dirichlet.h"
1450 #include "esl_vectorops.h"
1451 
1452 /* count c(x) monoresidue and c(xy) diresidue composition
1453  * used for sequence shuffling unit tests
1454  * mono, di allocated by caller for 26 and 26x26, respectively.
1455  */
1456 static int
composition(char * s,int L,int * mono,int ** di)1457 composition(char *s, int L, int *mono, int **di)
1458 {
1459   int i, x, y;
1460 
1461   for (x = 0; x < 26; x++) {
1462     mono[x] = 0;
1463     for (y = 0; y < 26; y++)
1464       di[x][y] = 0;
1465   }
1466 
1467   for (i = 0; s[i] != '\0'; i++) {
1468     if (!isalpha(s[i])) esl_fatal("bad residue %d", i);
1469     y = toupper(s[i]) - 'A';
1470     mono[y]++;
1471     if (i > 0) {
1472       x = toupper(s[i-1] - 'A');
1473       di[x][y]++;
1474     }
1475   }
1476   if (i != L) esl_fatal("sequence length didn't match expected %d", L);
1477   return eslOK;
1478 }
1479 
1480 /* same, but for digital seq., with alphabet size K */
1481 static int
xcomposition(ESL_DSQ * dsq,int L,int K,int * mono,int ** di)1482 xcomposition(ESL_DSQ *dsq, int L, int K, int *mono, int **di)
1483 {
1484   int i, x, y;
1485 
1486   for (x = 0; x < K; x++) {
1487     mono[x] = 0;
1488     for (y = 0; y < K; y++)
1489       di[x][y] = 0;
1490   }
1491 
1492   for (i = 1; dsq[i] != eslDSQ_SENTINEL; i++) {
1493     if (dsq[i] > K) esl_fatal("bad residue %d", i);
1494     if (i > 1) di[(int) dsq[i-1]][(int) dsq[i]]++;
1495     mono[(int) dsq[i]]++;
1496   }
1497   if (i != L+1) esl_fatal("sequence length didn't match expected %d", L);
1498   return eslOK;
1499 }
1500 
1501 static int
composition_allocate(int K,int ** ret_mono,int *** ret_di)1502 composition_allocate(int K, int **ret_mono, int ***ret_di)
1503 {
1504   int  status;
1505   int *mono = NULL;
1506   int **di  = NULL;
1507   int  x;
1508 
1509   ESL_ALLOC(mono, sizeof(int)   * K);
1510   ESL_ALLOC(di,   sizeof(int *) * K); for (x = 0; x < K; x++) di[x] = NULL;
1511   for (x = 0; x < K; x++)
1512     ESL_ALLOC(di[x], sizeof(int) * K);
1513   *ret_mono = mono;
1514   *ret_di   = di;
1515   return eslOK;
1516 
1517  ERROR:
1518   esl_arr2_Destroy((void **) di, K);
1519   if (mono != NULL) free(mono);
1520   *ret_mono = NULL;
1521   *ret_di   = NULL;
1522   return status;
1523 }
1524 
1525 /* compare compositions before/after.
1526  * either mono (m1,m2) or di (d1,d2) may be NULL, to compare only the other one */
1527 static int
composition_compare(int * m1,int ** di1,int * m2,int ** di2,int K)1528 composition_compare(int *m1, int **di1, int *m2, int **di2, int K)
1529 {
1530   int x,y;
1531 
1532   for (x = 0; x < K; x++) {
1533     if (m1 != NULL && m1[x] != m2[x]) return eslFAIL;
1534     if (di1 != NULL)
1535       for (y = 0; y < K; y++)
1536 	if (di1[x][y] != di2[x][y])   return eslFAIL;
1537   }
1538   return eslOK;
1539 }
1540 
1541 /* Unit tests for:
1542  *     esl_rsq_CShuffle()
1543  *     esl_rsq_CShuffleDP()
1544  *     esl_rsq_CShuffleWindows()
1545  *     esl_rsq_CReverse()
1546  *
1547  * All of these exactly preserve residue composition, which is
1548  * the basis of the unit tests.
1549  */
1550 static void
utest_CShufflers(ESL_RANDOMNESS * r,int L,char * alphabet,int K)1551 utest_CShufflers(ESL_RANDOMNESS *r, int L, char *alphabet, int K)
1552 {
1553   char   *logmsg  = "Failure in one of the CShuffle* unit tests";
1554   int     status;
1555   char   *s   = NULL;
1556   char   *s2  = NULL;
1557   int    *m1  = NULL,
1558          *m2  = NULL;	    /* mono, before and after */
1559   int   **di1 = NULL,
1560         **di2 = NULL;       /* di, before and after */
1561   double  *p;
1562   int      w = 12;   	    /* window width for CShuffleWindows() */
1563 
1564   /* allocations */
1565   ESL_ALLOC(s,   sizeof(char)   * (L+1));
1566   ESL_ALLOC(s2,  sizeof(char)   * (L+1));
1567   ESL_ALLOC(p,   sizeof(double) * K);
1568   if (composition_allocate(26, &m1, &di1) != eslOK) esl_fatal(logmsg);
1569   if (composition_allocate(26, &m2, &di2) != eslOK) esl_fatal(logmsg);
1570 
1571   /* generate the string we'll start shuffling */
1572   if (esl_dirichlet_DSampleUniform(r, K, p) != eslOK) esl_fatal(logmsg);
1573   if (esl_rsq_IID(r, alphabet, p, K, L, s)  != eslOK) esl_fatal(logmsg);
1574 
1575   /* esl_rsq_CShuffle: mono composition should stay exactly the same, di may change */
1576   memset(s2, 0, (L+1)*sizeof(char));
1577   if (composition(s,   L, m1, di1)                != eslOK) esl_fatal(logmsg);
1578   if (esl_rsq_CShuffle(r, s, s2)                  != eslOK) esl_fatal(logmsg);
1579   if (composition(s2, L, m2, di2)                 != eslOK) esl_fatal(logmsg);
1580   if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1581   if (strcmp(s2, s) == 0)                                   esl_fatal(logmsg);
1582 
1583   /* esl_rsq_CShuffle, in place */
1584   strcpy(s, s2);
1585   if (composition(s2, L, m1, di1)                 != eslOK) esl_fatal(logmsg);
1586   if (esl_rsq_CShuffle(r, s2, s2)                 != eslOK) esl_fatal(logmsg);
1587   if (composition(s2, L, m2, di2)                 != eslOK) esl_fatal(logmsg);
1588   if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1589   if (strcmp(s2, s) == 0)                                   esl_fatal(logmsg);
1590 
1591   /* esl_rsq_CShuffleDP: mono and di compositions stay exactly the same */
1592   memset(s2, 0, (L+1)*sizeof(char));
1593   if (composition(s, L, m1,  di1)                 != eslOK) esl_fatal(logmsg);
1594   if (esl_rsq_CShuffleDP(r, s, s2)                != eslOK) esl_fatal(logmsg);
1595   if (composition(s2, L, m2, di2)                 != eslOK) esl_fatal(logmsg);
1596   if (composition_compare(m1, di1, m2, di2, 26)   != eslOK) esl_fatal(logmsg);
1597   if (strcmp(s2, s) == 0)                                   esl_fatal(logmsg);
1598 
1599   /* esl_rsq_CShuffleDP, in place */
1600   strcpy(s, s2);
1601   if (composition(s2, L, m1, di1)                 != eslOK) esl_fatal(logmsg);
1602   if (esl_rsq_CShuffleDP(r, s2, s2)               != eslOK) esl_fatal(logmsg);
1603   if (composition(s2, L, m2, di2)                 != eslOK) esl_fatal(logmsg);
1604   if (composition_compare(m1, di1, m2, di2, 26)   != eslOK) esl_fatal(logmsg);
1605   if (strcmp(s2, s) == 0)                                   esl_fatal(logmsg);
1606 
1607   /* esl_rsq_CShuffleKmers: mono composition stays the same */
1608   memset(s2, 0, (L+1)*sizeof(char));
1609   if (composition(s, L, m1,  di1)                 != eslOK) esl_fatal(logmsg);
1610   if (esl_rsq_CShuffleKmers(r, s, 3, s2)          != eslOK) esl_fatal(logmsg);
1611   if (composition(s2, L, m2, di2)                 != eslOK) esl_fatal(logmsg);
1612   if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1613   if (strcmp(s2, s) == 0)                                   esl_fatal(logmsg);
1614 
1615   /* esl_rsq_CShuffleKmers, in place */
1616   strcpy(s, s2);
1617   if (composition(s2, L, m1, di1)                 != eslOK) esl_fatal(logmsg);
1618   if (esl_rsq_CShuffleKmers(r, s2, 3, s2)         != eslOK) esl_fatal(logmsg);
1619   if (composition(s2, L, m2, di2)                 != eslOK) esl_fatal(logmsg);
1620   if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1621   if (strcmp(s2, s) == 0)                                   esl_fatal(logmsg);
1622 
1623   /* esl_rsq_CShuffleWindows(): mono composition stays the same */
1624   memset(s2, 0, (L+1)*sizeof(char));
1625   if (composition(s,   L, m1, di1)                != eslOK) esl_fatal(logmsg);
1626   if (esl_rsq_CShuffleWindows(r, s, w, s2)        != eslOK) esl_fatal(logmsg);
1627   if (composition(s2, L, m2, di2)                 != eslOK) esl_fatal(logmsg);
1628   if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1629   if (strcmp(s2, s) == 0)                                   esl_fatal(logmsg);
1630 
1631   /* esl_rsq_CShuffleWindows(), in place */
1632   strcpy(s, s2);
1633   if (composition(s2, L, m1, di1)                 != eslOK) esl_fatal(logmsg);
1634   if (esl_rsq_CShuffleWindows(r, s2, w, s2)       != eslOK) esl_fatal(logmsg);
1635   if (composition(s2, L, m2, di2)                 != eslOK) esl_fatal(logmsg);
1636   if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1637   if (strcmp(s2, s) == 0)                                   esl_fatal(logmsg);
1638 
1639   /* esl_rsq_CReverse(): two reverses (one in place) give the same seq back */
1640   memset(s2, 0, (L+1)*sizeof(char));
1641   if (composition(s,   L, m1, di1)                != eslOK) esl_fatal(logmsg);
1642   if (esl_rsq_CReverse(s, s2)                     != eslOK) esl_fatal(logmsg);
1643   if (composition(s2, L, m2, di2)                 != eslOK) esl_fatal(logmsg);
1644   if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1645   if (strcmp(s2, s) == 0)                                   esl_fatal(logmsg);
1646   if (esl_rsq_CReverse(s2, s2)                    != eslOK) esl_fatal(logmsg);
1647   if (strcmp(s2, s) != 0)                                   esl_fatal(logmsg);
1648 
1649   free(s);
1650   free(s2);
1651   free(p);
1652   free(m1);
1653   free(m2);
1654   esl_arr2_Destroy((void **) di1, 26);
1655   esl_arr2_Destroy((void **) di2, 26);
1656   return;
1657 
1658  ERROR:
1659   esl_fatal(logmsg);
1660 }
1661 
1662 /* Unit tests for:
1663  *    esl_rsq_CMarkov0()
1664  *    esl_rsq_CMarkov1()
1665  *
1666  * Testing these is less robust than the shufflers, because it's hard
1667  * to concoct deterministic tests. Instead the test is a weak one,
1668  * that zero probability events get zero counts.
1669  */
1670 static void
utest_CMarkovs(ESL_RANDOMNESS * r,int L,char * alphabet)1671 utest_CMarkovs(ESL_RANDOMNESS *r, int L, char *alphabet)
1672 {
1673   char   *logmsg = "Failure in a CMarkov*() unit test";
1674   int     status;
1675   char   *s   = NULL;
1676   char   *s2  = NULL;
1677   float  *p   = NULL;
1678   int     K;
1679   int     pzero;		/* which 0..K-1 residue will have zero prob */
1680   int     zeroidx;		/* index of pzero residue in 0..25 ASCII    */
1681   int    *m1  = NULL,
1682          *m2  = NULL;	    /* mono, before and after */
1683   int   **di1 = NULL,
1684         **di2 = NULL;       /* di, before and after */
1685   int     i,x;
1686 
1687   K = strlen(alphabet);
1688   ESL_ALLOC(p,   sizeof(float)  * K);
1689   ESL_ALLOC(s,   sizeof(char)   * (L+1));
1690   ESL_ALLOC(s2,  sizeof(char)   * (L+1));
1691   if (composition_allocate(26, &m1, &di1) != eslOK) esl_fatal(logmsg);
1692   if (composition_allocate(26, &m2, &di2) != eslOK) esl_fatal(logmsg);
1693 
1694   /* generate string with a random letter prob set to 0  */
1695   pzero   = esl_rnd_Roll(r, K);
1696   zeroidx = toupper(alphabet[pzero]) - 'A';
1697   if (esl_dirichlet_FSampleUniform(r, K, p)  != eslOK) esl_fatal(logmsg);
1698   p[pzero] = 0;
1699   esl_vec_FNorm(p, K);
1700   if (esl_rsq_fIID(r, alphabet, p, K, L, s)  != eslOK) esl_fatal(logmsg);
1701 
1702   /* esl_rsq_CMarkov0()  */
1703   memset(s2, 0, (L+1)*sizeof(char));
1704   if (composition(s,   L, m1, di1)  != eslOK) esl_fatal(logmsg);
1705   if (esl_rsq_CMarkov0(r, s, s2)    != eslOK) esl_fatal(logmsg);
1706   if (composition(s2, L, m2, di2)   != eslOK) esl_fatal(logmsg);
1707   if (m1[zeroidx]                   != 0)     esl_fatal(logmsg);
1708   if (m2[zeroidx]                   != 0)     esl_fatal(logmsg);
1709   if (strcmp(s2, s)                 == 0)     esl_fatal(logmsg);
1710 
1711   /* esl_rsq_CMarkov0(), in place */
1712   strcpy(s, s2);
1713   if (esl_rsq_CMarkov0(r, s2, s2)   != eslOK) esl_fatal(logmsg);
1714   if (composition(s2, L, m2, di2)   != eslOK) esl_fatal(logmsg);
1715   if (m2[zeroidx]                   != 0)     esl_fatal(logmsg);
1716   if (strcmp(s2, s)                 == 0)     esl_fatal(logmsg);
1717 
1718   /* generate string with all homodiresidues set to 0 */
1719   if (esl_dirichlet_FSampleUniform(r, K, p)  != eslOK) esl_fatal(logmsg);
1720   do {
1721     if (esl_rsq_fIID(r, alphabet, p, K, L, s)  != eslOK) esl_fatal(logmsg);
1722     for (i = 1; i < L; i++)
1723       if (s[i] == s[i-1]) /* this incantation will rotate letter forward in alphabet: */
1724 	s[i] = alphabet[(1+strchr(alphabet,s[i])-alphabet)%K];
1725   } while (s[0] == s[L-1]);	/* lazy: reject strings where circularization would count a homodimer */
1726 
1727   /* esl_rsq_CMarkov1()  */
1728   memset(s2, 0, (L+1)*sizeof(char));
1729   if (composition(s,   L, m1, di1)  != eslOK) esl_fatal(logmsg);
1730   if (esl_rsq_CMarkov1(r, s, s2)    != eslOK) esl_fatal(logmsg);
1731   if (composition(s2, L, m2, di2)   != eslOK) esl_fatal(logmsg);
1732   for (x = 0; x < K; x++) {
1733     if (di1[x][x]                   != 0)     esl_fatal(logmsg);
1734     if (di2[x][x]                   != 0)     esl_fatal(logmsg);
1735   }
1736   if (strcmp(s2, s)                 == 0)     esl_fatal(logmsg);
1737 
1738   /* esl_rsq_CMarkov1(), in place  */
1739   strcpy(s, s2);
1740   if (esl_rsq_CMarkov1(r, s2, s2)  != eslOK)   esl_fatal(logmsg);
1741   if (composition(s2, L, m2, di2)  != eslOK) esl_fatal(logmsg);
1742   for (x = 0; x < K; x++) {
1743     if (di1[x][x]                   != 0)     esl_fatal(logmsg);
1744     if (di2[x][x]                   != 0)     esl_fatal(logmsg);
1745   }
1746   if (strcmp(s2, s)                 == 0)     esl_fatal(logmsg);
1747 
1748   free(s);
1749   free(s2);
1750   free(p);
1751   free(m1);
1752   free(m2);
1753   esl_arr2_Destroy((void **) di1, 26);
1754   esl_arr2_Destroy((void **) di2, 26);
1755   return;
1756 
1757  ERROR:
1758   esl_fatal(logmsg);
1759 }
1760 
1761 
1762 /* Unit tests for:
1763  *     esl_rsq_XShuffle()
1764  *     esl_rsq_XShuffleDP()
1765  *     esl_rsq_XShuffleWindows()
1766  *     esl_rsq_XReverse()
1767  * Same ideas as testing the C* versions, adapted for digital sequences.
1768  */
1769 static void
utest_XShufflers(ESL_RANDOMNESS * r,int L,int K)1770 utest_XShufflers(ESL_RANDOMNESS *r, int L, int K)
1771 {
1772   char    *logmsg  = "Failure in one of the XShuffle* unit tests";
1773   int      status;
1774   ESL_DSQ *dsq   = NULL;
1775   ESL_DSQ *ds2   = NULL;
1776   int     *m1    = NULL,
1777           *m2    = NULL;    /* mono, before and after */
1778   int    **di1   = NULL,
1779          **di2   = NULL;    /* di, before and after */
1780   float   *p     = NULL;
1781   int      w = 12;   	    /* window width for XShuffleWindows() */
1782 
1783   /* allocations */
1784   ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
1785   ESL_ALLOC(ds2, sizeof(ESL_DSQ) * (L+2));
1786   ESL_ALLOC(p,   sizeof(float)   * K);
1787   if (composition_allocate(K, &m1, &di1) != eslOK) esl_fatal(logmsg);
1788   if (composition_allocate(K, &m2, &di2) != eslOK) esl_fatal(logmsg);
1789 
1790   /* generate the string we'll test shuffling on, keep its composition stats */
1791   if (esl_dirichlet_FSampleUniform(r, K, p) != eslOK) esl_fatal(logmsg);
1792   if (esl_rsq_xfIID(r, p, K, L, dsq)        != eslOK) esl_fatal(logmsg);
1793 
1794   /* esl_rsq_XShuffle: mono composition should stay exactly the same, di may change */
1795   memset(ds2, eslDSQ_SENTINEL, (L+2));
1796   if (xcomposition(dsq, L, K, m1, di1)           != eslOK) esl_fatal(logmsg);
1797   if (esl_rsq_XShuffle(r, dsq, L, ds2)           != eslOK) esl_fatal(logmsg);
1798   if (xcomposition(ds2, L, K, m2, di2)           != eslOK) esl_fatal(logmsg);
1799   if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1800 
1801   /* esl_rsq_XShuffle, in place */
1802   if (esl_abc_dsqcpy(ds2, L, dsq)                != eslOK) esl_fatal(logmsg);
1803   if (xcomposition(ds2, L, K, m1,  di1)          != eslOK) esl_fatal(logmsg);
1804   if (esl_rsq_XShuffle(r, ds2, L, ds2)           != eslOK) esl_fatal(logmsg);
1805   if (xcomposition(ds2, L, K, m2, di2)           != eslOK) esl_fatal(logmsg);
1806   if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1807 
1808   /* esl_rsq_XShuffleDP: mono and di compositions stay exactly the same */
1809   memset(ds2, eslDSQ_SENTINEL, (L+2));
1810   if (xcomposition(dsq, L, K, m1,  di1)          != eslOK) esl_fatal(logmsg);
1811   if (esl_rsq_XShuffleDP(r, dsq, L, K, ds2)      != eslOK) esl_fatal(logmsg);
1812   if (xcomposition(ds2, L, K, m2, di2)           != eslOK) esl_fatal(logmsg);
1813   if (composition_compare(m1, di1, m2, di2, K)   != eslOK) esl_fatal(logmsg);
1814 
1815   /* esl_rsq_XShuffleDP, in place */
1816   if (esl_abc_dsqcpy(ds2, L, dsq)                != eslOK) esl_fatal(logmsg);
1817   if (xcomposition(ds2, L, K, m1, di1)           != eslOK) esl_fatal(logmsg);
1818   if (esl_rsq_XShuffleDP(r, ds2, L, K, ds2)      != eslOK) esl_fatal(logmsg);
1819   if (xcomposition(ds2, L, K, m2, di2)           != eslOK) esl_fatal(logmsg);
1820   if (composition_compare(m1, di1, m2, di2, K)   != eslOK) esl_fatal(logmsg);
1821 
1822   /* esl_rsq_XShuffleKmers: mono compositions stay exactly the same */
1823   memset(ds2, eslDSQ_SENTINEL, (L+2));
1824   if (xcomposition(dsq, L, K, m1,  di1)          != eslOK) esl_fatal(logmsg);
1825   if (esl_rsq_XShuffleKmers(r, dsq, L, 3, ds2)   != eslOK) esl_fatal(logmsg);
1826   if (xcomposition(ds2, L, K, m2, di2)           != eslOK) esl_fatal(logmsg);
1827   if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1828 
1829   /* esl_rsq_XShuffleKmers, in place */
1830   if (esl_abc_dsqcpy(ds2, L, dsq)                != eslOK) esl_fatal(logmsg);
1831   if (xcomposition(ds2, L, K, m1, di1)           != eslOK) esl_fatal(logmsg);
1832   if (esl_rsq_XShuffleKmers(r, ds2, L, 3, ds2)   != eslOK) esl_fatal(logmsg);
1833   if (xcomposition(ds2, L, K, m2, di2)           != eslOK) esl_fatal(logmsg);
1834   if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1835 
1836   /* esl_rsq_XShuffleWindows(): mono composition stays the same */
1837   memset(ds2, eslDSQ_SENTINEL, (L+2));
1838   if (xcomposition(dsq, L, K, m1, di1)           != eslOK) esl_fatal(logmsg);
1839   if (esl_rsq_XShuffleWindows(r, dsq, L, w, ds2) != eslOK) esl_fatal(logmsg);
1840   if (xcomposition(ds2, L, K, m2, di2)           != eslOK) esl_fatal(logmsg);
1841   if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1842 
1843   /* esl_rsq_XShuffleWindows(), in place */
1844   if (esl_abc_dsqcpy(ds2, L, dsq)                != eslOK) esl_fatal(logmsg);
1845   if (xcomposition(ds2, L, K, m1,  di1)          != eslOK) esl_fatal(logmsg);
1846   if (esl_rsq_XShuffleWindows(r, ds2, L, w, ds2) != eslOK) esl_fatal(logmsg);
1847   if (xcomposition(ds2, L, K, m2, di2)           != eslOK) esl_fatal(logmsg);
1848   if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1849 
1850   /* esl_rsq_XReverse(): two reverses (one in place) give the same seq back */
1851   memset(ds2, eslDSQ_SENTINEL, (L+2));
1852   if (xcomposition(dsq, L, K, m1, di1)            != eslOK) esl_fatal(logmsg);
1853   if (esl_rsq_XReverse(dsq, L, ds2)               != eslOK) esl_fatal(logmsg);
1854   if (xcomposition(ds2, L, K, m2, di2)            != eslOK) esl_fatal(logmsg);
1855   if (composition_compare(m1, NULL, m2, NULL, K)  != eslOK) esl_fatal(logmsg);
1856   if (memcmp((void *) ds2, (void *) dsq, sizeof(ESL_DSQ)*(L+2)) == 0) esl_fatal(logmsg);
1857   if (esl_rsq_XReverse(ds2, L, ds2)               != eslOK) esl_fatal(logmsg);
1858   if (memcmp((void *) ds2, (void *) dsq, sizeof(ESL_DSQ)*(L+2)) != 0) esl_fatal(logmsg);
1859 
1860   free(dsq);
1861   free(ds2);
1862   free(p);
1863   free(m1);
1864   free(m2);
1865   esl_arr2_Destroy((void **) di1, K);
1866   esl_arr2_Destroy((void **) di2, K);
1867   return;
1868 
1869  ERROR:
1870   esl_fatal(logmsg);
1871 }
1872 
1873 /* Unit tests for:
1874  *    esl_rsq_XMarkov0()
1875  *    esl_rsq_XMarkov1()
1876  * Same ideas as in the C* versions, but for digital sequences.
1877  */
1878 static void
utest_XMarkovs(ESL_RANDOMNESS * r,int L,int K)1879 utest_XMarkovs(ESL_RANDOMNESS *r, int L, int K)
1880 {
1881   char    *logmsg = "Failure in an XMarkov*() unit test";
1882   int      status;
1883   ESL_DSQ *dsq = NULL;
1884   ESL_DSQ *ds2 = NULL;
1885   int     *m1  = NULL,
1886           *m2  = NULL;    /* mono, before and after */
1887   int    **di1 = NULL,
1888          **di2 = NULL;    /* di, before and after */
1889   float   *p   = NULL;
1890   int      pzero;
1891   int      i,x;
1892 
1893   /* allocations */
1894   ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
1895   ESL_ALLOC(ds2, sizeof(ESL_DSQ) * (L+2));
1896   ESL_ALLOC(p,   sizeof(float)   * K);
1897   if (composition_allocate(K, &m1, &di1) != eslOK) esl_fatal(logmsg);
1898   if (composition_allocate(K, &m2, &di2) != eslOK) esl_fatal(logmsg);
1899 
1900   /* generate sequence with a random letter prob set to 0  */
1901   pzero = esl_rnd_Roll(r, K);
1902   if (esl_dirichlet_FSampleUniform(r, K, p)  != eslOK) esl_fatal(logmsg);
1903   p[pzero] = 0.;
1904   esl_vec_FNorm(p, K);
1905   if (esl_rsq_xfIID(r, p, K, L, dsq)         != eslOK) esl_fatal(logmsg);
1906 
1907   /* esl_rsq_XMarkov0()  */
1908   memset(ds2, eslDSQ_SENTINEL, (L+2)*sizeof(ESL_DSQ));
1909   if (xcomposition(dsq, L, K, m1, di1)        != eslOK) esl_fatal(logmsg);
1910   if (esl_rsq_XMarkov0(r, dsq, L, K, ds2)     != eslOK) esl_fatal(logmsg);
1911   if (xcomposition(ds2, L, K, m2, di2)        != eslOK) esl_fatal(logmsg);
1912   if (m1[pzero]                               != 0)     esl_fatal(logmsg);
1913   if (m2[pzero]                               != 0)     esl_fatal(logmsg);
1914   if (memcmp(ds2, dsq, sizeof(ESL_DSQ)*(L+2)) == 0)     esl_fatal(logmsg);
1915 
1916   /* esl_rsq_CMarkov0(), in place */
1917   if (esl_abc_dsqcpy(ds2, L, dsq)             != eslOK) esl_fatal(logmsg);
1918   if (esl_rsq_XMarkov0(r, ds2, L, K, ds2)     != eslOK) esl_fatal(logmsg);
1919   if (xcomposition(ds2, L, K, m2, di2)        != eslOK) esl_fatal(logmsg);
1920   if (m2[pzero]                               != 0)     esl_fatal(logmsg);
1921   if (memcmp(ds2, dsq, sizeof(ESL_DSQ)*(L+2)) == 0)     esl_fatal(logmsg);
1922 
1923   /* generate string with all homodiresidues set to 0 */
1924   if (esl_dirichlet_FSampleUniform(r, K, p)   != eslOK) esl_fatal(logmsg);
1925   do {
1926     if (esl_rsq_xfIID(r, p, K, L, dsq)          != eslOK) esl_fatal(logmsg);
1927     for (i = 2; i <= L; i++)
1928       if (dsq[i] == dsq[i-1]) /* this incantation will rotate letter forward in alphabet: */
1929 	dsq[i] = (dsq[i]+1)%K;
1930   } while (dsq[1] == dsq[L]);	/* lazy. reject strings where circularization would count a homodimer */
1931 
1932   /* esl_rsq_XMarkov1()  */
1933   memset(ds2, eslDSQ_SENTINEL, (L+2)*sizeof(ESL_DSQ));
1934   if (xcomposition(dsq, L, K, m1, di1)        != eslOK) esl_fatal(logmsg);
1935   if (esl_rsq_XMarkov1(r, dsq, L, K, ds2)     != eslOK) esl_fatal(logmsg);
1936   if (xcomposition(ds2, L, K, m2, di2)        != eslOK) esl_fatal(logmsg);
1937   for (x = 0; x < K; x++) {
1938     if (di1[x][x]                             != 0)     esl_fatal(logmsg);
1939     if (di2[x][x]                             != 0)     esl_fatal(logmsg);
1940   }
1941   if (memcmp(ds2, dsq, sizeof(ESL_DSQ)*(L+2)) == 0)     esl_fatal(logmsg);
1942 
1943   /* esl_rsq_XMarkov1(), in place  */
1944   if (esl_abc_dsqcpy(ds2, L, dsq)             != eslOK) esl_fatal(logmsg);
1945   if (esl_rsq_XMarkov1(r, ds2, L, K, ds2)     != eslOK) esl_fatal(logmsg);
1946   if (xcomposition(ds2, L, K, m2, di2)        != eslOK) esl_fatal(logmsg);
1947   for (x = 0; x < K; x++) {
1948     if (di1[x][x]                             != 0)     esl_fatal(logmsg);
1949     if (di2[x][x]                             != 0)     esl_fatal(logmsg);
1950   }
1951   if (memcmp(ds2, dsq, sizeof(ESL_DSQ)*(L+2)) == 0)     esl_fatal(logmsg);
1952 
1953   free(dsq);
1954   free(ds2);
1955   free(p);
1956   free(m1);
1957   free(m2);
1958   esl_arr2_Destroy((void **) di1, K);
1959   esl_arr2_Destroy((void **) di2, K);
1960   return;
1961 
1962  ERROR:
1963   esl_fatal(logmsg);
1964 }
1965 
1966 /* utest_markov1_bug()
1967  *
1968  * Given a sequence like AAAAAAAAAT, where a residue only occurs once
1969  * and at the end of the sequence, a bug can appear: a Markov chain
1970  * can transit to T, but can't leave. Easel handles this by
1971  * counting Markov statistics as if the input sequence were circular.
1972  */
1973 static void
utest_markov1_bug(ESL_RANDOMNESS * r)1974 utest_markov1_bug(ESL_RANDOMNESS *r)
1975 {
1976   char    logmsg[]  = "Failure in markov1_bug test (zero/absorbing transition)";
1977   char    testseq[] = "AAAAAAAAAT";
1978   char   *seq       = NULL;
1979   ESL_DSQ testdsq[] = { eslDSQ_SENTINEL,0,0,0,0,0,0,0,0,0,3,eslDSQ_SENTINEL};
1980   ESL_DSQ *dsq      = NULL;
1981   int     L         = strlen(testseq);
1982   int    *mono      = NULL;
1983   int   **di        = NULL;
1984   int     N         = 100;
1985   int     i;
1986 
1987   if ((seq = malloc(sizeof(char)    * (L+1))) == NULL)    esl_fatal(logmsg);
1988   if ((dsq = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL)    esl_fatal(logmsg);
1989 
1990   if (composition_allocate(4, &mono, &di)       != eslOK) esl_fatal(logmsg);
1991   for (i = 0; i < N; i++) {
1992     if (esl_rsq_XMarkov1(r, testdsq, L, 4, dsq) != eslOK) esl_fatal(logmsg);
1993     if (xcomposition(testdsq, L, 4, mono, di)   != eslOK) esl_fatal(logmsg);
1994     if (mono[0] + mono[3] != L)                           esl_fatal(logmsg);
1995   }
1996   esl_arr2_Destroy((void **) di, 4);
1997   free(mono);
1998 
1999   if (composition_allocate(26, &mono, &di) != eslOK) esl_fatal(logmsg);
2000   for (i = 0; i < N; i++) {
2001     if (esl_rsq_CMarkov1(r, testseq, seq)  != eslOK) esl_fatal(logmsg);
2002     if (composition(seq, L, mono, di)      != eslOK) esl_fatal(logmsg);
2003     if (mono[0] + mono['T'-'A'] != L)                esl_fatal(logmsg);
2004   }
2005   esl_arr2_Destroy((void **) di, 26);
2006   free(mono);
2007   free(seq);
2008   free(dsq);
2009 }
2010 
2011 #endif /*eslRANDOMSEQ_TESTDRIVE*/
2012 /*------------------ end, unit tests ----------------------------*/
2013 
2014 /*****************************************************************
2015  * 9. Test driver.
2016  *****************************************************************/
2017 #ifdef eslRANDOMSEQ_TESTDRIVE
2018 /* gcc -g -Wall -o randomseq_utest -L. -I. -DeslRANDOMSEQ_TESTDRIVE esl_randomseq.c -leasel -lm
2019  */
2020 
2021 #include "esl_config.h"
2022 
2023 #include <stdio.h>
2024 #include <string.h>
2025 
2026 #include "easel.h"
2027 #include "esl_getopts.h"
2028 #include "esl_random.h"
2029 #include "esl_randomseq.h"
2030 
2031 static ESL_OPTIONS options[] = {
2032   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
2033   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
2034   { "-L",        eslARG_INT,   "1000",  NULL, NULL,  NULL,  NULL, NULL, "length of random sequences",                       0 },
2035   { "-s",        eslARG_INT,     "42",  NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                    0 },
2036   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2037 };
2038 static char usage[]  = "[-options]";
2039 static char banner[] = "test driver for randomseq module";
2040 
2041 int
main(int argc,char ** argv)2042 main(int argc, char **argv)
2043 {
2044   ESL_GETOPTS    *go       = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2045   ESL_RANDOMNESS *r        = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
2046   char           *alphabet = "ACGT";
2047   int             K        = strlen(alphabet);
2048   int             L        = esl_opt_GetInteger(go, "-L");
2049 
2050   utest_CShufflers(r, L, alphabet, K);
2051   utest_CMarkovs  (r, L, alphabet);
2052   utest_XShufflers(r, L, K);
2053   utest_XMarkovs  (r, L, K);
2054 
2055   utest_markov1_bug(r);
2056 
2057   esl_randomness_Destroy(r);
2058   esl_getopts_Destroy(go);
2059   return 0;
2060 }
2061 
2062 #endif /*eslRANDOMSEQ_TESTDRIVE*/
2063 /*----------------- end, test driver ----------------------------*/
2064 
2065 
2066 /*****************************************************************
2067  * 10. Example.
2068  *****************************************************************/
2069 #ifdef eslRANDOMSEQ_EXAMPLE
2070 /*::cexcerpt::randomseq_example::begin::*/
2071 /* compile: gcc -g -Wall -I. -o example -DeslRANDOMSEQ_EXAMPLE esl_randomseq.c\
2072             esl_random.c esl_sqio.c esl_sq.c easel.c -lm
2073  * run:     ./example <FASTA file>
2074  */
2075 #include "easel.h"
2076 #include "esl_sq.h"
2077 #include "esl_sqio.h"
2078 #include "esl_random.h"
2079 #include "esl_randomseq.h"
2080 
2081 int
main(int argc,char ** argv)2082 main(int argc, char **argv)
2083 {
2084   char           *seqfile = argv[1];
2085   int             format  = eslSQFILE_UNKNOWN;
2086   ESL_SQFILE     *sqfp    = NULL;
2087   ESL_SQ         *sq      = esl_sq_Create();
2088   ESL_RANDOMNESS *r       = esl_randomness_Create(0);
2089   int             status;
2090 
2091   if (esl_sqfile_Open(seqfile, format, NULL, &sqfp) != eslOK)
2092     esl_fatal("Failed to open %s\n", seqfile);
2093 
2094   while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
2095   {
2096     printf("[Original sequence:]\n");
2097     esl_sqio_Write(stdout, sq, eslSQFILE_FASTA);
2098 
2099     printf("[After shuffling:]\n");
2100     esl_rsq_CShuffle(r, sq->seq, sq->seq); /* shuffle in place */
2101     esl_sqio_Write(stdout, sq, eslSQFILE_FASTA);
2102 
2103     esl_sq_Reuse(sq);
2104   }
2105   if (status != eslEOF) esl_fatal("Parse failed");
2106   esl_sqfile_Close(sqfp);
2107 
2108   esl_sq_Destroy(sq);
2109   esl_randomness_Destroy(r);
2110   return 0;
2111 }
2112 /*::cexcerpt::randomseq_example::end::*/
2113 #endif /*eslRANDOMSEQ_EXAMPLE*/
2114 /*--------------------- end, example ----------------------------*/
2115