1 /* Generating, shuffling, and randomizing sequences.
2 *
3 * Contents:
4 * 1. Generating simple random character strings.
5 * 2. Generating iid sequences.
6 * 3. Shuffling sequences.
7 * 4. Randomizing sequences.
8 * 5. Generating iid sequences (digital mode).
9 * 6. Shuffling sequences (digital mode).
10 * 7. Randomizing sequences (digital mode).
11 * 8. Statistics drivers.
12 * 9. Unit tests.
13 * 10. Test driver.
14 * 11. Example.
15 */
16 #include "esl_config.h"
17
18 #include <stdlib.h>
19 #include <string.h>
20 #include <ctype.h>
21 #include <math.h>
22 #include <time.h>
23
24 #include "easel.h"
25 #include "esl_alphabet.h"
26 #include "esl_arr2.h"
27 #include "esl_random.h"
28 #include "esl_randomseq.h"
29
30
31 /*****************************************************************
32 * 1. Generating simple random character strings.
33 *****************************************************************/
34
35 /* Function: esl_rsq_Sample()
36 * Synopsis: Generate a random character string.
37 *
38 * Purpose: Sample a random character string of length <L>,
39 * consisting of characters in the set defined by
40 * an integer flag <allowed_chars>, using
41 * random number generator <rng>.
42 *
43 * Return the new NUL-terminated string in <*ret_s>. This
44 * may either be a new allocation, or in pre-allocated
45 * storage provided by the caller. If caller passes
46 * <*ret_s> as <NULL>, new space is allocated, and the
47 * caller is responsible for freeing it. That is:
48 * \begin{cchunk}
49 * char *s = NULL;
50 * esl_rsq_Sample(..., &s);
51 * free(s);
52 * \end{cchunk}
53 *
54 * If caller passes a non-<NULL> <*ret_s>, it is assumed to
55 * be a preallocated space of at least <L+1> characters,
56 * and caller is (of course) responsible for freeing
57 * it. That is:
58 * \begin{cchunk}
59 * char *s = malloc(L+1);
60 * esl_rsq_Sample(...,L, &s);
61 * free(s);
62 * \end{cchunk}
63 *
64 * Allowed values of the flag <allowed_char_flag> mirror
65 * the standard C99 character set functions in <ctype.h>:
66 *
67 * | <eslRSQ_SAMPLE_ALNUM> | isalnum() | isalpha() or isdigit() |
68 * | <eslRSQ_SAMPLE_ALPHA> | isalpha() | islower() or isupper() |
69 * | <eslRSQ_SAMPLE_LOWER> | islower() | [a-z] |
70 * | <eslRSQ_SAMPLE_UPPER> | isupper() | [A-Z] |
71 * | <eslRSQ_SAMPLE_DIGIT> | isdigit() | [0-9] |
72 * | <eslRSQ_SAMPLE_XDIGIT> | isxdigit() | [0-9] or [a-f] or [A-F] |
73 * | <eslRSQ_SAMPLE_CNTRL> | iscntrl() | ASCII control characters |
74 * | <eslRSQ_SAMPLE_GRAPH> | isgraph() | any printing char except space |
75 * | <eslRSQ_SAMPLE_SPACE> | isspace() | space, and other whitespace such as tab, newline |
76 * | <eslRSQ_SAMPLE_BLANK> | isblank() | space or tab |
77 * | <eslRSQ_SAMPLE_PRINT> | isprint() | any printing char including space |
78 * | <eslRSQ_SAMPLE_PUNCT> | ispunct() | punctuation |
79 *
80 * Note that with <eslRSQ_SAMPLE_CNTRL>, your string
81 * may sample NUL control characters (<0>), in addition to
82 * the string-terminating one at <(*ret_s)[L]>, so <strlen(*ret_s)>
83 * may not equal <L> in this case.
84 *
85 * These values are exclusive: you use one and only one of
86 * them as <allowed_chars>, you can't logically OR or NOT
87 * them together.
88 *
89 * Args: rng - ESL_RANDOMNESS object, the random number generator
90 * allowed_chars - allowed character set flag: eslRSQ_SAMPLE_*
91 * L - length of string to sample
92 * ret_s - RETURN: the NUL-terminated string
93 *
94 * Returns: <eslOK> on success; <*ret_s> is the sampled string, which was
95 * newly allocated here, and caller becomes responsible for free'ing.
96 *
97 * Throws: <eslEMEM> on allocation error; <eslEINVAL> if caller
98 * passes an invalid value of <allowed_chars>. Now <*ret_s>
99 * is <NULL>.
100 */
101 int
esl_rsq_Sample(ESL_RANDOMNESS * rng,int allowed_chars,int L,char ** ret_s)102 esl_rsq_Sample(ESL_RANDOMNESS *rng, int allowed_chars, int L, char **ret_s)
103 {
104 char *s = *ret_s; // if s == NULL, we will allocate here. Else, we're using caller-provided allocation
105 int n = 0;
106 char c[127];
107 int x,i;
108 int status;
109
110 /* We can't portably make assumptions about char codes (EBCDIC,
111 * ASCII...); and we don't want to write a bunch of fiddly overhead
112 * initializing static tables. So, quickly and portably build an
113 * array c[0..n-1] of characters we will sample uniformly from.
114 * RNG sampling is fairly compute-intensive anyway, so this time
115 * should disappear in the noise of that.
116 */
117 switch (allowed_chars) {
118 case eslRSQ_SAMPLE_ALNUM: for (x = 0; x < 128; x++) if (isalnum(x)) c[n++] = x; break;
119 case eslRSQ_SAMPLE_ALPHA: for (x = 0; x < 128; x++) if (isalpha(x)) c[n++] = x; break;
120 case eslRSQ_SAMPLE_LOWER: for (x = 0; x < 128; x++) if (islower(x)) c[n++] = x; break;
121 case eslRSQ_SAMPLE_UPPER: for (x = 0; x < 128; x++) if (isupper(x)) c[n++] = x; break;
122 case eslRSQ_SAMPLE_DIGIT: for (x = 0; x < 128; x++) if (isdigit(x)) c[n++] = x; break;
123 case eslRSQ_SAMPLE_XDIGIT: for (x = 0; x < 128; x++) if (isxdigit(x)) c[n++] = x; break;
124 case eslRSQ_SAMPLE_CNTRL: for (x = 0; x < 128; x++) if (iscntrl(x)) c[n++] = x; break;
125 case eslRSQ_SAMPLE_GRAPH: for (x = 0; x < 128; x++) if (isgraph(x)) c[n++] = x; break;
126 case eslRSQ_SAMPLE_SPACE: for (x = 0; x < 128; x++) if (isspace(x)) c[n++] = x; break;
127 case eslRSQ_SAMPLE_BLANK: for (x = 0; x < 128; x++) if (isblank(x)) c[n++] = x; break;
128 case eslRSQ_SAMPLE_PRINT: for (x = 0; x < 128; x++) if (isprint(x)) c[n++] = x; break;
129 case eslRSQ_SAMPLE_PUNCT: for (x = 0; x < 128; x++) if (ispunct(x)) c[n++] = x; break;
130 default: ESL_XEXCEPTION(eslEINVAL, "bad flag; wanted something like eslRSQ_SAMPLE_ALPHA");
131 }
132
133 if (!s) ESL_ALLOC(s, sizeof(char) * (L+1)); /* +\0 */
134
135 for (i = 0; i < L; i++)
136 s[i] = c[ esl_rnd_Roll(rng, n) ];
137 s[L] = '\0';
138
139 *ret_s = s; // if using caller-provided space, this is a no-op, passing back the same *ret_s we got.
140 return eslOK;
141
142 ERROR:
143 if (! *ret_s && s) free(s); // if we allocated s here, clean up.
144 return status;
145 }
146
147
148 /*****************************************************************
149 *# 1. Generating iid sequences.
150 *****************************************************************/
151
152 /* Function: esl_rsq_IID()
153 * Synopsis: Generate an iid random text sequence.
154 * Incept: SRE, Thu Aug 5 09:03:03 2004 [St. Louis]
155 *
156 * Purpose: Generate a <NUL>-terminated i.i.d. symbol string of length <L>,
157 * $0..L-1$, and leave it in <s>. The symbol alphabet is given
158 * as a string <alphabet> of <K> total symbols, and the iid
159 * probability of each residue is given in <p>. The caller
160 * must provide an <s> that is allocated for at least
161 * <(L+1)*sizeof(char)>, room for <L> residues and the <NUL> terminator.
162 *
163 * <esl_rsq_fIID()> does the same, but for a floating point
164 * probability vector <p>, rather than a double precision
165 * vector.
166 *
167 * Args: r - ESL_RANDOMNESS object
168 * alphabet - e.g. "ACGT"
169 * p - probability distribution [0..n-1]
170 * K - number of symbols in alphabet
171 * L - length of generated sequence
172 * s - the generated sequence.
173 * Caller allocated, >= (L+1) * sizeof(char).
174 *
175 * Return: <eslOK> on success.
176 */
177 int
esl_rsq_IID(ESL_RANDOMNESS * r,const char * alphabet,const double * p,int K,int L,char * s)178 esl_rsq_IID(ESL_RANDOMNESS *r, const char *alphabet, const double *p, int K, int L, char *s)
179 {
180 int x;
181
182 for (x = 0; x < L; x++)
183 s[x] = alphabet[esl_rnd_DChoose(r,p,K)];
184 s[x] = '\0';
185 return eslOK;
186 }
187 int
esl_rsq_fIID(ESL_RANDOMNESS * r,const char * alphabet,const float * p,int K,int L,char * s)188 esl_rsq_fIID(ESL_RANDOMNESS *r, const char *alphabet, const float *p, int K, int L, char *s)
189 {
190 int x;
191
192 for (x = 0; x < L; x++)
193 s[x] = alphabet[esl_rnd_FChoose(r,p,K)];
194 s[x] = '\0';
195 return eslOK;
196 }
197 /*------------ end, generating iid sequences --------------------*/
198
199
200 /*****************************************************************
201 *# 2. Shuffling sequences.
202 *****************************************************************/
203
204 /* Function: esl_rsq_CShuffle()
205 * Synopsis: Shuffle a text sequence.
206 * Incept: SRE, Fri Feb 23 08:17:50 2007 [Casa de Gatos]
207 *
208 * Purpose: Returns a shuffled version of <s> in <shuffled>, given
209 * a source of randomness <r>.
210 *
211 * Caller provides allocated storage for <shuffled>, for at
212 * least the same length as <s>.
213 *
214 * <shuffled> may also point to the same storage as <s>,
215 * in which case <s> is shuffled in place.
216 *
217 * Returns: <eslOK> on success.
218 */
219 int
esl_rsq_CShuffle(ESL_RANDOMNESS * r,const char * s,char * shuffled)220 esl_rsq_CShuffle(ESL_RANDOMNESS *r, const char *s, char *shuffled)
221 {
222 int L, i;
223 char c;
224
225 L = strlen(s);
226 if (shuffled != s) strcpy(shuffled, s);
227 while (L > 1) {
228 i = esl_rnd_Roll(r, L);
229 c = shuffled[i];
230 shuffled[i] = shuffled[L-1];
231 shuffled[L-1] = c;
232 L--;
233 }
234 return eslOK;
235 }
236
237 /* Function: esl_rsq_CShuffleDP()
238 * Synopsis: Shuffle a text sequence, preserving diresidue composition.
239 * Incept: SRE, Fri Feb 23 08:56:03 2007 [Casa de Gatos]
240 *
241 * Purpose: Given string <s>, and a source of randomness <r>,
242 * returns shuffled version in <shuffled>. The shuffle
243 * is a "doublet-preserving" (DP) shuffle which
244 * shuffles a sequence while exactly preserving both mono-
245 * and di-symbol composition.
246 *
247 * <s> may only consist of alphabetic characters [a-zA-Z].
248 * The shuffle is done case-insensitively. The shuffled
249 * string result is all upper case.
250 *
251 * Caller provides storage in <shuffled> of at least the
252 * same length as <s>.
253 *
254 * <shuffled> may also point to the same storage as <s>,
255 * in which case <s> is shuffled in place.
256 *
257 * The algorithm does an internal allocation of a
258 * substantial amount of temporary storage, on the order of
259 * <26 * strlen(s)>, so an allocation failure is possible
260 * if <s> is long enough.
261 *
262 * The algorithm is a search for a random Eulerian walk on
263 * a directed multigraph \citep{AltschulErickson85}.
264 *
265 * If <s> is of length 2 or less, this is a no-op, and
266 * <shuffled> is a copy of <s>.
267 *
268 * Returns: <eslOK> on success.
269 *
270 * Throws: <eslEINVAL> if <s> contains nonalphabetic characters.
271 * <eslEMEM> on allocation failure.
272 */
273 int
esl_rsq_CShuffleDP(ESL_RANDOMNESS * r,const char * s,char * shuffled)274 esl_rsq_CShuffleDP(ESL_RANDOMNESS *r, const char *s, char *shuffled)
275 {
276 int status; /* Easel return status code */
277 int len; /* length of s */
278 int pos; /* a position in s or shuffled */
279 int x,y; /* indices of two characters */
280 char **E = NULL; /* edge lists: E[0] is the edge list from vertex A */
281 int *nE = NULL; /* lengths of edge lists */
282 int *iE = NULL; /* positions in edge lists */
283 int n; /* tmp: remaining length of an edge list to be shuffled */
284 char sf; /* last character in shuffled */
285 char Z[26]; /* connectivity in last edge graph Z */
286 int keep_connecting; /* flag used in Z connectivity algorithm */
287 int is_eulerian; /* flag used for when we've got a good Z */
288
289 /* First, verify that the string is entirely alphabetic. */
290 len = strlen(s);
291 for (pos = 0; pos < len; pos++)
292 if (! isalpha((int) s[pos]))
293 ESL_EXCEPTION(eslEINVAL, "String contains nonalphabetic characters");
294
295 /* The edge case of len <= 2 */
296 if (len <= 2)
297 {
298 if (s != shuffled) strcpy(shuffled, s);
299 return eslOK;
300 }
301
302 /* Allocations. */
303 ESL_ALLOC(E, sizeof(char *) * 26); for (x = 0; x < 26; x++) E[x] = NULL;
304 ESL_ALLOC(nE, sizeof(int) * 26); for (x = 0; x < 26; x++) nE[x] = 0;
305 ESL_ALLOC(iE, sizeof(int) * 26); for (x = 0; x < 26; x++) iE[x] = 0;
306 for (x = 0; x < 26; x++)
307 ESL_ALLOC(E[x], sizeof(char) * (len-1));
308
309 /* "(1) Construct the doublet graph G and edge ordering E
310 * corresponding to S."
311 *
312 * Note that these also imply the graph G; and note,
313 * for any list x with nE[x] = 0, vertex x is not part
314 * of G.
315 */
316 x = toupper((int) s[0]) - 'A';
317 for (pos = 1; pos < len; pos++)
318 {
319 y = toupper((int) s[pos]) - 'A';
320 E[x][nE[x]] = y;
321 nE[x]++;
322 x = y;
323 }
324
325 /* Now we have to find a random Eulerian edge ordering. */
326 sf = toupper((int) s[len-1]) - 'A';
327 is_eulerian = 0;
328 while (! is_eulerian)
329 {
330 /* "(2) For each vertex s in G except s_f, randomly select
331 * one edge from the s edge list of E(S) to be the
332 * last edge of the s list in a new edge ordering."
333 *
334 * select random edges and move them to the end of each
335 * edge list.
336 */
337 for (x = 0; x < 26; x++)
338 {
339 if (nE[x] == 0 || x == sf) continue;
340 pos = esl_rnd_Roll(r, nE[x]);
341 ESL_SWAP(E[x][pos], E[x][nE[x]-1], char);
342 }
343
344 /* "(3) From this last set of edges, construct the last-edge
345 * graph Z and determine whether or not all of its
346 * vertices are connected to s_f."
347 *
348 * a probably stupid algorithm for looking at the
349 * connectivity in Z: iteratively sweep through the
350 * edges in Z, and build up an array (confusing called Z[x])
351 * whose elements are 1 if x is connected to sf, else 0.
352 */
353 for (x = 0; x < 26; x++) Z[x] = 0;
354 Z[(int) sf] = keep_connecting = 1;
355
356 while (keep_connecting) {
357 keep_connecting = 0;
358 for (x = 0; x < 26; x++) {
359 if (nE[x] == 0) continue;
360 y = E[x][nE[x]-1]; /* xy is an edge in Z */
361 if (Z[x] == 0 && Z[y] == 1) { /* x is connected to sf in Z */
362 Z[x] = 1;
363 keep_connecting = 1;
364 }
365 }
366 }
367
368 /* if any vertex in Z is tagged with a 0, it's
369 * not connected to sf, and we won't have a Eulerian
370 * walk.
371 */
372 is_eulerian = 1;
373 for (x = 0; x < 26; x++) {
374 if (nE[x] == 0 || x == sf) continue;
375 if (Z[x] == 0) {
376 is_eulerian = 0;
377 break;
378 }
379 }
380
381 /* "(4) If any vertex is not connected in Z to s_f, the
382 * new edge ordering will not be Eulerian, so return to
383 * (2). If all vertices are connected in Z to s_f,
384 * the new edge ordering will be Eulerian, so
385 * continue to (5)."
386 *
387 * e.g. note infinite loop while is_eulerian is FALSE.
388 */
389 }
390
391 /* "(5) For each vertex s in G, randomly permute the remaining
392 * edges of the s edge list of E(S) to generate the s
393 * edge list of the new edge ordering E(S')."
394 *
395 * Essentially a StrShuffle() on the remaining nE[x]-1 elements
396 * of each edge list; unfortunately our edge lists are arrays,
397 * not strings, so we can't just call out to StrShuffle().
398 */
399 for (x = 0; x < 26; x++)
400 for (n = nE[x] - 1; n > 1; n--)
401 {
402 pos = esl_rnd_Roll(r, n);
403 ESL_SWAP(E[x][pos], E[x][n-1], char);
404 }
405
406 /* "(6) Construct sequence S', a random DP permutation of
407 * S, from E(S') as follows. Start at the s_1 edge list.
408 * At each s_i edge list, add s_i to S', delete the
409 * first edge s_i,s_j of the edge list, and move to
410 * the s_j edge list. Continue this process until
411 * all edge lists are exhausted."
412 */
413 pos = 0;
414 x = toupper((int) s[0]) - 'A';
415 while (1)
416 {
417 shuffled[pos++] = 'A'+ x; /* add s_i to S' */
418
419 y = E[x][iE[x]];
420 iE[x]++; /* "delete" s_i,s_j from edge list */
421
422 x = y; /* move to s_j edge list. */
423
424 if (iE[x] == nE[x])
425 break; /* the edge list is exhausted. */
426 }
427 shuffled[pos++] = 'A' + sf;
428 shuffled[pos] = '\0';
429
430 /* Reality checks.
431 */
432 if (x != sf) ESL_XEXCEPTION(eslEINCONCEIVABLE, "hey, you didn't end on s_f.");
433 if (pos != len) ESL_XEXCEPTION(eslEINCONCEIVABLE, "hey, pos (%d) != len (%d).", pos, len);
434
435 /* Free and return.
436 */
437 esl_arr2_Destroy((void **) E, 26);
438 free(nE);
439 free(iE);
440 return eslOK;
441
442 ERROR:
443 esl_arr2_Destroy((void **) E, 26);
444 if (nE != NULL) free(nE);
445 if (iE != NULL) free(iE);
446 return status;
447 }
448
449
450 /* Function: esl_rsq_CShuffleKmers()
451 * Synopsis: Shuffle k-mers in a text sequence.
452 * Incept: SRE, Tue Nov 17 16:55:57 2009 [NHGRI retreat, Gettysburg]
453 *
454 * Purpose: Consider a text sequence <s> as a string of nonoverlapping
455 * k-mers of length <K>. Shuffle the k-mers, given a random
456 * number generator <r>. Put the shuffled sequence in
457 * <shuffled>.
458 *
459 * If the length of <s> is not evenly divisible by <K>, the
460 * remaining residues are left (unshuffled) as a prefix to
461 * the shuffled k-mers.
462 *
463 * For example, shuffling ABCDEFGHIJK as k=3-mers might
464 * result in ABFIJKFGHCDE.
465 *
466 * Caller provides allocated storage for <shuffled>,
467 * for at least the same length as <s>.
468 *
469 * <shuffled> may also point to the same storage as <s>,
470 * in which case <s> is shuffled in place.
471 *
472 * There is almost no formally justifiable reason why you'd
473 * use this shuffle -- it's not like it preserves any
474 * particularly well-defined statistical properties of the
475 * sequence -- but it's a quick and dirty way to sort of
476 * maybe possibly preserve some higher-than-monomer
477 * statistics.
478 *
479 * Args: r - an <ESL_RANDOMNESS> random generator
480 * s - sequence to shuffle
481 * K - size of k-mers to break <s> into
482 * shuffled - RESULT: the shuffled sequence
483 *
484 * Returns: <eslOK> on success.
485 *
486 * Throws: <eslEMEM> on allocation error.
487 */
488 int
esl_rsq_CShuffleKmers(ESL_RANDOMNESS * r,const char * s,int K,char * shuffled)489 esl_rsq_CShuffleKmers(ESL_RANDOMNESS *r, const char *s, int K, char *shuffled)
490 {
491 int L = strlen(s);
492 int W = L / K; /* number of kmers "words" excluding leftover prefix */
493 int P = L % K; /* leftover residues in prefix */
494 int i;
495 char *swap = NULL;
496 int status;
497
498 if (shuffled != s) strcpy(shuffled, s);
499 ESL_ALLOC(swap, sizeof(char) * K);
500 while (W > 1)
501 { /* use memmove, not strncpy or memcpy, because i==W-1 creates an overlap case */
502 i = esl_rnd_Roll(r, W); /* pick a word */
503 memmove(swap, shuffled + P + i*K, K * sizeof(char)); /* copy it to tmp space */
504 memmove(shuffled + P + i*K, shuffled + P + (W-1)*K, K * sizeof(char)); /* move word W-1 to i */
505 memmove(shuffled + P + (W-1)*K, swap, K * sizeof(char)); /* move word i to W-1 */
506 W--;
507 }
508 free(swap);
509 return eslOK;
510
511 ERROR:
512 free(swap);
513 return status;
514 }
515
516
517 /* Function: esl_rsq_CReverse()
518 * Synopsis: Reverse a string.
519 * Incept: SRE, Sat Feb 24 10:06:34 2007 [Casa de Gatos]
520 *
521 * Purpose: Returns a reversed version of <s> in <rev>.
522 *
523 * There are no restrictions on the symbols that <s>
524 * might contain.
525 *
526 * Caller provides storage in <rev> for at least
527 * <(strlen(s)+1)*sizeof(char)>.
528 *
529 * <s> and <rev> can point to the same storage, in which
530 * case <s> is reversed in place.
531 *
532 * Returns: <eslOK> on success.
533 */
534 int
esl_rsq_CReverse(const char * s,char * rev)535 esl_rsq_CReverse(const char *s, char *rev)
536 {
537 int L, i;
538 char c;
539
540 L = strlen(s);
541 for (i = 0; i < L/2; i++)
542 { /* swap ends */
543 c = s[L-i-1];
544 rev[L-i-1] = s[i];
545 rev[i] = c;
546 }
547 if (L%2) { rev[i] = s[i]; } /* don't forget middle residue in odd-length s */
548 rev[L] = '\0';
549 return eslOK;
550 }
551
552 /* Function: esl_rsq_CShuffleWindows()
553 * Synopsis: Shuffle local windows of a text string.
554 * Incept: SRE, Sat Feb 24 10:17:59 2007 [Casa de Gatos]
555 *
556 * Purpose: Given string <s>, shuffle residues in nonoverlapping
557 * windows of width <w>, and put the result in <shuffled>.
558 * See [Pearson88].
559 *
560 * <s> and <shuffled> can be identical to shuffle in place.
561 *
562 * Caller provides storage in <shuffled> for at least
563 * <(strlen(s)+1)*sizeof(char)>.
564 *
565 * Args: s - string to shuffle in windows
566 * w - window size (typically 10 or 20)
567 * shuffled - allocated space for window-shuffled result.
568 *
569 * Return: <eslOK> on success.
570 */
571 int
esl_rsq_CShuffleWindows(ESL_RANDOMNESS * r,const char * s,int w,char * shuffled)572 esl_rsq_CShuffleWindows(ESL_RANDOMNESS *r, const char *s, int w, char *shuffled)
573 {
574 int L;
575 char c;
576 int i, j, k;
577
578 L = strlen(s);
579 if (shuffled != s) strcpy(shuffled, s);
580 for (i = 0; i < L; i += w)
581 for (j = ESL_MIN(L-1, i+w-1); j > i; j--)
582 {
583 k = i + esl_rnd_Roll(r, j-i);
584 c = shuffled[k]; /* semantics of a j,k swap, because we might be shuffling in-place */
585 shuffled[k] = shuffled[j];
586 shuffled[j] = c;
587 }
588 return eslOK;
589 }
590 /*------------------ end, shuffling sequences -------------------*/
591
592
593
594 /*****************************************************************
595 *# 3. Randomizing sequences
596 *****************************************************************/
597
598 /* Function: esl_rsq_CMarkov0()
599 * Synopsis: Generate new text string of same 0th order Markov properties.
600 * Incept: SRE, Sat Feb 24 08:47:43 2007 [Casa de Gatos]
601 *
602 * Purpose: Makes a random string <markoved> with the same length and
603 * 0-th order Markov properties as <s>, given randomness
604 * source <r>.
605 *
606 * <s> and <markoved> can be point to the same storage, in which
607 * case <s> is randomized in place, destroying the original
608 * string.
609 *
610 * <s> must consist only of alphabetic characters [a-zA-Z].
611 * Statistics are collected case-insensitively over 26 possible
612 * residues. The random string is generated all upper case.
613 *
614 * Args: s - input string
615 * markoved - randomly generated string
616 * (storage allocated by caller, at least strlen(s)+1)
617 *
618 * Returns: <eslOK> on success.
619 *
620 * Throws: <eslEINVAL> if <s> contains nonalphabetic characters.
621 */
622 int
esl_rsq_CMarkov0(ESL_RANDOMNESS * r,const char * s,char * markoved)623 esl_rsq_CMarkov0(ESL_RANDOMNESS *r, const char *s, char *markoved)
624 {
625 int L;
626 int i;
627 double p[26]; /* initially counts, then probabilities */
628 int x;
629
630 /* First, verify that the string is entirely alphabetic. */
631 L = strlen(s);
632 for (i = 0; i < L; i++)
633 if (! isalpha((int) s[i]))
634 ESL_EXCEPTION(eslEINVAL, "String contains nonalphabetic characters");
635
636 /* Collect zeroth order counts and convert to frequencies.
637 */
638 for (x = 0; x < 26; x++) p[x] = 0.;
639 for (i = 0; i < L; i++)
640 p[(int)(toupper((int) s[i]) - 'A')] += 1.0;
641 if (L > 0)
642 for (x = 0; x < 26; x++) p[x] /= (double) L;
643
644 /* Generate a random string using those p's. */
645 for (i = 0; i < L; i++)
646 markoved[i] = esl_rnd_DChoose(r, p, 26) + 'A';
647 markoved[i] = '\0';
648
649 return eslOK;
650 }
651
652 /* Function: esl_rsq_CMarkov1()
653 * Synopsis: Generate new text string of same 1st order Markov properties.
654 * Incept: SRE, Sat Feb 24 09:21:46 2007 [Casa de Gatos]
655 *
656 * Purpose: Makes a random string <markoved> with the same length and
657 * 1st order (di-residue) Markov properties as <s>, given
658 * randomness source <r>.
659 *
660 * <s> and <markoved> can be point to the same storage, in which
661 * case <s> is randomized in place, destroying the original
662 * string.
663 *
664 * <s> must consist only of alphabetic characters [a-zA-Z].
665 * Statistics are collected case-insensitively over 26 possible
666 * residues. The random string is generated all upper case.
667 *
668 * If <s> is of length 2 or less, this is a no-op, and
669 * <markoved> is a copy of <s>.
670 *
671 * Args: s - input string
672 * markoved - new randomly generated string
673 * (storage allocated by caller, at least strlen(s)+1)
674 *
675 * Returns: <eslOK> on success.
676 *
677 * Throws: <eslEINVAL> if <s> contains nonalphabetic characters.
678 */
679 int
esl_rsq_CMarkov1(ESL_RANDOMNESS * r,const char * s,char * markoved)680 esl_rsq_CMarkov1(ESL_RANDOMNESS *r, const char *s, char *markoved)
681 {
682 int L;
683 int i;
684 int x,y;
685 int i0; /* initial symbol */
686 double p[26][26]; /* conditional probabilities p[x][y] = P(y | x) */
687 double p0[26]; /* marginal probabilities P(x), just for initial residue. */
688
689 /* First, verify that the string is entirely alphabetic. */
690 L = strlen(s);
691 for (i = 0; i < L; i++)
692 if (! isalpha((int) s[i]))
693 ESL_EXCEPTION(eslEINVAL, "String contains nonalphabetic characters");
694
695 /* The edge case of len <= 2 */
696 if (L <= 2)
697 {
698 if (s != markoved) strcpy(markoved, s);
699 return eslOK;
700 }
701
702 /* Collect first order counts and convert to frequencies. */
703 for (x = 0; x < 26; x++)
704 for (y = 0; y < 26; y++)
705 p[x][y] = 0.;
706
707 i0 = x = toupper((int) s[0]) - 'A';
708 for (i = 1; i < L; i++)
709 {
710 y = toupper((int) s[i]) - 'A';
711 p[x][y] += 1.0;
712 x = y;
713 }
714 p[x][i0] += 1.0; /* "circularized": avoids a bug; see markov1_bug utest */
715
716 for (x = 0; x < 26; x++)
717 {
718 p0[x] = 0.;
719 for (y = 0; y < 26; y++)
720 p0[x] += p[x][y]; /* now p0[x] = marginal counts of x, inclusive of 1st residue */
721
722 for (y = 0; y < 26; y++)
723 p[x][y] = (p0[x] > 0. ? p[x][y] / p0[x] : 0.); /* now p[x][y] = P(y | x) */
724
725 p0[x] /= (double) L; /* now p0[x] = marginal P(x) */
726 }
727
728 /* Generate a random string using those p's. */
729 x = esl_rnd_DChoose(r, p0, 26);
730 markoved[0] = x + 'A';
731 for (i = 1; i < L; i++)
732 {
733 y = esl_rnd_DChoose(r, p[x], 26);
734 markoved[i] = y + 'A';
735 x = y;
736 }
737 markoved[L] = '\0';
738
739 return eslOK;
740 }
741 /*----------------- end, randomizing sequences ------------------*/
742
743
744
745 /*****************************************************************
746 *# 4. Generating iid sequences (digital mode).
747 *****************************************************************/
748
749 /* Function: esl_rsq_xIID()
750 * Synopsis: Generate an iid random digital sequence.
751 * Incept: SRE, Sat Feb 17 16:39:01 2007 [Casa de Gatos]
752 *
753 * Purpose: Generate an i.i.d. digital sequence of length <L> (1..L) and
754 * leave it in <dsq>. The i.i.d. probability of each residue is
755 * given in the probability vector <p>, and the number of
756 * possible residues (the alphabet size) is given by <K>.
757 * (Only the alphabet size <K> is needed here, as opposed to
758 * a digital <ESL_ALPHABET>, but the caller presumably
759 * has a digital alphabet.) The caller must provide a <dsq>
760 * allocated for at least <L+2> residues of type <ESL_DSQ>,
761 * room for <L> residues and leading/trailing digital sentinel bytes.
762 *
763 * <esl_rsq_xfIID()> does the same, but for a
764 * single-precision float vector <p> rather than a
765 * double-precision vector <p>.
766 *
767 * As a special case, if <p> is <NULL>, sample residues uniformly.
768 *
769 * Args: r - ESL_RANDOMNESS object
770 * p - probability distribution [0..n-1] (or NULL for uniform)
771 * K - number of symbols in alphabet
772 * L - length of generated sequence
773 * ret_s - RETURN: the generated sequence.
774 * (Caller-allocated, >= (L+2)*ESL_DSQ)
775 *
776 * Return: <eslOK> on success.
777 */
778 int
esl_rsq_xIID(ESL_RANDOMNESS * r,const double * p,int K,int L,ESL_DSQ * dsq)779 esl_rsq_xIID(ESL_RANDOMNESS *r, const double *p, int K, int L, ESL_DSQ *dsq)
780 {
781 int x;
782
783 dsq[0] = dsq[L+1] = eslDSQ_SENTINEL;
784 for (x = 1; x <= L; x++)
785 dsq[x] = p ? esl_rnd_DChoose(r,p,K) : esl_rnd_Roll(r,K);
786 return eslOK;
787 }
788 int
esl_rsq_xfIID(ESL_RANDOMNESS * r,const float * p,int K,int L,ESL_DSQ * dsq)789 esl_rsq_xfIID(ESL_RANDOMNESS *r, const float *p, int K, int L, ESL_DSQ *dsq)
790 {
791 int x;
792
793 dsq[0] = dsq[L+1] = eslDSQ_SENTINEL;
794 for (x = 1; x <= L; x++)
795 dsq[x] = p ? esl_rnd_FChoose(r,p,K) : esl_rnd_Roll(r,K);
796 return eslOK;
797 }
798
799
800 /* Function: esl_rsq_SampleDirty()
801 * Synopsis: Sample a digital sequence with noncanonicals, optionally gaps.
802 * Incept: SRE, Wed Feb 17 10:57:28 2016 [H1/76]
803 *
804 * Purpose: Using random number generator <rng>, use probability
805 * vector <p> to sample an iid digital sequence in alphabet
806 * <abc> of length <L>. Store it in <dsq>.
807 *
808 * The <dsq> space, allocated by the caller, has room for
809 * at least <L+2> residues, counting the digital
810 * sentinels.
811 *
812 * Probability vector <p> has <Kp> terms, and sums to 1.0
813 * over them. The probabilities in <p> for residues <K>,
814 * <Kp-2>, and <Kp-1> (gap, nonresidue, missing) are
815 * typically zero, to generate a standard unaligned digital
816 * sequence with degenerate residues. To sample a random
817 * "alignment", <p[K]> is nonzero.
818 *
819 * If <p> is <NULL>, then we sample a probability vector
820 * according to the following rules, which generates
821 * *ungapped* dirtied random sequences:
822 * 1. Sample pc, the probability of canonical
823 * vs. noncanonical residues, uniformly on [0,1).
824 * 2. Sample a p[] uniformly for canonical residues
825 * <0..K-1>, and renormalize by multiplying by pc.
826 * Sample a different p[] uniformly for noncanonical
827 * residues <K+1..Kp-3>, and renormalize by (1-pc).
828 * 3. p[] = 0 for gap residue K, nonresidue Kp-2,
829 * missing residue Kp-1.
830 * This usage is mainly intended to make it easy to
831 * sample dirty edge cases for automated tests.
832 *
833 * Args: rng : random number generator
834 * abc : digital alphabet
835 * p : OPTIONAL: p[0..Kp-1] probability vector, or NULL
836 * L : length of digital sequence to sample
837 * dsq : resulting digital seq sample, caller-provided space
838 *
839 * Returns: <eslOK> on success
840 *
841 * Throws: <eslEMEM> on allocation failure.
842 */
843 int
esl_rsq_SampleDirty(ESL_RANDOMNESS * rng,ESL_ALPHABET * abc,double ** byp_p,int L,ESL_DSQ * dsq)844 esl_rsq_SampleDirty(ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, double **byp_p, int L, ESL_DSQ *dsq)
845 {
846 double *p = NULL;
847 int i;
848 int status;
849
850 /* If p isn't provided, sample one. */
851 if ( esl_byp_IsProvided(byp_p))
852 p = *byp_p;
853 else
854 {
855 double pc = esl_random(rng); /* [0,1) */
856 int x;
857
858 ESL_ALLOC(p, sizeof(double) * abc->Kp);
859
860 esl_rnd_Dirichlet(rng, NULL /* i.e. uniform */, abc->K, p);
861 esl_rnd_Dirichlet(rng, NULL, (abc->Kp - abc->K - 3), (p + abc->K +1)); /* K+1..Kp-3 range of alphabet */
862 for (x = 0; x < abc->K; x++) p[x] = p[x] * pc;
863 for (x = abc->K+1; x <= abc->Kp-3; x++) p[x] = p[x] * (1.-pc);
864 p[abc->K] = 0.;
865 p[abc->Kp-2] = 0.;
866 p[abc->Kp-1] = 0.;
867 }
868
869 dsq[0] = eslDSQ_SENTINEL;
870 for (i = 1; i <= L; i++)
871 dsq[i] = esl_rnd_DChoose(rng, p, abc->Kp);
872 dsq[L+1] = eslDSQ_SENTINEL;
873
874 if (esl_byp_IsReturned(byp_p)) *byp_p = p;
875 else if (esl_byp_IsInternal(byp_p)) free(p);
876 return eslOK;
877
878 ERROR:
879 if (! esl_byp_IsProvided(byp_p) && p) free(p);
880 if ( esl_byp_IsReturned(byp_p)) *byp_p = NULL;
881 return status;
882 }
883 /*--------------------- end, digital generation ---------------- */
884
885
886
887 /*****************************************************************
888 *# 5. Shuffling sequences (digital mode)
889 *****************************************************************/
890
891 /* Function: esl_rsq_XShuffle()
892 * Synopsis: Shuffle a digital sequence.
893 * Incept: SRE, Fri Feb 23 08:24:20 2007 [Casa de Gatos]
894 *
895 * Purpose: Given a digital sequence <dsq> of length <L> residues,
896 * shuffle it, and leave the shuffled version in <shuffled>.
897 *
898 * Caller provides allocated storage for <shuffled> for at
899 * least the same length as <dsq>.
900 *
901 * <shuffled> may also point to the same storage as <dsq>,
902 * in which case <dsq> is shuffled in place.
903 *
904 * Returns: <eslOK> on success.
905 */
906 int
esl_rsq_XShuffle(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,ESL_DSQ * shuffled)907 esl_rsq_XShuffle(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, ESL_DSQ *shuffled)
908 {
909 int i;
910 ESL_DSQ x;
911
912 if (dsq != shuffled) esl_abc_dsqcpy(dsq, L, shuffled);
913 while (L > 1) {
914 i = 1 + esl_rnd_Roll(r, L);
915 x = shuffled[i];
916 shuffled[i] = shuffled[L];
917 shuffled[L] = x;
918 L--;
919 }
920 return eslOK;
921 }
922
923 /* Function: esl_rsq_XShuffleDP()
924 * Synopsis: Shuffle a digital sequence, preserving diresidue composition.
925 * Incept: SRE, Fri Feb 23 09:23:47 2007 [Casa de Gatos]
926 *
927 * Purpose: Same as <esl_rsq_CShuffleDP()>, except for a digital
928 * sequence <dsq> of length <L>, encoded in a digital alphabet
929 * of <K> residues.
930 *
931 * <dsq> may only consist of residue codes <0..K-1>; if it
932 * contains gaps, degeneracies, or missing data, pass the alphabet's
933 * <Kp> size, not its canonical <K>.
934 *
935 * If <L> $\leq 2$, this is a no-op; <shuffled> is a copy of <dsq>.
936 *
937 * Returns: <eslOK> on success.
938 *
939 * Throws: <eslEINVAL> if <s> contains digital residue codes
940 * outside the range <0..K-1>.
941 * <eslEMEM> on allocation failure.
942 */
943 int
esl_rsq_XShuffleDP(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int K,ESL_DSQ * shuffled)944 esl_rsq_XShuffleDP(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int K, ESL_DSQ *shuffled)
945 {
946 int status; /* Easel return status code */
947 int i; /* a position in dsq or shuffled */
948 ESL_DSQ x,y; /* indices of two characters */
949 ESL_DSQ **E = NULL; /* edge lists: E[0] is the edge list from vertex A */
950 int *nE = NULL; /* lengths of edge lists */
951 int *iE = NULL; /* positions in edge lists */
952 ESL_DSQ *Z = NULL; /* connectivity in last edge graph Z */
953 int n; /* tmp: remaining length of an edge list to be shuffled */
954 ESL_DSQ sf; /* last character in shuffled */
955
956 int keep_connecting; /* flag used in Z connectivity algorithm */
957 int is_eulerian; /* flag used for when we've got a good Z */
958
959 /* First, verify that we can deal with all the residues in dsq. */
960 for (i = 1; i <= L; i++)
961 if (dsq[i] >= K)
962 ESL_EXCEPTION(eslEINVAL, "dsq contains unexpected residue codes");
963
964 /* The edge case of L <= 2 */
965 if (L <= 2)
966 {
967 if (dsq != shuffled) memcpy(shuffled, dsq, sizeof(ESL_DSQ) * (L+2));
968 return eslOK;
969 }
970
971 /* Allocations. */
972 ESL_ALLOC(nE, sizeof(int) * K); for (x = 0; x < K; x++) nE[x] = 0;
973 ESL_ALLOC(E, sizeof(ESL_DSQ *) * K); for (x = 0; x < K; x++) E[x] = NULL;
974 ESL_ALLOC(iE, sizeof(int) * K); for (x = 0; x < K; x++) iE[x] = 0;
975 ESL_ALLOC(Z, sizeof(ESL_DSQ) * K);
976 for (x = 0; x < K; x++)
977 ESL_ALLOC(E[x], sizeof(ESL_DSQ) * (L-1));
978
979 /* "(1) Construct the doublet graph G and edge ordering E... */
980 x = dsq[1];
981 for (i = 2; i <= L; i++) {
982 E[x][nE[x]] = dsq[i];
983 nE[x]++;
984 x = dsq[i];
985 }
986
987 /* Now we have to find a random Eulerian edge ordering. */
988 sf = dsq[L];
989 is_eulerian = 0;
990 while (! is_eulerian)
991 {
992 for (x = 0; x < K; x++) {
993 if (nE[x] == 0 || x == sf) continue;
994 i = esl_rnd_Roll(r, nE[x]);
995 ESL_SWAP(E[x][i], E[x][nE[x]-1], ESL_DSQ);
996 }
997
998 for (x = 0; x < K; x++) Z[x] = 0;
999 Z[(int) sf] = keep_connecting = 1;
1000 while (keep_connecting) {
1001 keep_connecting = 0;
1002 for (x = 0; x < K; x++) {
1003 if (nE[x] == 0) continue;
1004 y = E[x][nE[x]-1]; /* xy is an edge in Z */
1005 if (Z[x] == 0 && Z[y] == 1) { /* x is connected to sf in Z */
1006 Z[x] = 1;
1007 keep_connecting = 1;
1008 }
1009 }
1010 }
1011
1012 is_eulerian = 1;
1013 for (x = 0; x < K; x++) {
1014 if (nE[x] == 0 || x == sf) continue;
1015 if (Z[x] == 0) {
1016 is_eulerian = 0;
1017 break;
1018 }
1019 }
1020 }
1021
1022 /* "(5) For each vertex s in G, randomly permute... */
1023 for (x = 0; x < K; x++)
1024 for (n = nE[x] - 1; n > 1; n--)
1025 {
1026 i = esl_rnd_Roll(r, n);
1027 ESL_SWAP(E[x][i], E[x][n-1], ESL_DSQ);
1028 }
1029
1030 /* "(6) Construct sequence S'... */
1031 i = 1;
1032 x = dsq[1];
1033 while (1) {
1034 shuffled[i++] = x;
1035 y = E[x][iE[x]++];
1036 x = y;
1037 if (iE[x] == nE[x]) break;
1038 }
1039 shuffled[i++] = sf;
1040 shuffled[i] = eslDSQ_SENTINEL;
1041 shuffled[0] = eslDSQ_SENTINEL;
1042
1043 /* Reality checks. */
1044 if (x != sf) ESL_XEXCEPTION(eslEINCONCEIVABLE, "hey, you didn't end on s_f.");
1045 if (i != L+1) ESL_XEXCEPTION(eslEINCONCEIVABLE, "hey, i (%d) overran L+1 (%d).", i, L+1);
1046
1047 esl_arr2_Destroy((void **) E, K);
1048 free(nE);
1049 free(iE);
1050 free(Z);
1051 return eslOK;
1052
1053 ERROR:
1054 esl_arr2_Destroy((void **) E, K);
1055 if (nE != NULL) free(nE);
1056 if (iE != NULL) free(iE);
1057 if (Z != NULL) free(Z);
1058 return status;
1059 }
1060
1061
1062 /* Function: esl_rsq_XShuffleKmers()
1063 * Synopsis: Shuffle k-mers in a digital sequence.
1064 *
1065 * Purpose: Same as <esl_rsq_CShuffleKmers()>, but shuffle digital
1066 * sequence <dsq> of length <L> into digital result <shuffled>.
1067 *
1068 * Args: r - an <ESL_RANDOMNESS> random generator
1069 * dsq - sequence to shuffle
1070 * K - size of k-mers to break <s> into
1071 * shuffled - RESULT: the shuffled sequence
1072 *
1073 * Returns: <eslOK> on success.
1074 *
1075 * Throws: <eslEMEM> on allocation error.
1076 */
1077 int
esl_rsq_XShuffleKmers(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int K,ESL_DSQ * shuffled)1078 esl_rsq_XShuffleKmers(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int K, ESL_DSQ *shuffled)
1079 {
1080 int W = L / K; /* number of kmers "words" excluding leftover prefix */
1081 int P = L % K; /* leftover residues in prefix */
1082 int i;
1083 char *swap = NULL;
1084 int status;
1085
1086 if (shuffled != dsq) esl_abc_dsqcpy(dsq, L, shuffled);
1087 ESL_ALLOC(swap, sizeof(char) * K);
1088 while (W > 1)
1089 { /* use memmove, not memcpy, because i==W-1 is an overlap case */
1090 i = esl_rnd_Roll(r, W); /* pick a word */
1091 memmove(swap, shuffled + P + i*K, K * sizeof(char)); /* copy it to tmp space */
1092 memmove(shuffled + P + i*K, shuffled + P + (W-1)*K, K * sizeof(char)); /* move word W-1 to i */
1093 memmove(shuffled + P + (W-1)*K, swap, K * sizeof(char)); /* move word i to W-1 */
1094 W--;
1095 }
1096 free(swap);
1097 return eslOK;
1098
1099 ERROR:
1100 free(swap);
1101 return status;
1102 }
1103
1104
1105 /* Function: esl_rsq_XReverse()
1106 * Synopsis: Reverse a digital sequence.
1107 * Incept: SRE, Sat Feb 24 10:13:30 2007 [Casa de Gatos]
1108 *
1109 * Purpose: Given a digital sequence <dsq> of length <L>, return
1110 * reversed version of it in <rev>.
1111 *
1112 * Caller provides storage in <rev> for at least
1113 * <(L+2)*sizeof(ESL_DSQ)>.
1114 *
1115 * <s> and <rev> can point to the same storage, in which
1116 * case <s> is reversed in place.
1117 *
1118 * Returns: <eslOK> on success.
1119 */
1120 int
esl_rsq_XReverse(const ESL_DSQ * dsq,int L,ESL_DSQ * rev)1121 esl_rsq_XReverse(const ESL_DSQ *dsq, int L, ESL_DSQ *rev)
1122 {
1123 int i;
1124 ESL_DSQ x;
1125
1126 for (i = 1; i <= L/2; i++)
1127 { /* swap ends */
1128 x = dsq[L-i+1];
1129 rev[L-i+1] = dsq[i];
1130 rev[i] = x;
1131 }
1132 if (L%2) { rev[i] = dsq[i]; } /* don't forget middle residue in odd-length dsq */
1133 rev[0] = eslDSQ_SENTINEL;
1134 rev[L+1] = eslDSQ_SENTINEL;
1135 return eslOK;
1136 }
1137
1138
1139 /* Function: esl_rsq_XShuffleWindows()
1140 * Synopsis: Shuffle local windows of a digital sequence.
1141 * Incept: SRE, Sat Feb 24 10:51:31 2007 [Casa de Gatos]
1142 *
1143 * Purpose: Given a digital sequence <dsq> of length <L>, shuffle
1144 * residues in nonoverlapping windows of width <w>, and put
1145 * the result in <shuffled>. See [Pearson88].
1146 *
1147 * Caller provides storage in <shuffled> for at least
1148 * <(L+2)*sizeof(ESL_DSQ)>.
1149 *
1150 * <dsq> and <shuffled> can be identical to shuffle in place.
1151 *
1152 * Args: dsq - digital sequence to shuffle in windows
1153 * L - length of <dsq>
1154 * w - window size (typically 10 or 20)
1155 * shuffled - allocated space for window-shuffled result.
1156 *
1157 * Return: <eslOK> on success.
1158 */
1159 int
esl_rsq_XShuffleWindows(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int w,ESL_DSQ * shuffled)1160 esl_rsq_XShuffleWindows(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int w, ESL_DSQ *shuffled)
1161 {
1162 ESL_DSQ x;
1163 int i, j, k;
1164
1165 if (dsq != shuffled) esl_abc_dsqcpy(dsq, L, shuffled);
1166 for (i = 1; i <= L; i += w)
1167 for (j = ESL_MIN(L, i+w-1); j > i; j--)
1168 {
1169 k = i + esl_rnd_Roll(r, j-i+1);
1170 x = shuffled[k]; /* semantics of a j,k swap, because we might be shuffling in-place */
1171 shuffled[k] = shuffled[j];
1172 shuffled[j] = x;
1173 }
1174 return eslOK;
1175 }
1176
1177 /*------------------- end, digital shuffling -------------------*/
1178
1179
1180
1181 /*****************************************************************
1182 *# 6. Randomizing sequences (digital mode)
1183 *****************************************************************/
1184
1185 /* Function: esl_rsq_XMarkov0()
1186 * Synopsis: Generate new digital sequence of same 0th order Markov properties.
1187 * Incept: SRE, Sat Feb 24 09:12:32 2007 [Casa de Gatos]
1188 *
1189 * Purpose: Same as <esl_rsq_CMarkov0()>, except for a digital
1190 * sequence <dsq> of length <L>, encoded in a digital
1191 * alphabet of <K> residues; caller provides storage
1192 * for the randomized sequence <markoved> for at least
1193 * <L+2> <ESL_DSQ> residues, including the two flanking
1194 * sentinel bytes.
1195 *
1196 * <dsq> therefore may only consist of residue codes
1197 * in the range <0..K-1>. If it contains gaps,
1198 * degeneracies, or missing data, pass the alphabet's
1199 * <Kp> size, not its canonical <K>.
1200 *
1201 * Returns: <eslOK> on success.
1202 *
1203 * Throws: <eslEINVAL> if <s> contains digital residue codes outside
1204 * the range <0..K-1>.
1205 * <eslEMEM> on allocation failure.
1206 */
1207 int
esl_rsq_XMarkov0(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int K,ESL_DSQ * markoved)1208 esl_rsq_XMarkov0(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int K, ESL_DSQ *markoved)
1209 {
1210 int status;
1211 int i;
1212 double *p = NULL; /* initially counts, then probabilities */
1213 int x;
1214
1215 /* First, verify that the string is entirely alphabetic. */
1216 for (i = 1; i <= L; i++)
1217 if (dsq[i] >= K)
1218 ESL_XEXCEPTION(eslEINVAL, "String contains unexpected residue codes");
1219
1220 ESL_ALLOC(p, sizeof(double) * K);
1221 for (x = 0; x < K; x++) p[x] = 0.;
1222
1223 for (i = 1; i <= L; i++)
1224 p[(int) dsq[i]] += 1.0;
1225 if (L > 0)
1226 for (x = 0; x < K; x++) p[x] /= (double) L;
1227
1228 for (i = 1; i <= L; i++)
1229 markoved[i] = esl_rnd_DChoose(r, p, K);
1230 markoved[0] = eslDSQ_SENTINEL;
1231 markoved[L+1] = eslDSQ_SENTINEL;
1232
1233 free(p);
1234 return eslOK;
1235
1236 ERROR:
1237 if (p != NULL) free(p);
1238 return status;
1239 }
1240
1241
1242
1243 /* Function: esl_rsq_XMarkov1()
1244 * Synopsis: Generate new digital sequence of same 1st order Markov properties.
1245 * Incept: SRE, Sat Feb 24 09:46:09 2007 [Casa de Gatos]
1246 *
1247 * Purpose: Same as <esl_rsq_CMarkov1()>, except for a digital
1248 * sequence <dsq> of length <L>, encoded in a digital
1249 * alphabet of <K> residues. Caller provides storage
1250 * for the randomized sequence <markoved> for at least
1251 * <L+2> <ESL_DSQ> residues, including the two flanking
1252 * sentinel bytes.
1253 *
1254 * <dsq> and <markoved> can be point to the same storage, in which
1255 * case <dsq> is randomized in place, destroying the original
1256 * string.
1257 *
1258 * <dsq> therefore may only consist of residue codes
1259 * in the range <0..K-1>. If it contains gaps,
1260 * degeneracies, or missing data, pass the alphabet's
1261 * <Kp> size, not its canonical <K>.
1262 *
1263 * If <L> $\leq 2$, this is a no-op; <markoved> is a copy of <dsq>.
1264 *
1265 * Args: dsq - input digital sequence 1..L
1266 * L - length of dsq
1267 * K - residue codes in dsq are in range 0..K-1
1268 * markoved - new randomly generated digital sequence;
1269 * storage allocated by caller, at least (L+2)*ESL_DSQ;
1270 * may be same as dsq to randomize in place.
1271 *
1272 * Returns: <eslOK> on success.
1273 *
1274 * Throws: <eslEINVAL> if <s> contains digital residue codes outside
1275 * the range <0..K-1>.
1276 * <eslEMEM> on allocation failure.
1277 */
1278 int
esl_rsq_XMarkov1(ESL_RANDOMNESS * r,const ESL_DSQ * dsq,int L,int K,ESL_DSQ * markoved)1279 esl_rsq_XMarkov1(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, int K, ESL_DSQ *markoved)
1280 {
1281 double **p = NULL; /* conditional probabilities p[x][y] = P(y | x) */
1282 double *p0 = NULL; /* marginal probabilities P(x), just for initial residue. */
1283 int i;
1284 ESL_DSQ x,y;
1285 ESL_DSQ i0; /* initial symbol */
1286 int status;
1287
1288 /* validate the input string */
1289 for (i = 1; i <= L; i++)
1290 if (dsq[i] >= K)
1291 ESL_XEXCEPTION(eslEINVAL, "String contains unexpected residue codes");
1292
1293 /* The edge case of L <= 2 */
1294 if (L <= 2)
1295 {
1296 if (dsq != markoved) memcpy(markoved, dsq, sizeof(ESL_DSQ) * (L+2));
1297 return eslOK;
1298 }
1299
1300 /* allocations */
1301 ESL_ALLOC(p0, sizeof(double) * K); for (x = 0; x < K; x++) p0[x] = 0.;
1302 ESL_ALLOC(p, sizeof(double *) * K); for (x = 0; x < K; x++) p[x] = NULL;
1303 for (x = 0; x < K; x++)
1304 { ESL_ALLOC(p[x], sizeof(double) * K); for (y = 0; y < K; y++) p[x][y] = 0.; }
1305
1306 /* Collect first order counts and convert to frequencies. */
1307 i0 = x = dsq[1];
1308 for (i = 2; i <= L; i++)
1309 {
1310 y = dsq[i];
1311 p[x][y] += 1.0;
1312 x = y;
1313 }
1314 p[x][i0] += 1.0; /* "circularized": avoids a bug; see markov1_bug utest */
1315
1316 for (x = 0; x < K; x++)
1317 {
1318 p0[x] = 0.;
1319 for (y = 0; y < K; y++)
1320 p0[x] += p[x][y]; /* now p0[x] = marginal counts of x, inclusive of 1st residue */
1321
1322 for (y = 0; y < K; y++)
1323 p[x][y] = (p0[x] > 0. ? p[x][y] / p0[x] : 0.); /* now p[x][y] = P(y | x) */
1324
1325 p0[x] /= (double) L; /* now p0[x] = marginal P(x) inclusive of 1st residue */
1326 }
1327
1328 /* Generate a random string using those p's. */
1329 markoved[1] = esl_rnd_DChoose(r, p0, K);
1330 for (i = 2; i <= L; i++)
1331 markoved[i] = esl_rnd_DChoose(r, p[markoved[i-1]], K);
1332
1333 markoved[0] = eslDSQ_SENTINEL;
1334 markoved[L+1] = eslDSQ_SENTINEL;
1335
1336 esl_arr2_Destroy((void**)p, K);
1337 free(p0);
1338 return eslOK;
1339
1340 ERROR:
1341 esl_arr2_Destroy((void**)p, K);
1342 if (p0 != NULL) free(p0);
1343 return status;
1344 }
1345
1346 /*------------------ end, digital randomizing -------------------*/
1347
1348 /*****************************************************************
1349 * 7. Statistics driver.
1350 *****************************************************************/
1351
1352 /* This driver tests (and confirms) the intuition that using
1353 * a DP shuffle on short sequences may be a bad idea; short sequences
1354 * don't shuffle effectively.
1355 * xref J3/20.
1356 */
1357 #ifdef eslRANDOMSEQ_STATS
1358 /* gcc -g -Wall -o randomseq_stats -L. -I. -DeslRANDOMSEQ_STATS esl_randomseq.c -leasel -lm
1359 */
1360 #include "esl_config.h"
1361
1362 #include <stdio.h>
1363 #include <string.h>
1364
1365 #include "easel.h"
1366 #include "esl_alphabet.h"
1367 #include "esl_distance.h"
1368 #include "esl_getopts.h"
1369 #include "esl_random.h"
1370 #include "esl_randomseq.h"
1371 #include "esl_stats.h"
1372 #include "esl_vectorops.h"
1373
1374 static ESL_OPTIONS options[] = {
1375 /* name type default env range toggles reqs incomp help docgroup*/
1376 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1377 { "-d", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "diresidue shuffle", 0 },
1378 { "-R", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "reverse the sequence", 0 },
1379 { "-2", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "resample an independent sequence", 0 },
1380 { "-N", eslARG_INT, "10000", NULL, NULL, NULL, NULL, NULL, "number of sampled sequences per length", 0 },
1381 { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
1382 { "--minL", eslARG_INT, "5", NULL, NULL, NULL, NULL, NULL, "xaxis minimum L", 0 },
1383 { "--maxL", eslARG_INT, "200", NULL, NULL, NULL, NULL, NULL, "xaxis maximum L", 0 },
1384 { "--stepL", eslARG_INT, "5", NULL, NULL, NULL, NULL, NULL, "xaxis step size", 0 },
1385 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1386 };
1387 static char usage[] = "[-options]";
1388 static char banner[] = "stats driver for randomseq module";
1389
1390 int
main(int argc,char ** argv)1391 main(int argc, char **argv)
1392 {
1393 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1394 ESL_RANDOMNESS *r = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
1395 ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
1396 int N = esl_opt_GetInteger(go, "-N");
1397 int minL = esl_opt_GetInteger(go, "--minL");
1398 int maxL = esl_opt_GetInteger(go, "--maxL");
1399 int stepL = esl_opt_GetInteger(go, "--stepL");
1400 ESL_DSQ *dsq1 = malloc(sizeof(ESL_DSQ) * (maxL+2));
1401 ESL_DSQ *dsq2 = malloc(sizeof(ESL_DSQ) * (maxL+2));
1402 double *fq = malloc(sizeof(double) * abc->K);
1403 double *pid = malloc(sizeof(double) * N);
1404 double mean, var;
1405 int L;
1406 int i;
1407
1408 esl_vec_DSet(fq, abc->K, 1.0 / (double) abc->K );
1409
1410 for (L = minL; L <= maxL; L += stepL)
1411 {
1412 for (i = 0; i < N; i++)
1413 {
1414 esl_rsq_xIID(r, fq, abc->K, L, dsq1);
1415
1416 if (esl_opt_GetBoolean(go, "-d")) esl_rsq_XShuffleDP(r, dsq1, L, abc->K, dsq2);
1417 else if (esl_opt_GetBoolean(go, "-R")) esl_rsq_XReverse(dsq1, L, dsq2);
1418 else if (esl_opt_GetBoolean(go, "-2")) esl_rsq_xIID(r, fq, abc->K, L, dsq2);
1419 else esl_rsq_XShuffle(r, dsq1, L, dsq2);
1420
1421 esl_dst_XPairId(abc, dsq1, dsq2, &(pid[i]), NULL, NULL);
1422 }
1423
1424 esl_stats_DMean(pid, N, &mean, &var);
1425 printf("%-6d %.4f %.4f\n", L, mean, sqrt(var));
1426 }
1427 printf("&\n");
1428
1429 free(pid);
1430 free(fq);
1431 free(dsq2);
1432 free(dsq1);
1433 esl_alphabet_Destroy(abc);
1434 esl_randomness_Destroy(r);
1435 esl_getopts_Destroy(go);
1436 return 0;
1437 }
1438 #endif /*eslRANDOMSEQ_STATS*/
1439 /*-------------- end, statistics driver -------------------------*/
1440
1441
1442
1443
1444
1445 /*****************************************************************
1446 * 8. Unit tests.
1447 *****************************************************************/
1448 #ifdef eslRANDOMSEQ_TESTDRIVE
1449 #include "esl_dirichlet.h"
1450 #include "esl_vectorops.h"
1451
1452 /* count c(x) monoresidue and c(xy) diresidue composition
1453 * used for sequence shuffling unit tests
1454 * mono, di allocated by caller for 26 and 26x26, respectively.
1455 */
1456 static int
composition(char * s,int L,int * mono,int ** di)1457 composition(char *s, int L, int *mono, int **di)
1458 {
1459 int i, x, y;
1460
1461 for (x = 0; x < 26; x++) {
1462 mono[x] = 0;
1463 for (y = 0; y < 26; y++)
1464 di[x][y] = 0;
1465 }
1466
1467 for (i = 0; s[i] != '\0'; i++) {
1468 if (!isalpha(s[i])) esl_fatal("bad residue %d", i);
1469 y = toupper(s[i]) - 'A';
1470 mono[y]++;
1471 if (i > 0) {
1472 x = toupper(s[i-1] - 'A');
1473 di[x][y]++;
1474 }
1475 }
1476 if (i != L) esl_fatal("sequence length didn't match expected %d", L);
1477 return eslOK;
1478 }
1479
1480 /* same, but for digital seq., with alphabet size K */
1481 static int
xcomposition(ESL_DSQ * dsq,int L,int K,int * mono,int ** di)1482 xcomposition(ESL_DSQ *dsq, int L, int K, int *mono, int **di)
1483 {
1484 int i, x, y;
1485
1486 for (x = 0; x < K; x++) {
1487 mono[x] = 0;
1488 for (y = 0; y < K; y++)
1489 di[x][y] = 0;
1490 }
1491
1492 for (i = 1; dsq[i] != eslDSQ_SENTINEL; i++) {
1493 if (dsq[i] > K) esl_fatal("bad residue %d", i);
1494 if (i > 1) di[(int) dsq[i-1]][(int) dsq[i]]++;
1495 mono[(int) dsq[i]]++;
1496 }
1497 if (i != L+1) esl_fatal("sequence length didn't match expected %d", L);
1498 return eslOK;
1499 }
1500
1501 static int
composition_allocate(int K,int ** ret_mono,int *** ret_di)1502 composition_allocate(int K, int **ret_mono, int ***ret_di)
1503 {
1504 int status;
1505 int *mono = NULL;
1506 int **di = NULL;
1507 int x;
1508
1509 ESL_ALLOC(mono, sizeof(int) * K);
1510 ESL_ALLOC(di, sizeof(int *) * K); for (x = 0; x < K; x++) di[x] = NULL;
1511 for (x = 0; x < K; x++)
1512 ESL_ALLOC(di[x], sizeof(int) * K);
1513 *ret_mono = mono;
1514 *ret_di = di;
1515 return eslOK;
1516
1517 ERROR:
1518 esl_arr2_Destroy((void **) di, K);
1519 if (mono != NULL) free(mono);
1520 *ret_mono = NULL;
1521 *ret_di = NULL;
1522 return status;
1523 }
1524
1525 /* compare compositions before/after.
1526 * either mono (m1,m2) or di (d1,d2) may be NULL, to compare only the other one */
1527 static int
composition_compare(int * m1,int ** di1,int * m2,int ** di2,int K)1528 composition_compare(int *m1, int **di1, int *m2, int **di2, int K)
1529 {
1530 int x,y;
1531
1532 for (x = 0; x < K; x++) {
1533 if (m1 != NULL && m1[x] != m2[x]) return eslFAIL;
1534 if (di1 != NULL)
1535 for (y = 0; y < K; y++)
1536 if (di1[x][y] != di2[x][y]) return eslFAIL;
1537 }
1538 return eslOK;
1539 }
1540
1541 /* Unit tests for:
1542 * esl_rsq_CShuffle()
1543 * esl_rsq_CShuffleDP()
1544 * esl_rsq_CShuffleWindows()
1545 * esl_rsq_CReverse()
1546 *
1547 * All of these exactly preserve residue composition, which is
1548 * the basis of the unit tests.
1549 */
1550 static void
utest_CShufflers(ESL_RANDOMNESS * r,int L,char * alphabet,int K)1551 utest_CShufflers(ESL_RANDOMNESS *r, int L, char *alphabet, int K)
1552 {
1553 char *logmsg = "Failure in one of the CShuffle* unit tests";
1554 int status;
1555 char *s = NULL;
1556 char *s2 = NULL;
1557 int *m1 = NULL,
1558 *m2 = NULL; /* mono, before and after */
1559 int **di1 = NULL,
1560 **di2 = NULL; /* di, before and after */
1561 double *p;
1562 int w = 12; /* window width for CShuffleWindows() */
1563
1564 /* allocations */
1565 ESL_ALLOC(s, sizeof(char) * (L+1));
1566 ESL_ALLOC(s2, sizeof(char) * (L+1));
1567 ESL_ALLOC(p, sizeof(double) * K);
1568 if (composition_allocate(26, &m1, &di1) != eslOK) esl_fatal(logmsg);
1569 if (composition_allocate(26, &m2, &di2) != eslOK) esl_fatal(logmsg);
1570
1571 /* generate the string we'll start shuffling */
1572 if (esl_dirichlet_DSampleUniform(r, K, p) != eslOK) esl_fatal(logmsg);
1573 if (esl_rsq_IID(r, alphabet, p, K, L, s) != eslOK) esl_fatal(logmsg);
1574
1575 /* esl_rsq_CShuffle: mono composition should stay exactly the same, di may change */
1576 memset(s2, 0, (L+1)*sizeof(char));
1577 if (composition(s, L, m1, di1) != eslOK) esl_fatal(logmsg);
1578 if (esl_rsq_CShuffle(r, s, s2) != eslOK) esl_fatal(logmsg);
1579 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1580 if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1581 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1582
1583 /* esl_rsq_CShuffle, in place */
1584 strcpy(s, s2);
1585 if (composition(s2, L, m1, di1) != eslOK) esl_fatal(logmsg);
1586 if (esl_rsq_CShuffle(r, s2, s2) != eslOK) esl_fatal(logmsg);
1587 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1588 if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1589 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1590
1591 /* esl_rsq_CShuffleDP: mono and di compositions stay exactly the same */
1592 memset(s2, 0, (L+1)*sizeof(char));
1593 if (composition(s, L, m1, di1) != eslOK) esl_fatal(logmsg);
1594 if (esl_rsq_CShuffleDP(r, s, s2) != eslOK) esl_fatal(logmsg);
1595 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1596 if (composition_compare(m1, di1, m2, di2, 26) != eslOK) esl_fatal(logmsg);
1597 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1598
1599 /* esl_rsq_CShuffleDP, in place */
1600 strcpy(s, s2);
1601 if (composition(s2, L, m1, di1) != eslOK) esl_fatal(logmsg);
1602 if (esl_rsq_CShuffleDP(r, s2, s2) != eslOK) esl_fatal(logmsg);
1603 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1604 if (composition_compare(m1, di1, m2, di2, 26) != eslOK) esl_fatal(logmsg);
1605 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1606
1607 /* esl_rsq_CShuffleKmers: mono composition stays the same */
1608 memset(s2, 0, (L+1)*sizeof(char));
1609 if (composition(s, L, m1, di1) != eslOK) esl_fatal(logmsg);
1610 if (esl_rsq_CShuffleKmers(r, s, 3, s2) != eslOK) esl_fatal(logmsg);
1611 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1612 if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1613 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1614
1615 /* esl_rsq_CShuffleKmers, in place */
1616 strcpy(s, s2);
1617 if (composition(s2, L, m1, di1) != eslOK) esl_fatal(logmsg);
1618 if (esl_rsq_CShuffleKmers(r, s2, 3, s2) != eslOK) esl_fatal(logmsg);
1619 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1620 if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1621 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1622
1623 /* esl_rsq_CShuffleWindows(): mono composition stays the same */
1624 memset(s2, 0, (L+1)*sizeof(char));
1625 if (composition(s, L, m1, di1) != eslOK) esl_fatal(logmsg);
1626 if (esl_rsq_CShuffleWindows(r, s, w, s2) != eslOK) esl_fatal(logmsg);
1627 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1628 if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1629 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1630
1631 /* esl_rsq_CShuffleWindows(), in place */
1632 strcpy(s, s2);
1633 if (composition(s2, L, m1, di1) != eslOK) esl_fatal(logmsg);
1634 if (esl_rsq_CShuffleWindows(r, s2, w, s2) != eslOK) esl_fatal(logmsg);
1635 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1636 if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1637 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1638
1639 /* esl_rsq_CReverse(): two reverses (one in place) give the same seq back */
1640 memset(s2, 0, (L+1)*sizeof(char));
1641 if (composition(s, L, m1, di1) != eslOK) esl_fatal(logmsg);
1642 if (esl_rsq_CReverse(s, s2) != eslOK) esl_fatal(logmsg);
1643 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1644 if (composition_compare(m1, NULL, m2, NULL, 26) != eslOK) esl_fatal(logmsg);
1645 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1646 if (esl_rsq_CReverse(s2, s2) != eslOK) esl_fatal(logmsg);
1647 if (strcmp(s2, s) != 0) esl_fatal(logmsg);
1648
1649 free(s);
1650 free(s2);
1651 free(p);
1652 free(m1);
1653 free(m2);
1654 esl_arr2_Destroy((void **) di1, 26);
1655 esl_arr2_Destroy((void **) di2, 26);
1656 return;
1657
1658 ERROR:
1659 esl_fatal(logmsg);
1660 }
1661
1662 /* Unit tests for:
1663 * esl_rsq_CMarkov0()
1664 * esl_rsq_CMarkov1()
1665 *
1666 * Testing these is less robust than the shufflers, because it's hard
1667 * to concoct deterministic tests. Instead the test is a weak one,
1668 * that zero probability events get zero counts.
1669 */
1670 static void
utest_CMarkovs(ESL_RANDOMNESS * r,int L,char * alphabet)1671 utest_CMarkovs(ESL_RANDOMNESS *r, int L, char *alphabet)
1672 {
1673 char *logmsg = "Failure in a CMarkov*() unit test";
1674 int status;
1675 char *s = NULL;
1676 char *s2 = NULL;
1677 float *p = NULL;
1678 int K;
1679 int pzero; /* which 0..K-1 residue will have zero prob */
1680 int zeroidx; /* index of pzero residue in 0..25 ASCII */
1681 int *m1 = NULL,
1682 *m2 = NULL; /* mono, before and after */
1683 int **di1 = NULL,
1684 **di2 = NULL; /* di, before and after */
1685 int i,x;
1686
1687 K = strlen(alphabet);
1688 ESL_ALLOC(p, sizeof(float) * K);
1689 ESL_ALLOC(s, sizeof(char) * (L+1));
1690 ESL_ALLOC(s2, sizeof(char) * (L+1));
1691 if (composition_allocate(26, &m1, &di1) != eslOK) esl_fatal(logmsg);
1692 if (composition_allocate(26, &m2, &di2) != eslOK) esl_fatal(logmsg);
1693
1694 /* generate string with a random letter prob set to 0 */
1695 pzero = esl_rnd_Roll(r, K);
1696 zeroidx = toupper(alphabet[pzero]) - 'A';
1697 if (esl_dirichlet_FSampleUniform(r, K, p) != eslOK) esl_fatal(logmsg);
1698 p[pzero] = 0;
1699 esl_vec_FNorm(p, K);
1700 if (esl_rsq_fIID(r, alphabet, p, K, L, s) != eslOK) esl_fatal(logmsg);
1701
1702 /* esl_rsq_CMarkov0() */
1703 memset(s2, 0, (L+1)*sizeof(char));
1704 if (composition(s, L, m1, di1) != eslOK) esl_fatal(logmsg);
1705 if (esl_rsq_CMarkov0(r, s, s2) != eslOK) esl_fatal(logmsg);
1706 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1707 if (m1[zeroidx] != 0) esl_fatal(logmsg);
1708 if (m2[zeroidx] != 0) esl_fatal(logmsg);
1709 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1710
1711 /* esl_rsq_CMarkov0(), in place */
1712 strcpy(s, s2);
1713 if (esl_rsq_CMarkov0(r, s2, s2) != eslOK) esl_fatal(logmsg);
1714 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1715 if (m2[zeroidx] != 0) esl_fatal(logmsg);
1716 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1717
1718 /* generate string with all homodiresidues set to 0 */
1719 if (esl_dirichlet_FSampleUniform(r, K, p) != eslOK) esl_fatal(logmsg);
1720 do {
1721 if (esl_rsq_fIID(r, alphabet, p, K, L, s) != eslOK) esl_fatal(logmsg);
1722 for (i = 1; i < L; i++)
1723 if (s[i] == s[i-1]) /* this incantation will rotate letter forward in alphabet: */
1724 s[i] = alphabet[(1+strchr(alphabet,s[i])-alphabet)%K];
1725 } while (s[0] == s[L-1]); /* lazy: reject strings where circularization would count a homodimer */
1726
1727 /* esl_rsq_CMarkov1() */
1728 memset(s2, 0, (L+1)*sizeof(char));
1729 if (composition(s, L, m1, di1) != eslOK) esl_fatal(logmsg);
1730 if (esl_rsq_CMarkov1(r, s, s2) != eslOK) esl_fatal(logmsg);
1731 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1732 for (x = 0; x < K; x++) {
1733 if (di1[x][x] != 0) esl_fatal(logmsg);
1734 if (di2[x][x] != 0) esl_fatal(logmsg);
1735 }
1736 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1737
1738 /* esl_rsq_CMarkov1(), in place */
1739 strcpy(s, s2);
1740 if (esl_rsq_CMarkov1(r, s2, s2) != eslOK) esl_fatal(logmsg);
1741 if (composition(s2, L, m2, di2) != eslOK) esl_fatal(logmsg);
1742 for (x = 0; x < K; x++) {
1743 if (di1[x][x] != 0) esl_fatal(logmsg);
1744 if (di2[x][x] != 0) esl_fatal(logmsg);
1745 }
1746 if (strcmp(s2, s) == 0) esl_fatal(logmsg);
1747
1748 free(s);
1749 free(s2);
1750 free(p);
1751 free(m1);
1752 free(m2);
1753 esl_arr2_Destroy((void **) di1, 26);
1754 esl_arr2_Destroy((void **) di2, 26);
1755 return;
1756
1757 ERROR:
1758 esl_fatal(logmsg);
1759 }
1760
1761
1762 /* Unit tests for:
1763 * esl_rsq_XShuffle()
1764 * esl_rsq_XShuffleDP()
1765 * esl_rsq_XShuffleWindows()
1766 * esl_rsq_XReverse()
1767 * Same ideas as testing the C* versions, adapted for digital sequences.
1768 */
1769 static void
utest_XShufflers(ESL_RANDOMNESS * r,int L,int K)1770 utest_XShufflers(ESL_RANDOMNESS *r, int L, int K)
1771 {
1772 char *logmsg = "Failure in one of the XShuffle* unit tests";
1773 int status;
1774 ESL_DSQ *dsq = NULL;
1775 ESL_DSQ *ds2 = NULL;
1776 int *m1 = NULL,
1777 *m2 = NULL; /* mono, before and after */
1778 int **di1 = NULL,
1779 **di2 = NULL; /* di, before and after */
1780 float *p = NULL;
1781 int w = 12; /* window width for XShuffleWindows() */
1782
1783 /* allocations */
1784 ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
1785 ESL_ALLOC(ds2, sizeof(ESL_DSQ) * (L+2));
1786 ESL_ALLOC(p, sizeof(float) * K);
1787 if (composition_allocate(K, &m1, &di1) != eslOK) esl_fatal(logmsg);
1788 if (composition_allocate(K, &m2, &di2) != eslOK) esl_fatal(logmsg);
1789
1790 /* generate the string we'll test shuffling on, keep its composition stats */
1791 if (esl_dirichlet_FSampleUniform(r, K, p) != eslOK) esl_fatal(logmsg);
1792 if (esl_rsq_xfIID(r, p, K, L, dsq) != eslOK) esl_fatal(logmsg);
1793
1794 /* esl_rsq_XShuffle: mono composition should stay exactly the same, di may change */
1795 memset(ds2, eslDSQ_SENTINEL, (L+2));
1796 if (xcomposition(dsq, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1797 if (esl_rsq_XShuffle(r, dsq, L, ds2) != eslOK) esl_fatal(logmsg);
1798 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1799 if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1800
1801 /* esl_rsq_XShuffle, in place */
1802 if (esl_abc_dsqcpy(ds2, L, dsq) != eslOK) esl_fatal(logmsg);
1803 if (xcomposition(ds2, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1804 if (esl_rsq_XShuffle(r, ds2, L, ds2) != eslOK) esl_fatal(logmsg);
1805 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1806 if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1807
1808 /* esl_rsq_XShuffleDP: mono and di compositions stay exactly the same */
1809 memset(ds2, eslDSQ_SENTINEL, (L+2));
1810 if (xcomposition(dsq, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1811 if (esl_rsq_XShuffleDP(r, dsq, L, K, ds2) != eslOK) esl_fatal(logmsg);
1812 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1813 if (composition_compare(m1, di1, m2, di2, K) != eslOK) esl_fatal(logmsg);
1814
1815 /* esl_rsq_XShuffleDP, in place */
1816 if (esl_abc_dsqcpy(ds2, L, dsq) != eslOK) esl_fatal(logmsg);
1817 if (xcomposition(ds2, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1818 if (esl_rsq_XShuffleDP(r, ds2, L, K, ds2) != eslOK) esl_fatal(logmsg);
1819 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1820 if (composition_compare(m1, di1, m2, di2, K) != eslOK) esl_fatal(logmsg);
1821
1822 /* esl_rsq_XShuffleKmers: mono compositions stay exactly the same */
1823 memset(ds2, eslDSQ_SENTINEL, (L+2));
1824 if (xcomposition(dsq, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1825 if (esl_rsq_XShuffleKmers(r, dsq, L, 3, ds2) != eslOK) esl_fatal(logmsg);
1826 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1827 if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1828
1829 /* esl_rsq_XShuffleKmers, in place */
1830 if (esl_abc_dsqcpy(ds2, L, dsq) != eslOK) esl_fatal(logmsg);
1831 if (xcomposition(ds2, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1832 if (esl_rsq_XShuffleKmers(r, ds2, L, 3, ds2) != eslOK) esl_fatal(logmsg);
1833 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1834 if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1835
1836 /* esl_rsq_XShuffleWindows(): mono composition stays the same */
1837 memset(ds2, eslDSQ_SENTINEL, (L+2));
1838 if (xcomposition(dsq, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1839 if (esl_rsq_XShuffleWindows(r, dsq, L, w, ds2) != eslOK) esl_fatal(logmsg);
1840 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1841 if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1842
1843 /* esl_rsq_XShuffleWindows(), in place */
1844 if (esl_abc_dsqcpy(ds2, L, dsq) != eslOK) esl_fatal(logmsg);
1845 if (xcomposition(ds2, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1846 if (esl_rsq_XShuffleWindows(r, ds2, L, w, ds2) != eslOK) esl_fatal(logmsg);
1847 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1848 if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1849
1850 /* esl_rsq_XReverse(): two reverses (one in place) give the same seq back */
1851 memset(ds2, eslDSQ_SENTINEL, (L+2));
1852 if (xcomposition(dsq, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1853 if (esl_rsq_XReverse(dsq, L, ds2) != eslOK) esl_fatal(logmsg);
1854 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1855 if (composition_compare(m1, NULL, m2, NULL, K) != eslOK) esl_fatal(logmsg);
1856 if (memcmp((void *) ds2, (void *) dsq, sizeof(ESL_DSQ)*(L+2)) == 0) esl_fatal(logmsg);
1857 if (esl_rsq_XReverse(ds2, L, ds2) != eslOK) esl_fatal(logmsg);
1858 if (memcmp((void *) ds2, (void *) dsq, sizeof(ESL_DSQ)*(L+2)) != 0) esl_fatal(logmsg);
1859
1860 free(dsq);
1861 free(ds2);
1862 free(p);
1863 free(m1);
1864 free(m2);
1865 esl_arr2_Destroy((void **) di1, K);
1866 esl_arr2_Destroy((void **) di2, K);
1867 return;
1868
1869 ERROR:
1870 esl_fatal(logmsg);
1871 }
1872
1873 /* Unit tests for:
1874 * esl_rsq_XMarkov0()
1875 * esl_rsq_XMarkov1()
1876 * Same ideas as in the C* versions, but for digital sequences.
1877 */
1878 static void
utest_XMarkovs(ESL_RANDOMNESS * r,int L,int K)1879 utest_XMarkovs(ESL_RANDOMNESS *r, int L, int K)
1880 {
1881 char *logmsg = "Failure in an XMarkov*() unit test";
1882 int status;
1883 ESL_DSQ *dsq = NULL;
1884 ESL_DSQ *ds2 = NULL;
1885 int *m1 = NULL,
1886 *m2 = NULL; /* mono, before and after */
1887 int **di1 = NULL,
1888 **di2 = NULL; /* di, before and after */
1889 float *p = NULL;
1890 int pzero;
1891 int i,x;
1892
1893 /* allocations */
1894 ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
1895 ESL_ALLOC(ds2, sizeof(ESL_DSQ) * (L+2));
1896 ESL_ALLOC(p, sizeof(float) * K);
1897 if (composition_allocate(K, &m1, &di1) != eslOK) esl_fatal(logmsg);
1898 if (composition_allocate(K, &m2, &di2) != eslOK) esl_fatal(logmsg);
1899
1900 /* generate sequence with a random letter prob set to 0 */
1901 pzero = esl_rnd_Roll(r, K);
1902 if (esl_dirichlet_FSampleUniform(r, K, p) != eslOK) esl_fatal(logmsg);
1903 p[pzero] = 0.;
1904 esl_vec_FNorm(p, K);
1905 if (esl_rsq_xfIID(r, p, K, L, dsq) != eslOK) esl_fatal(logmsg);
1906
1907 /* esl_rsq_XMarkov0() */
1908 memset(ds2, eslDSQ_SENTINEL, (L+2)*sizeof(ESL_DSQ));
1909 if (xcomposition(dsq, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1910 if (esl_rsq_XMarkov0(r, dsq, L, K, ds2) != eslOK) esl_fatal(logmsg);
1911 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1912 if (m1[pzero] != 0) esl_fatal(logmsg);
1913 if (m2[pzero] != 0) esl_fatal(logmsg);
1914 if (memcmp(ds2, dsq, sizeof(ESL_DSQ)*(L+2)) == 0) esl_fatal(logmsg);
1915
1916 /* esl_rsq_CMarkov0(), in place */
1917 if (esl_abc_dsqcpy(ds2, L, dsq) != eslOK) esl_fatal(logmsg);
1918 if (esl_rsq_XMarkov0(r, ds2, L, K, ds2) != eslOK) esl_fatal(logmsg);
1919 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1920 if (m2[pzero] != 0) esl_fatal(logmsg);
1921 if (memcmp(ds2, dsq, sizeof(ESL_DSQ)*(L+2)) == 0) esl_fatal(logmsg);
1922
1923 /* generate string with all homodiresidues set to 0 */
1924 if (esl_dirichlet_FSampleUniform(r, K, p) != eslOK) esl_fatal(logmsg);
1925 do {
1926 if (esl_rsq_xfIID(r, p, K, L, dsq) != eslOK) esl_fatal(logmsg);
1927 for (i = 2; i <= L; i++)
1928 if (dsq[i] == dsq[i-1]) /* this incantation will rotate letter forward in alphabet: */
1929 dsq[i] = (dsq[i]+1)%K;
1930 } while (dsq[1] == dsq[L]); /* lazy. reject strings where circularization would count a homodimer */
1931
1932 /* esl_rsq_XMarkov1() */
1933 memset(ds2, eslDSQ_SENTINEL, (L+2)*sizeof(ESL_DSQ));
1934 if (xcomposition(dsq, L, K, m1, di1) != eslOK) esl_fatal(logmsg);
1935 if (esl_rsq_XMarkov1(r, dsq, L, K, ds2) != eslOK) esl_fatal(logmsg);
1936 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1937 for (x = 0; x < K; x++) {
1938 if (di1[x][x] != 0) esl_fatal(logmsg);
1939 if (di2[x][x] != 0) esl_fatal(logmsg);
1940 }
1941 if (memcmp(ds2, dsq, sizeof(ESL_DSQ)*(L+2)) == 0) esl_fatal(logmsg);
1942
1943 /* esl_rsq_XMarkov1(), in place */
1944 if (esl_abc_dsqcpy(ds2, L, dsq) != eslOK) esl_fatal(logmsg);
1945 if (esl_rsq_XMarkov1(r, ds2, L, K, ds2) != eslOK) esl_fatal(logmsg);
1946 if (xcomposition(ds2, L, K, m2, di2) != eslOK) esl_fatal(logmsg);
1947 for (x = 0; x < K; x++) {
1948 if (di1[x][x] != 0) esl_fatal(logmsg);
1949 if (di2[x][x] != 0) esl_fatal(logmsg);
1950 }
1951 if (memcmp(ds2, dsq, sizeof(ESL_DSQ)*(L+2)) == 0) esl_fatal(logmsg);
1952
1953 free(dsq);
1954 free(ds2);
1955 free(p);
1956 free(m1);
1957 free(m2);
1958 esl_arr2_Destroy((void **) di1, K);
1959 esl_arr2_Destroy((void **) di2, K);
1960 return;
1961
1962 ERROR:
1963 esl_fatal(logmsg);
1964 }
1965
1966 /* utest_markov1_bug()
1967 *
1968 * Given a sequence like AAAAAAAAAT, where a residue only occurs once
1969 * and at the end of the sequence, a bug can appear: a Markov chain
1970 * can transit to T, but can't leave. Easel handles this by
1971 * counting Markov statistics as if the input sequence were circular.
1972 */
1973 static void
utest_markov1_bug(ESL_RANDOMNESS * r)1974 utest_markov1_bug(ESL_RANDOMNESS *r)
1975 {
1976 char logmsg[] = "Failure in markov1_bug test (zero/absorbing transition)";
1977 char testseq[] = "AAAAAAAAAT";
1978 char *seq = NULL;
1979 ESL_DSQ testdsq[] = { eslDSQ_SENTINEL,0,0,0,0,0,0,0,0,0,3,eslDSQ_SENTINEL};
1980 ESL_DSQ *dsq = NULL;
1981 int L = strlen(testseq);
1982 int *mono = NULL;
1983 int **di = NULL;
1984 int N = 100;
1985 int i;
1986
1987 if ((seq = malloc(sizeof(char) * (L+1))) == NULL) esl_fatal(logmsg);
1988 if ((dsq = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL) esl_fatal(logmsg);
1989
1990 if (composition_allocate(4, &mono, &di) != eslOK) esl_fatal(logmsg);
1991 for (i = 0; i < N; i++) {
1992 if (esl_rsq_XMarkov1(r, testdsq, L, 4, dsq) != eslOK) esl_fatal(logmsg);
1993 if (xcomposition(testdsq, L, 4, mono, di) != eslOK) esl_fatal(logmsg);
1994 if (mono[0] + mono[3] != L) esl_fatal(logmsg);
1995 }
1996 esl_arr2_Destroy((void **) di, 4);
1997 free(mono);
1998
1999 if (composition_allocate(26, &mono, &di) != eslOK) esl_fatal(logmsg);
2000 for (i = 0; i < N; i++) {
2001 if (esl_rsq_CMarkov1(r, testseq, seq) != eslOK) esl_fatal(logmsg);
2002 if (composition(seq, L, mono, di) != eslOK) esl_fatal(logmsg);
2003 if (mono[0] + mono['T'-'A'] != L) esl_fatal(logmsg);
2004 }
2005 esl_arr2_Destroy((void **) di, 26);
2006 free(mono);
2007 free(seq);
2008 free(dsq);
2009 }
2010
2011 #endif /*eslRANDOMSEQ_TESTDRIVE*/
2012 /*------------------ end, unit tests ----------------------------*/
2013
2014 /*****************************************************************
2015 * 9. Test driver.
2016 *****************************************************************/
2017 #ifdef eslRANDOMSEQ_TESTDRIVE
2018 /* gcc -g -Wall -o randomseq_utest -L. -I. -DeslRANDOMSEQ_TESTDRIVE esl_randomseq.c -leasel -lm
2019 */
2020
2021 #include "esl_config.h"
2022
2023 #include <stdio.h>
2024 #include <string.h>
2025
2026 #include "easel.h"
2027 #include "esl_getopts.h"
2028 #include "esl_random.h"
2029 #include "esl_randomseq.h"
2030
2031 static ESL_OPTIONS options[] = {
2032 /* name type default env range toggles reqs incomp help docgroup*/
2033 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
2034 { "-L", eslARG_INT, "1000", NULL, NULL, NULL, NULL, NULL, "length of random sequences", 0 },
2035 { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
2036 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
2037 };
2038 static char usage[] = "[-options]";
2039 static char banner[] = "test driver for randomseq module";
2040
2041 int
main(int argc,char ** argv)2042 main(int argc, char **argv)
2043 {
2044 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2045 ESL_RANDOMNESS *r = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
2046 char *alphabet = "ACGT";
2047 int K = strlen(alphabet);
2048 int L = esl_opt_GetInteger(go, "-L");
2049
2050 utest_CShufflers(r, L, alphabet, K);
2051 utest_CMarkovs (r, L, alphabet);
2052 utest_XShufflers(r, L, K);
2053 utest_XMarkovs (r, L, K);
2054
2055 utest_markov1_bug(r);
2056
2057 esl_randomness_Destroy(r);
2058 esl_getopts_Destroy(go);
2059 return 0;
2060 }
2061
2062 #endif /*eslRANDOMSEQ_TESTDRIVE*/
2063 /*----------------- end, test driver ----------------------------*/
2064
2065
2066 /*****************************************************************
2067 * 10. Example.
2068 *****************************************************************/
2069 #ifdef eslRANDOMSEQ_EXAMPLE
2070 /*::cexcerpt::randomseq_example::begin::*/
2071 /* compile: gcc -g -Wall -I. -o example -DeslRANDOMSEQ_EXAMPLE esl_randomseq.c\
2072 esl_random.c esl_sqio.c esl_sq.c easel.c -lm
2073 * run: ./example <FASTA file>
2074 */
2075 #include "easel.h"
2076 #include "esl_sq.h"
2077 #include "esl_sqio.h"
2078 #include "esl_random.h"
2079 #include "esl_randomseq.h"
2080
2081 int
main(int argc,char ** argv)2082 main(int argc, char **argv)
2083 {
2084 char *seqfile = argv[1];
2085 int format = eslSQFILE_UNKNOWN;
2086 ESL_SQFILE *sqfp = NULL;
2087 ESL_SQ *sq = esl_sq_Create();
2088 ESL_RANDOMNESS *r = esl_randomness_Create(0);
2089 int status;
2090
2091 if (esl_sqfile_Open(seqfile, format, NULL, &sqfp) != eslOK)
2092 esl_fatal("Failed to open %s\n", seqfile);
2093
2094 while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
2095 {
2096 printf("[Original sequence:]\n");
2097 esl_sqio_Write(stdout, sq, eslSQFILE_FASTA);
2098
2099 printf("[After shuffling:]\n");
2100 esl_rsq_CShuffle(r, sq->seq, sq->seq); /* shuffle in place */
2101 esl_sqio_Write(stdout, sq, eslSQFILE_FASTA);
2102
2103 esl_sq_Reuse(sq);
2104 }
2105 if (status != eslEOF) esl_fatal("Parse failed");
2106 esl_sqfile_Close(sqfp);
2107
2108 esl_sq_Destroy(sq);
2109 esl_randomness_Destroy(r);
2110 return 0;
2111 }
2112 /*::cexcerpt::randomseq_example::end::*/
2113 #endif /*eslRANDOMSEQ_EXAMPLE*/
2114 /*--------------------- end, example ----------------------------*/
2115