1 /* Implements the standard digitized alphabets for biosequences.
2 *
3 * 1. ESL_ALPHABET object for digital alphabets.
4 * 2. Digitized sequences (ESL_DSQ *).
5 * 3. Other routines in the API.
6 * 4. Unit tests.
7 * 5. Test driver.
8 * 6. Examples.
9 */
10 #include "esl_config.h"
11
12 #include <stdlib.h>
13 #include <string.h>
14 #include <ctype.h>
15 #include <math.h>
16 #ifdef HAVE_STRINGS_H
17 #include <strings.h> /* POSIX strcasecmp() */
18 #endif
19
20 #include "easel.h"
21 #include "esl_mem.h"
22
23 #include "esl_alphabet.h"
24
25
26
27 /*****************************************************************
28 * 1. The ESL_ALPHABET object
29 *****************************************************************/
30
31 static ESL_ALPHABET *create_rna(void);
32 static ESL_ALPHABET *create_dna(void);
33 static ESL_ALPHABET *create_amino(void);
34 static ESL_ALPHABET *create_coins(void);
35 static ESL_ALPHABET *create_dice(void);
36
37 static int set_complementarity(ESL_ALPHABET *a);
38
39 /* Function: esl_alphabet_Create()
40 * Synopsis: Create alphabet of a standard type.
41 *
42 * Purpose: Creates one of the three standard bio alphabets:
43 * <eslDNA>, <eslRNA>, or <eslAMINO>, and returns
44 * a pointer to it.
45 *
46 * Args: type - <eslDNA>, <eslRNA>, or <eslAMINO>.
47 *
48 * Returns: pointer to the new alphabet.
49 *
50 * Throws: <NULL> if any allocation or initialization fails.
51 */
52 ESL_ALPHABET *
esl_alphabet_Create(int type)53 esl_alphabet_Create(int type)
54 {
55 ESL_ALPHABET *a = NULL;
56
57 switch(type) {
58 case eslRNA: a = create_rna(); break;
59 case eslDNA: a = create_dna(); break;
60 case eslAMINO: a = create_amino(); break;
61 case eslCOINS: a = create_coins(); break;
62 case eslDICE: a = create_dice(); break;
63 default: esl_fatal("bad alphabet type: unrecognized"); // violation: must be a code error, not user.
64 }
65 return a;
66 }
67
68 /* Function: esl_alphabet_CreateCustom()
69 * Synopsis: Create a custom alphabet.
70 *
71 * Purpose: Creates a customized biosequence alphabet,
72 * and returns a ptr to it. The alphabet type is set
73 * to <eslNONSTANDARD>.
74 *
75 * <alphabet> is the internal alphabet string;
76 * <K> is the size of the base alphabet;
77 * <Kp> is the total size of the alphabet string.
78 *
79 * In the alphabet string, residues <0..K-1> are the base alphabet;
80 * residue <K> is the canonical gap (indel) symbol;
81 * residues <K+1..Kp-4> are additional degeneracy symbols (possibly 0 of them);
82 * residue <Kp-3> is an "any" symbol (such as N or X);
83 * residue <Kp-2> is a "nonresidue" symbol (such as *);
84 * and residue <Kp-1> is a "missing data" gap symbol.
85 *
86 * The two gap symbols, the nonresidue, and the "any"
87 * symbol are mandatory even for nonstandard alphabets, so
88 * <Kp> $\geq$ <K+4>.
89 *
90 * Args: alphabet - internal alphabet; example "ACGT-RYMKSWHBVDN*~"
91 * K - base size; example 4
92 * Kp - total size, including gap, degeneracies; example 18
93 *
94 * Returns: pointer to new <ESL_ALPHABET> structure.
95 *
96 * Throws: <NULL> if any allocation or initialization fails.
97 */
98 ESL_ALPHABET *
esl_alphabet_CreateCustom(const char * alphabet,int K,int Kp)99 esl_alphabet_CreateCustom(const char *alphabet, int K, int Kp)
100 {
101 ESL_ALPHABET *a = NULL;
102 int c,x,y;
103 int status;
104
105 /* Argument checks.
106 */
107 if (strlen(alphabet) != Kp) ESL_XEXCEPTION(eslEINVAL, "alphabet length != Kp");
108 if (Kp < K+4) ESL_XEXCEPTION(eslEINVAL, "Kp too small in alphabet");
109
110 /* Allocation/init, level 1.
111 */
112 ESL_ALLOC(a, sizeof(ESL_ALPHABET));
113 a->sym = NULL;
114 a->degen = NULL;
115 a->ndegen = NULL;
116 a->complement = NULL;
117
118 /* Allocation/init, level 2.
119 */
120 ESL_ALLOC(a->sym, sizeof(char) * (Kp+1));
121 ESL_ALLOC(a->ndegen, sizeof(int) * Kp);
122 ESL_ALLOC(a->degen, sizeof(char *) * Kp);
123 a->degen[0] = NULL;
124
125 /* Allocation/init, level 3.
126 */
127 ESL_ALLOC(a->degen[0], sizeof(char) * (Kp*K));
128 for (x = 1; x < Kp; x++)
129 a->degen[x] = a->degen[0]+(K*x);
130
131 /* Initialize the internal alphabet:
132 */
133 a->type = eslNONSTANDARD;
134 a->K = K;
135 a->Kp = Kp;
136 strcpy(a->sym, alphabet);
137
138 /* Initialize the input map, mapping ASCII seq chars to digital codes,
139 * and eslDSQ_ILLEGAL for everything else.
140 */
141 for (c = 0; c < 128; c++) a->inmap[c] = eslDSQ_ILLEGAL;
142 for (x = 0; x < a->Kp; x++) a->inmap[(int) a->sym[x]] = x;
143
144 /* Initialize the degeneracy map:
145 * Base alphabet (first K syms) are automatically
146 * mapped uniquely; (Kp-3) is assumed to be
147 * the "any" character; other degen chars (K+1..Kp-4) are
148 * unset; gap, nonresidue, missing character are unmapped (ndegen=0)
149 */
150 for (x = 0; x < a->Kp; x++) /* clear everything */
151 {
152 a->ndegen[x] = 0;
153 for (y = 0; y < a->K; y++) a->degen[x][y] = 0;
154 }
155 for (x = 0; x < a->K; x++) /* base alphabet */
156 {
157 a->ndegen[x] = 1;
158 a->degen[x][x] = 1;
159 }
160 /* "any" character */
161 a->ndegen[Kp-3] = K;
162 for (x = 0; x < a->K; x++) a->degen[Kp-3][x] = 1;
163
164 return a;
165
166 ERROR:
167 esl_alphabet_Destroy(a);
168 return NULL;
169 }
170
171
172 /* create_rna():
173 * Creates a standard RNA alphabet.
174 */
175 static ESL_ALPHABET *
create_rna(void)176 create_rna(void)
177 {
178 ESL_ALPHABET *a = NULL;
179 int status;
180
181 /* Create the fundamental alphabet
182 */
183 if ((a = esl_alphabet_CreateCustom("ACGU-RYMKSWHBVDN*~", 4, 18)) == NULL) return NULL;
184 a->type = eslRNA;
185
186 /* Add desired synonyms in the input map.
187 */
188 esl_alphabet_SetEquiv(a, 'T', 'U'); /* read T as a U */
189 esl_alphabet_SetEquiv(a, 'X', 'N'); /* read X as an N (many seq maskers use X) */
190 esl_alphabet_SetEquiv(a, 'I', 'A'); /* Inosine is a deaminated Adenosine, appears in some RNACentral sequences */
191 esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
192 esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
193 esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
194
195 /* Define degenerate symbols.
196 */
197 esl_alphabet_SetDegeneracy(a, 'R', "AG");
198 esl_alphabet_SetDegeneracy(a, 'Y', "CU");
199 esl_alphabet_SetDegeneracy(a, 'M', "AC");
200 esl_alphabet_SetDegeneracy(a, 'K', "GU");
201 esl_alphabet_SetDegeneracy(a, 'S', "CG");
202 esl_alphabet_SetDegeneracy(a, 'W', "AU");
203 esl_alphabet_SetDegeneracy(a, 'H', "ACU");
204 esl_alphabet_SetDegeneracy(a, 'B', "CGU");
205 esl_alphabet_SetDegeneracy(a, 'V', "ACG");
206 esl_alphabet_SetDegeneracy(a, 'D', "AGU");
207
208 if ( (status = set_complementarity(a)) != eslOK) goto ERROR;
209
210 return a;
211
212 ERROR:
213 esl_alphabet_Destroy(a);
214 return NULL;
215 }
216
217
218 /* create_dna():
219 * creates and returns a standard DNA alphabet.
220 */
221 static ESL_ALPHABET *
create_dna(void)222 create_dna(void)
223 {
224 ESL_ALPHABET *a = NULL;
225 int status;
226
227 /* Create the fundamental alphabet.
228 */
229 if ((a = esl_alphabet_CreateCustom("ACGT-RYMKSWHBVDN*~", 4, 18)) == NULL) return NULL;
230 a->type = eslDNA;
231
232 /* Add desired synonyms in the input map.
233 */
234 esl_alphabet_SetEquiv(a, 'U', 'T'); /* read U as a T */
235 esl_alphabet_SetEquiv(a, 'X', 'N'); /* read X as an N (many seq maskers use X) */
236 esl_alphabet_SetEquiv(a, 'I', 'A'); /* Inosine is a deaminated Adenosine, appears in some RNACentral sequences */
237 esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
238 esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
239 esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
240
241 /* Define IUBMB degenerate symbols other than the N.
242 */
243 esl_alphabet_SetDegeneracy(a, 'R', "AG");
244 esl_alphabet_SetDegeneracy(a, 'Y', "CT");
245 esl_alphabet_SetDegeneracy(a, 'M', "AC");
246 esl_alphabet_SetDegeneracy(a, 'K', "GT");
247 esl_alphabet_SetDegeneracy(a, 'S', "CG");
248 esl_alphabet_SetDegeneracy(a, 'W', "AT");
249 esl_alphabet_SetDegeneracy(a, 'H', "ACT");
250 esl_alphabet_SetDegeneracy(a, 'B', "CGT");
251 esl_alphabet_SetDegeneracy(a, 'V', "ACG");
252 esl_alphabet_SetDegeneracy(a, 'D', "AGT");
253
254 if ( (status = set_complementarity(a)) != eslOK) goto ERROR;
255 return a;
256
257 ERROR:
258 esl_alphabet_Destroy(a);
259 return NULL;
260 }
261
262
263 /* create_amino():
264 * Creates a new standard amino acid alphabet.
265 */
266 static ESL_ALPHABET *
create_amino(void)267 create_amino(void)
268 {
269 ESL_ALPHABET *a = NULL;
270
271 /* Create the internal alphabet
272 */
273 if ((a = esl_alphabet_CreateCustom("ACDEFGHIKLMNPQRSTVWY-BJZOUX*~", 20, 29)) == NULL) return NULL;
274 a->type = eslAMINO;
275
276 /* Add desired synonyms in the input map.
277 */
278 esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
279 esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
280 esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
281
282 /* Define IUPAC degenerate symbols other than the X.
283 */
284 esl_alphabet_SetDegeneracy(a, 'B', "ND");
285 esl_alphabet_SetDegeneracy(a, 'J', "IL");
286 esl_alphabet_SetDegeneracy(a, 'Z', "QE");
287
288 /* Define unusual residues as one-to-one degeneracies.
289 */
290 esl_alphabet_SetDegeneracy(a, 'U', "C"); /* selenocysteine is scored as cysteine */
291 esl_alphabet_SetDegeneracy(a, 'O', "K"); /* pyrrolysine is scored as lysine */
292
293 return a;
294 }
295
296
297 /* create_coins():
298 * Creates a toy alphabet for coin examples
299 */
300 static ESL_ALPHABET *
create_coins(void)301 create_coins(void)
302 {
303 ESL_ALPHABET *a = NULL;
304
305 /* Create the internal alphabet
306 */
307 if ((a = esl_alphabet_CreateCustom("HT-X*~", 2, 6)) == NULL) return NULL;
308 a->type = eslCOINS;
309
310 /* Add desired synonyms in the input map.
311 */
312 esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
313 esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
314 esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
315
316 /* There are no degeneracies in the coin alphabet. */
317
318 return a;
319 }
320
321 /* create_dice():
322 * Creates a toy alphabet for dice examples
323 */
324 static ESL_ALPHABET *
create_dice(void)325 create_dice(void)
326 {
327 ESL_ALPHABET *a = NULL;
328
329 /* Create the internal alphabet
330 */
331 if ((a = esl_alphabet_CreateCustom("123456-X*~", 6, 10)) == NULL) return NULL;
332 a->type = eslCOINS;
333
334 /* Add desired synonyms in the input map.
335 */
336 esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
337 esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
338 esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
339
340 /* There are no degeneracies in the dice alphabet. */
341
342 return a;
343 }
344
345 /* set_complementarity()
346 * Builds the "complement" lookup table for DNA, RNA alphabets.
347 *
348 * Throws <eslEINVAL> if the alphabet isn't <eslDNA> or <eslRNA>.
349 */
350 static int
set_complementarity(ESL_ALPHABET * a)351 set_complementarity(ESL_ALPHABET *a)
352 {
353 int status;
354
355 if (a->type != eslRNA && a->type != eslDNA)
356 ESL_EXCEPTION(eslEINVAL, "alphabet isn't nucleic: no complementarity to set");
357
358 /* We will assume that Kp=18 and sym="ACGT-RYMKSWHBVDN*~" (or RNA equiv).
359 * Bug #h108 happened because routine fell out of sync w/ a change in alphabet.
360 * Don't let that happen again.
361 */
362 ESL_DASSERT1(( a->Kp == 18 ));
363 ESL_DASSERT1(( a->sym[17] == '~' ));
364
365 ESL_ALLOC(a->complement, sizeof(ESL_DSQ) * a->Kp);
366 a->complement[0] = 3; /* A->T */
367 a->complement[1] = 2; /* C->G */
368 a->complement[2] = 1; /* G->C */
369 a->complement[3] = 0; /* T->A */
370 a->complement[4] = 4; /* - - */
371 a->complement[5] = 6; /* R->Y */
372 a->complement[6] = 5; /* Y->R */
373 a->complement[7] = 8; /* M->K */
374 a->complement[8] = 7; /* K->M */
375 a->complement[9] = 9; /* S S */
376 a->complement[10]= 10; /* W W */
377 a->complement[11]= 14; /* H->D */
378 a->complement[12]= 13; /* B->V */
379 a->complement[13]= 12; /* V->B */
380 a->complement[14]= 11; /* D->H */
381 a->complement[15]= 15; /* N N */
382 a->complement[16]= 16; /* * * */
383 a->complement[17]= 17; /* ~ ~ */
384
385 return eslOK;
386
387 ERROR:
388 return status;
389 }
390
391
392 /* Function: esl_alphabet_SetEquiv()
393 * Synopsis: Define an equivalent symbol.
394 *
395 * Purpose: Maps an additional input alphabetic symbol <sym> to
396 * an internal alphabet symbol <c>; for example,
397 * we might map T to U for an RNA alphabet, so that we
398 * allow for reading input DNA sequences.
399 *
400 * Args: sym - symbol to allow in the input alphabet; 'T' for example
401 * c - symbol to map <sym> to in the internal alphabet; 'U' for example
402 *
403 * Returns: <eslOK> on success.
404 *
405 * Throws: <eslEINVAL> if <c> is not in the internal alphabet, or if <sym> is.
406 */
407 int
esl_alphabet_SetEquiv(ESL_ALPHABET * a,char sym,char c)408 esl_alphabet_SetEquiv(ESL_ALPHABET *a, char sym, char c)
409 {
410 char *sp = NULL;
411 ESL_DSQ x;
412
413 /* Contract checks */
414 if ((sp = strchr(a->sym, sym)) != NULL)
415 ESL_EXCEPTION(eslEINVAL, "symbol %c is already in internal alphabet, can't equivalence it", sym);
416 if ((sp = strchr(a->sym, c)) == NULL)
417 ESL_EXCEPTION(eslEINVAL, "char %c not in the alphabet, can't map to it", c);
418
419 x = sp - a->sym;
420 a->inmap[(int) sym] = x;
421 return eslOK;
422 }
423
424 /* Function: esl_alphabet_SetCaseInsensitive()
425 * Synopsis: Make an alphabet's input map case-insensitive.
426 *
427 * Purpose: Given a custom alphabet <a>, with all equivalences set,
428 * make the input map case-insensitive: for every
429 * letter that is mapped in either lower or upper
430 * case, map the other case to the same internal
431 * residue.
432 *
433 * For the standard alphabets, this is done automatically.
434 *
435 * Args: a - alphabet to make case-insensitive.
436 *
437 * Returns: <eslOK> on success.
438 *
439 * Throws: <eslECORRUPT> if any lower/uppercase symbol pairs
440 * are already both mapped to different symbols.
441 */
442 int
esl_alphabet_SetCaseInsensitive(ESL_ALPHABET * a)443 esl_alphabet_SetCaseInsensitive(ESL_ALPHABET *a)
444 {
445 int lc, uc;
446
447 for (lc = 'a'; lc <= 'z'; lc++)
448 {
449 uc = toupper(lc);
450
451 if (esl_abc_CIsValid(a, lc) && ! esl_abc_CIsValid(a, uc)) a->inmap[uc] = a->inmap[lc];
452 else if (esl_abc_CIsValid(a, uc) && ! esl_abc_CIsValid(a, lc)) a->inmap[lc] = a->inmap[uc];
453 else if (esl_abc_CIsValid(a, lc) && esl_abc_CIsValid(a, uc) && a->inmap[uc] != a->inmap[lc])
454 ESL_EXCEPTION(eslECORRUPT, "symbols %c and %c map differently already (%c vs. %c)",
455 lc, uc, a->inmap[lc], a->inmap[uc]);
456 }
457 return eslOK;
458 }
459
460 /* Function: esl_alphabet_SetDegeneracy()
461 * Synopsis: Define degenerate symbol in custom alphabet.
462 *
463 * Purpose: Given an alphabet under construction,
464 * define the degenerate character <c> to mean
465 * any of the characters in the string <ds>.
466 *
467 * <c> must exist in the digital alphabet, as
468 * one of the optional degenerate residues (<K+1>..<Kp-3>).
469 * All the characters in the <ds> string must exist
470 * in the canonical alphabet (<0>..<K-1>).
471 *
472 * You may not redefine the mandatory all-degenerate character
473 * (typically <N> or <X>; <Kp-3> in the digital alphabet).
474 * It is defined automatically in all alphabets.
475 *
476 * Args: a - an alphabet under construction.
477 * c - degenerate character code; example: 'R'
478 * ds - string of base characters for c; example: "AG"
479 *
480 * Returns: <eslOK> on success.
481 *
482 * Throws: <eslEINVAL> if <c> or <ds> arguments aren't valid.
483 */
484 int
esl_alphabet_SetDegeneracy(ESL_ALPHABET * a,char c,char * ds)485 esl_alphabet_SetDegeneracy(ESL_ALPHABET *a, char c, char *ds)
486 {
487 char *sp;
488 ESL_DSQ x,y;
489
490 if ((sp = strchr(a->sym, c)) == NULL)
491 ESL_EXCEPTION(eslEINVAL, "no such degenerate character");
492 x = sp - a->sym;
493
494 /* A degenerate character must have code K+1..Kp-4.
495 * Kp-3, the all-degenerate character, is automatically
496 * created, and can't be remapped.
497 */
498 if (x == a->Kp-3)
499 ESL_EXCEPTION(eslEINVAL, "can't redefine all-degenerate char %c", c);
500 if (x < a->K+1 || x >= a->Kp-2)
501 ESL_EXCEPTION(eslEINVAL, "char %c isn't in expected position in alphabet", c);
502
503 while (*ds != '\0') {
504 if ((sp = strchr(a->sym, *ds)) == NULL) ESL_EXCEPTION(eslEINVAL, "no such base character");
505 y = sp - a->sym;
506 if (! esl_abc_XIsCanonical(a, y)) ESL_EXCEPTION(eslEINVAL, "can't map degeneracy to noncanonical character");
507
508 a->degen[x][y] = 1;
509 a->ndegen[x]++;
510 ds++;
511 }
512 return eslOK;
513 }
514
515
516 /* Function: esl_alphabet_SetIgnored()
517 * Synopsis: Define a set of characters to be ignored in input.
518 *
519 * Purpose: Given an alphabet <a> (either standard or custom), define
520 * all the characters in string <ignoredchars> to be
521 * unmapped: valid, but ignored when converting input text.
522 *
523 * By default, the standard alphabets do not define any
524 * ignored characters.
525 *
526 * The most common ignored characters would be space, tab,
527 * and digits, to skip silently over whitespace and
528 * sequence coordinates when parsing loosely-defined
529 * sequence file formats.
530 *
531 * Args: a - alphabet to modify
532 * ignoredchars - string listing characters to ignore; i.e. " \t"
533 *
534 * Returns: <eslOK> on success.
535 */
536 int
esl_alphabet_SetIgnored(ESL_ALPHABET * a,const char * ignoredchars)537 esl_alphabet_SetIgnored(ESL_ALPHABET *a, const char *ignoredchars)
538 {
539 int i;
540 for (i = 0; ignoredchars[i] != '\0'; i++) a->inmap[(int)ignoredchars[i]] = eslDSQ_IGNORED;
541 return eslOK;
542 }
543
544
545 /* Function: esl_alphabet_Sizeof()
546 * Synopsis: Returns size of an alphabet object, in bytes.
547 *
548 * Purpose: Returns the size of alphabet <a> object, in bytes.
549 */
550 size_t
esl_alphabet_Sizeof(ESL_ALPHABET * a)551 esl_alphabet_Sizeof(ESL_ALPHABET *a)
552 {
553 size_t n = 0;
554 n += sizeof(ESL_ALPHABET);
555 n += sizeof(char) * a->Kp; /* a->sym */
556 n += sizeof(char *) * a->Kp; /* a->degen */
557 n += sizeof(char) * (a->Kp * a->K); /* a->degen[][] */
558 n += sizeof(int) * a->Kp; /* a->ndegen */
559 if (a->complement) n += sizeof(ESL_DSQ) * a->Kp; /* a->complement */
560 return n;
561 }
562
563 /* Function: esl_alphabet_Destroy()
564 * Synopsis: Frees an alphabet object.
565 *
566 * Purpose: Free's an <ESL_ALPHABET> structure.
567 *
568 * Args: a - the <ESL_ALPHABET> to free.
569 *
570 * Returns: (void).
571 */
572 void
esl_alphabet_Destroy(ESL_ALPHABET * a)573 esl_alphabet_Destroy(ESL_ALPHABET *a)
574 {
575 if (a)
576 {
577 if (a->sym) free(a->sym);
578 if (a->ndegen) free(a->ndegen);
579 if (a->degen)
580 {
581 if (a->degen[0]) free(a->degen[0]);
582 free(a->degen);
583 }
584 if (a->complement) free(a->complement);
585 free(a);
586 }
587 }
588 /*--------------- end, ESL_ALPHABET object ----------------------*/
589
590
591
592
593
594 /*****************************************************************
595 * 2. Digitized sequences (ESL_DSQ *)
596 *****************************************************************/
597 /* Design note: SRE, Mon Sep 18 09:11:41 2006
598 *
599 * An ESL_DSQ is considered to a special string type, equivalent to
600 * <char *>, and is not considered to be an Easel "object". Thus it
601 * does not have a standard object API. Rather, the caller deals with
602 * an ESL_DSQ directly: allocate for <(L+2)*sizeof(ESL_DSQ)> to leave
603 * room for sentinels at <0> and <L+1>.
604 *
605 * Additionally, an ESL_DSQ is considered to be "trusted"
606 * data: we're 'guaranteed' that anything in an ESL_DSQ is a valid
607 * symbol, so we don't need to error-check. Anything else is a programming
608 * error.
609 */
610
611 /* Function: esl_abc_CreateDsq()
612 * Synopsis: Digitizes a sequence into new space.
613 *
614 * Purpose: Given an alphabet <a> and an ASCII sequence <seq>,
615 * digitize the sequence into newly allocated space, and
616 * return a pointer to that space in <ret_dsq>.
617 *
618 * Args: a - internal alphabet
619 * seq - text sequence to be digitized
620 * ret_dsq - RETURN: the new digital sequence
621 *
622 * Returns: <eslOK> on success, and <ret_dsq> contains the digitized
623 * sequence; caller is responsible for free'ing this
624 * memory. Returns <eslEINVAL> if <seq> contains
625 * one or more characters that are not in the input map of
626 * alphabet <a>. If this happens, <ret_dsq> is still valid upon
627 * return: invalid characters are replaced by full ambiguities
628 * (typically X or N).
629 *
630 * Throws: <eslEMEM> on allocation failure.
631 *
632 * Xref: STL11/63
633 */
634 int
esl_abc_CreateDsq(const ESL_ALPHABET * a,const char * seq,ESL_DSQ ** ret_dsq)635 esl_abc_CreateDsq(const ESL_ALPHABET *a, const char *seq, ESL_DSQ **ret_dsq)
636 {
637 ESL_DSQ *dsq = NULL;
638 int status;
639 int64_t L;
640
641 L = strlen(seq);
642 ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
643 status = esl_abc_Digitize(a, seq, dsq);
644
645 if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);
646 return status;
647
648 ERROR:
649 if (dsq != NULL) free(dsq);
650 if (ret_dsq != NULL) *ret_dsq = NULL;
651 return status;
652 }
653
654
655 /* Function: esl_abc_Digitize()
656 * Synopsis: Digitizes a sequence into existing space.
657 *
658 * Purpose: Given an alphabet <a> and a nul-terminated ASCII sequence
659 * <seq>, digitize the sequence and put it in <dsq>. Caller
660 * provides space in <dsq> allocated for at least <L+2>
661 * <ESL_DSQ> residues, where <L> is the length of <seq>.
662 *
663 * Args: a - internal alphabet
664 * seq - text sequence to be digitized (\0-terminated)
665 * dsq - RETURN: the new digital sequence (caller allocates,
666 * at least <(L+2) * sizeof(ESL_DSQ)>).
667 *
668 * Returns: <eslOK> on success.
669 * Returns <eslEINVAL> if <seq> contains one or more characters
670 * that are not recognized in the alphabet <a>. (This is classed
671 * as a normal error, because the <seq> may be untrusted user input.)
672 * If this happens, the digital sequence <dsq> is still valid upon
673 * return; invalid ASCII characters are replaced by ambiguities
674 * (X or N).
675 */
676 int
esl_abc_Digitize(const ESL_ALPHABET * a,const char * seq,ESL_DSQ * dsq)677 esl_abc_Digitize(const ESL_ALPHABET *a, const char *seq, ESL_DSQ *dsq)
678 {
679 int status;
680 int64_t i; /* position in seq */
681 int64_t j; /* position in dsq */
682 ESL_DSQ x;
683
684 status = eslOK;
685 dsq[0] = eslDSQ_SENTINEL;
686 for (i = 0, j = 1; seq[i] != '\0'; i++)
687 {
688 x = a->inmap[(int) seq[i]];
689 if (esl_abc_XIsValid(a, x)) dsq[j] = x;
690 else if (x == eslDSQ_IGNORED) continue;
691 else {
692 status = eslEINVAL;
693 dsq[j] = esl_abc_XGetUnknown(a);
694 }
695 j++;
696 }
697 dsq[j] = eslDSQ_SENTINEL;
698 return status;
699 }
700
701 /* Function: esl_abc_Textize()
702 * Synopsis: Convert digital sequence to text.
703 *
704 * Purpose: Make an ASCII sequence <seq> by converting a digital
705 * sequence <dsq> of length <L> back to text, according to
706 * the digital alphabet <a>.
707 *
708 * Caller provides space in <seq> allocated for at least
709 * <L+1> bytes (<(L+1) * sizeof(char)>).
710 *
711 * Args: a - internal alphabet
712 * dsq - digital sequence to be converted (1..L)
713 * L - length of dsq
714 * seq - RETURN: the new text sequence (caller allocated
715 * space, at least <(L+1) * sizeof(char)>).
716 *
717 * Returns: <eslOK> on success.
718 */
719 int
esl_abc_Textize(const ESL_ALPHABET * a,const ESL_DSQ * dsq,int64_t L,char * seq)720 esl_abc_Textize(const ESL_ALPHABET *a, const ESL_DSQ *dsq, int64_t L, char *seq)
721 {
722 int64_t i;
723
724 for (i = 0; i < L; i++)
725 seq[i] = a->sym[dsq[i+1]];
726 seq[i] = '\0';
727 return eslOK;
728 }
729
730
731 /* Function: esl_abc_TextizeN()
732 * Synopsis: Convert subsequence from digital to text.
733 *
734 * Purpose: Similar in semantics to <strncpy()>, this procedure takes
735 * a window of <L> residues in a digitized sequence
736 * starting at the residue pointed to by <dptr>,
737 * converts them to ASCII text representation, and
738 * copies them into the buffer <buf>.
739 *
740 * <buf> must be at least <L> residues long; <L+1>, if the
741 * caller needs to NUL-terminate it.
742 *
743 * If a sentinel byte is encountered in the digitized
744 * sequence before <L> residues have been copied, <buf> is
745 * NUL-terminated there. Otherwise, like <strncpy()>, <buf>
746 * will not be NUL-terminated.
747 *
748 * Note that because digital sequences are indexed <1..N>,
749 * not <0..N-1>, the caller must be careful about
750 * off-by-one errors in <dptr>. For example, to copy from
751 * the first residue of a digital sequence <dsq>, you must
752 * pass <dptr=dsq+1>, not <dptr=dsq>. The text in <buf>
753 * on the other hand is a normal C string indexed <0..L-1>.
754 *
755 * Args: a - reference to an internal alphabet
756 * dptr - ptr to starting residue in a digital sequence
757 * L - number of residues to convert and copy
758 * buf - text buffer to store the <L> converted residues in
759 *
760 * Returns: <eslOK> on success.
761 */
762 int
esl_abc_TextizeN(const ESL_ALPHABET * a,const ESL_DSQ * dptr,int64_t L,char * buf)763 esl_abc_TextizeN(const ESL_ALPHABET *a, const ESL_DSQ *dptr, int64_t L, char *buf)
764 {
765 int64_t i;
766
767 for (i = 0; i < L; i++)
768 {
769 if (dptr[i] == eslDSQ_SENTINEL)
770 {
771 buf[i] = '\0';
772 return eslOK;
773 }
774 buf[i] = a->sym[dptr[i]];
775 }
776 return eslOK;
777 }
778
779
780 /* Function: esl_abc_dsqcpy()
781 *
782 * Purpose: Given a digital sequence <dsq> of length <L>,
783 * make a copy of it in <dcopy>. Caller provides
784 * storage in <dcopy> for at least <L+2> <ESL_DSQ>
785 * residues.
786 *
787 * Returns: <eslOK> on success.
788 */
789 int
esl_abc_dsqcpy(const ESL_DSQ * dsq,int64_t L,ESL_DSQ * dcopy)790 esl_abc_dsqcpy(const ESL_DSQ *dsq, int64_t L, ESL_DSQ *dcopy)
791 {
792 memcpy(dcopy, dsq, sizeof(ESL_DSQ) * (L+2));
793 return eslOK;
794 }
795
796
797 /* Function: esl_abc_dsqdup()
798 * Synopsis: Duplicate a digital sequence.
799 *
800 * Purpose: Like <esl_strdup()>, but for digitized sequences:
801 * make a duplicate of <dsq> and leave it in <ret_dup>.
802 * Caller can pass the string length <L> if it's known, saving
803 * some overhead; else pass <-1> and the length will be
804 * determined for you.
805 *
806 * Tolerates <dsq> being <NULL>; in which case, returns
807 * <eslOK> with <*ret_dup> set to <NULL>.
808 *
809 * Args: dsq - digital sequence to duplicate (w/ sentinels at 0,L+1)
810 * L - length of dsq in residues, if known; -1 if unknown
811 * ret_dup - RETURN: allocated duplicate of <dsq>, which caller will
812 * free.
813 *
814 * Returns: <eslOK> on success, and leaves a pointer in <ret_dup>.
815 *
816 * Throws: <eslEMEM> on allocation failure.
817 *
818 * Xref: STL11/48
819 */
820 int
esl_abc_dsqdup(const ESL_DSQ * dsq,int64_t L,ESL_DSQ ** ret_dup)821 esl_abc_dsqdup(const ESL_DSQ *dsq, int64_t L, ESL_DSQ **ret_dup)
822 {
823 int status;
824 ESL_DSQ *new = NULL;
825
826 if (ret_dup == NULL) return eslOK; /* no-op. */
827
828 *ret_dup = NULL;
829 if (dsq == NULL) return eslOK;
830 if (L < 0) L = esl_abc_dsqlen(dsq);
831
832 ESL_ALLOC(new, sizeof(ESL_DSQ) * (L+2));
833 memcpy(new, dsq, sizeof(ESL_DSQ) * (L+2));
834
835 *ret_dup = new;
836 return eslOK;
837
838 ERROR:
839 if (new != NULL) free(new);
840 if (ret_dup != NULL) *ret_dup = NULL;
841 return status;
842 }
843
844
845 /* Function: esl_abc_dsqcat()
846 * Synopsis: Concatenate and map input chars to a digital sequence.
847 *
848 * Purpose: Append the contents of string or memory line <s> of
849 * length <n> to a digital sequence, after digitizing
850 * each input character in <s> according to an Easel
851 * <inmap>. The destination sequence and its length
852 * are passed by reference, <*dsq> and <*L>, so that
853 * the sequence may be reallocated and the length updated
854 * upon return.
855 *
856 * The input map <inmap> may map characters to
857 * <eslDSQ_IGNORED> or <eslDSQ_ILLEGAL>, but not to <eslDSQ_EOL>,
858 * <eslDSQ_EOD>, or <eslDSQ_SENTINEL> codes. <inmap[0]> is
859 * special, and must be set to the code for the 'unknown'
860 * residue (such as 'X' for proteins, 'N' for DNA) that
861 * will be used to replace any invalid <eslDSQ_ILLEGAL>
862 * characters.
863 *
864 * If <*dsq> is properly terminated digital sequence and
865 * the caller doesn't know its length, <*L> may be passed
866 * as -1. Providing the length when it's known saves an
867 * <esl_abc_dsqlen()> call. If <*dsq> is unterminated, <*L>
868 * is mandatory. Essentially the same goes for <*s>, which
869 * may be a NUL-terminated string (pass <n=-1> if length unknown),
870 * or a memory line (<n> is mandatory).
871 *
872 * <*dsq> may also be <NULL>, in which case it is allocated
873 * and initialized here.
874 *
875 * Caller should provide an <s> that is expected to be
876 * essentially all appendable to <*dsq> except for a small
877 * number of chars that map to <eslDSQ_IGNORE>, like an
878 * input sequence data line from a file, for example. We're
879 * going to reallocate <*dsq> to size <*L+n>; if <n> is an
880 * entire large buffer or file, this reallocation will be
881 * inefficient.
882 *
883 * Args: inmap - an Easel input map, inmap[0..127];
884 * inmap[0] is special, set to the 'unknown' character
885 * to replace invalid input chars.
886 * dsq - reference to the current digital seq to append to
887 * (with sentinel bytes at 0,L+1); may be <NULL>.
888 * Upon return, this will probably have
889 * been reallocated, and it will contain the original
890 * <dsq> with <s> digitized and appended.
891 * L - reference to the current length of <dsq> in residues;
892 * may be <-1> if unknown and if <*dsq> is a properly
893 * terminated digital sequence. Upon return, <L> is set to
894 * the new length of <dsq>, after <s> is appended.
895 * s - ASCII text sequence to append. May
896 * contain ignored text characters (flagged with
897 * <eslDSQ_IGNORED> in the input map of alphabet <abc>).
898 * n - Length of <s> in characters, if known; or <-1> if
899 * unknown and if <s> is a NUL-terminated string.
900 *
901 * Returns: <eslOK> on success; <*dsq> contains the result of digitizing
902 * and appending <s> to the original <*dsq>; and <*L> contains
903 * the new length of the <dsq> result in residues.
904 *
905 * If any of the characters in <s> are illegal in the
906 * alphabet <abc>, these characters are digitized as
907 * unknown residues (using <inmap[0]>) and
908 * concatenation/digitization proceeds to completion, but
909 * the function returns <eslEINVAL>. The caller might then
910 * want to call <esl_abc_ValidateSeq()> on <s> if it wants
911 * to figure out where digitization goes awry and get a
912 * more informative error report. This is a normal error,
913 * because the string <s> might be user input.
914 *
915 * Throws: <eslEMEM> on allocation or reallocation failure;
916 * <eslEINCONCEIVABLE> on coding error.
917 *
918 * Xref: SRE:STL11/48; SRE:J7/145.
919 *
920 * Note: This closely parallels a text mode version, <esl_strmapcat()>.
921 */
922 int
esl_abc_dsqcat(const ESL_DSQ * inmap,ESL_DSQ ** dsq,int64_t * L,const char * s,esl_pos_t n)923 esl_abc_dsqcat(const ESL_DSQ *inmap, ESL_DSQ **dsq, int64_t *L, const char *s, esl_pos_t n)
924 {
925 int status = eslOK;
926
927 if (*L < 0) *L = ((*dsq) ? esl_abc_dsqlen(*dsq) : 0);
928 if ( n < 0) n = ( (s) ? strlen(s) : 0);
929
930 if (n == 0) { goto ERROR; } /* that'll return eslOK, leaving *dest untouched, and *ldest its length */
931
932 if (*dsq == NULL) { /* an entirely new dsq is allocated *and* initialized with left sentinel. */
933 ESL_ALLOC(*dsq, sizeof(ESL_DSQ) * (n+2));
934 (*dsq)[0] = eslDSQ_SENTINEL;
935 } else /* else, existing dsq is just reallocated; leftmost sentinel already in place. */
936 ESL_REALLOC(*dsq, sizeof(ESL_DSQ) * (*L+n+2)); /* most we'll need */
937
938 return esl_abc_dsqcat_noalloc(inmap, *dsq, L, s, n);
939
940 ERROR:
941 return status;
942 }
943
944 /* Function: esl_abc_dsqcat_noalloc()
945 * Synopsis: Version of esl_abc_dsqcat() that does no reallocation.
946 *
947 * Purpose: Same as <esl_abc_dsqcat()>, but with no reallocation of
948 * <dsq>. The pointer to the destination string <dsq> is
949 * passed by value not by reference, because it will not
950 * be reallocated or moved. Caller has already allocated
951 * at least <*L + n + 2> bytes in <dsq>. <*L> and <n> are
952 * not optional; caller must know (and provide) the lengths
953 * of both the old string and the new source.
954 *
955 * Note: This version was needed in selex format parsing, where
956 * we need to prepend and append some number of gaps on
957 * each new line of each block of input; allocating once
958 * then adding the gaps and the sequence seemed most efficient.
959 */
960 int
esl_abc_dsqcat_noalloc(const ESL_DSQ * inmap,ESL_DSQ * dsq,int64_t * L,const char * s,esl_pos_t n)961 esl_abc_dsqcat_noalloc(const ESL_DSQ *inmap, ESL_DSQ *dsq, int64_t *L, const char *s, esl_pos_t n)
962 {
963 int64_t xpos;
964 esl_pos_t cpos;
965 ESL_DSQ x;
966 int status = eslOK;
967
968 /* Watch these coords. Start in the 0..n-1 text string at 0;
969 * start in the 1..L dsq at L+1, overwriting its terminal
970 * sentinel byte.
971 */
972 for (xpos = *L+1, cpos = 0; cpos < n; cpos++)
973 {
974 if (! isascii(s[cpos])) { dsq[xpos++] = inmap[0]; status = eslEINVAL; continue; }
975
976 x = inmap[(int) s[cpos]];
977
978 if (x <= 127) dsq[xpos++] = x;
979 else switch (x) {
980 case eslDSQ_SENTINEL: ESL_EXCEPTION(eslEINCONCEIVABLE, "input char mapped to eslDSQ_SENTINEL"); break;
981 case eslDSQ_ILLEGAL: dsq[xpos++] = inmap[0]; status = eslEINVAL; break;
982 case eslDSQ_IGNORED: break;
983 case eslDSQ_EOL: ESL_EXCEPTION(eslEINCONCEIVABLE, "input char mapped to eslDSQ_EOL"); break;
984 case eslDSQ_EOD: ESL_EXCEPTION(eslEINCONCEIVABLE, "input char mapped to eslDSQ_EOD"); break;
985 default: ESL_EXCEPTION(eslEINCONCEIVABLE, "bad inmap, no such ESL_DSQ code"); break;
986 }
987 }
988 dsq[xpos] = eslDSQ_SENTINEL;
989 *L = xpos-1;
990 return status;
991 }
992
993 /* Function: esl_abc_dsqlen()
994 * Synopsis: Returns the length of a digital sequence.
995 *
996 * Purpose: Returns the length of digitized sequence <dsq> in
997 * positions (including gaps, if any). The <dsq> must be
998 * properly terminated by a sentinel byte
999 * (<eslDSQ_SENTINEL>).
1000 */
1001 int64_t
esl_abc_dsqlen(const ESL_DSQ * dsq)1002 esl_abc_dsqlen(const ESL_DSQ *dsq)
1003 {
1004 int64_t n = 0;
1005 while (dsq[n+1] != eslDSQ_SENTINEL) n++;
1006 return n;
1007 }
1008
1009 /* Function: esl_abc_dsqrlen()
1010 * Synopsis: Returns the number of residues in a digital seq.
1011 *
1012 * Purpose: Returns the unaligned length of digitized sequence
1013 * <dsq>, in residues, not counting any gaps, nonresidues,
1014 * or missing data symbols.
1015 */
1016 int64_t
esl_abc_dsqrlen(const ESL_ALPHABET * abc,const ESL_DSQ * dsq)1017 esl_abc_dsqrlen(const ESL_ALPHABET *abc, const ESL_DSQ *dsq)
1018 {
1019 int64_t n = 0;
1020 int64_t i;
1021
1022 for (i = 1; dsq[i] != eslDSQ_SENTINEL; i++)
1023 if (esl_abc_XIsResidue(abc, dsq[i])) n++;
1024 return n;
1025 }
1026
1027 /* Function: esl_abc_CDealign()
1028 * Synopsis: Dealigns a text string, using a reference digital aseq.
1029 *
1030 * Purpose: Dealigns <s> in place by removing characters aligned to
1031 * gaps (or missing data symbols) in the reference digital
1032 * aligned sequence <ref_ax>. Gaps and missing data symbols
1033 * in <ref_ax> are defined by its digital alphabet <abc>.
1034 *
1035 * <s> is typically going to be some kind of textual
1036 * annotation string (secondary structure, consensus, or
1037 * surface accessibility).
1038 *
1039 * Be supercareful of off-by-one errors here! The <ref_ax>
1040 * is a digital sequence that is indexed <1..L>. The
1041 * annotation string <s> is assumed to be <0..L-1> (a
1042 * normal C string), off by one with respect to <ref_ax>.
1043 * In a sequence object, ss annotation is actually stored
1044 * <1..L> -- so if you're going to <esl_abc_CDealign()> a
1045 * <sq->ss>, pass <sq->ss+1> as the argument <s>.
1046 *
1047 * Returns: Returns <eslOK> on success; optionally returns the number
1048 * of characters in the dealigned <s> in <*opt_rlen>.
1049 *
1050 * Throws: (no abnormal error conditions)
1051 */
1052 int
esl_abc_CDealign(const ESL_ALPHABET * abc,char * s,const ESL_DSQ * ref_ax,int64_t * opt_rlen)1053 esl_abc_CDealign(const ESL_ALPHABET *abc, char *s, const ESL_DSQ *ref_ax, int64_t *opt_rlen)
1054 {
1055 int64_t apos;
1056 int64_t n = 0;
1057
1058 if (s == NULL) return eslOK;
1059
1060 for (n=0, apos=1; ref_ax[apos] != eslDSQ_SENTINEL; apos++)
1061 if (! esl_abc_XIsGap(abc, ref_ax[apos]) && ! esl_abc_XIsMissing(abc, ref_ax[apos]) )
1062 s[n++] = s[apos-1]; /* apos-1 because we assume s was 0..alen-1, whereas ref_ax was 1..alen */
1063 s[n] = '\0';
1064
1065 if (opt_rlen != NULL) *opt_rlen = n;
1066 return eslOK;
1067 }
1068
1069 /* Function: esl_abc_XDealign()
1070 * Synopsis: Dealigns a digital string, using a reference digital aseq.
1071 *
1072 * Purpose: Dealigns <x> in place by removing characters aligned to
1073 * gaps (or missing data) in the reference digital aligned
1074 * sequence <ref_ax>. Gaps and missing data symbols in
1075 * <ref_ax> are defined by its digital alphabet <abc>.
1076 *
1077 * Returns: Returns <eslOK> on success; optionally returns the number
1078 * of characters in the dealigned <x> in <*opt_rlen>.
1079 *
1080 * Throws: (no abnormal error conditions)
1081 */
1082 int
esl_abc_XDealign(const ESL_ALPHABET * abc,ESL_DSQ * x,const ESL_DSQ * ref_ax,int64_t * opt_rlen)1083 esl_abc_XDealign(const ESL_ALPHABET *abc, ESL_DSQ *x, const ESL_DSQ *ref_ax, int64_t *opt_rlen)
1084 {
1085 int64_t apos;
1086 int64_t n = 0;
1087
1088 if (x == NULL) return eslOK;
1089
1090 x[0] = eslDSQ_SENTINEL;
1091 for (n=1, apos=1; ref_ax[apos] != eslDSQ_SENTINEL; apos++)
1092 if (! esl_abc_XIsGap(abc, ref_ax[apos]) && ! esl_abc_XIsMissing(abc, ref_ax[apos]) )
1093 x[n++] = x[apos];
1094 x[n] = eslDSQ_SENTINEL;
1095
1096 if (opt_rlen != NULL) *opt_rlen = n-1;
1097 return eslOK;
1098 }
1099
1100 /* Function: esl_abc_ConvertDegen2X()
1101 * Synopsis: Convert all degenerate residues to X or N.
1102 *
1103 * Purpose: Convert all the degenerate residue codes in digital
1104 * sequence <dsq> to the code for the maximally degenerate
1105 * "unknown residue" code, as specified in digital alphabet
1106 * <abc>. (For example, X for protein, N for nucleic acid.)
1107 *
1108 * This comes in handy when you're dealing with some piece
1109 * of software that can't deal with standard residue codes,
1110 * and you want to massage your sequences into a form that
1111 * can be accepted. For example, WU-BLAST can't deal with O
1112 * (pyrrolysine) residues, but UniProt has O codes.
1113 *
1114 * Returns: <eslOK> on success.
1115 *
1116 * Throws: (no abnormal error conditions)
1117 */
1118 int
esl_abc_ConvertDegen2X(const ESL_ALPHABET * abc,ESL_DSQ * dsq)1119 esl_abc_ConvertDegen2X(const ESL_ALPHABET *abc, ESL_DSQ *dsq)
1120 {
1121 int64_t i;
1122
1123 for (i = 1; dsq[i] != eslDSQ_SENTINEL; i++)
1124 if (esl_abc_XIsDegenerate(abc, dsq[i]))
1125 dsq[i] = esl_abc_XGetUnknown(abc);
1126 return eslOK;
1127 }
1128
1129
1130 /* Function: esl_abc_revcomp()
1131 * Synopsis: Reverse complement a digital sequence
1132 * Incept: SRE, Wed Feb 10 11:54:48 2016 [JB251 BOS-MCO]
1133 *
1134 * Purpose: Reverse complement <dsq>, in place, according to
1135 * its digital alphabet <abc>.
1136 *
1137 * Args: abc - digital alphabet
1138 * dsq - digital sequence, 1..n
1139 * n - length of <dsq>
1140 *
1141 * Returns: <eslOK> on success.
1142 *
1143 * Throws: <eslEINCOMPAT> if alphabet <abc> can't be reverse complemented
1144 */
1145 int
esl_abc_revcomp(const ESL_ALPHABET * abc,ESL_DSQ * dsq,int n)1146 esl_abc_revcomp(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int n)
1147 {
1148 ESL_DSQ x;
1149 int pos;
1150
1151 if (abc->complement == NULL)
1152 ESL_EXCEPTION(eslEINCOMPAT, "tried to reverse complement using an alphabet that doesn't have one");
1153
1154 for (pos = 1; pos <= n/2; pos++)
1155 {
1156 x = abc->complement[dsq[n-pos+1]];
1157 dsq[n-pos+1] = abc->complement[dsq[pos]];
1158 dsq[pos] = x;
1159 }
1160 if (n%2) dsq[pos] = abc->complement[dsq[pos]];
1161 return eslOK;
1162 }
1163
1164
1165
1166 /*-------------- end, digital sequences (ESL_DSQ) ---------------*/
1167
1168
1169 /*****************************************************************
1170 * 3. Other routines in the API.
1171 *****************************************************************/
1172
1173 /* Function: esl_abc_ValidateType()
1174 * Synopsis: Check that an alphabet is known and valid
1175 * Incept: SRE, Thu Feb 11 15:48:23 2016
1176 *
1177 * Purpose: Returns <eslOK> if <type> is a valid and known Easel
1178 * alphabet type code.
1179 *
1180 * Used to validate "user" input, where we're parsing a
1181 * file format that has stored an Easel alphabet code.
1182 *
1183 * Returns <eslFAIL> for the special <eslUNKNOWN> "unset"
1184 * value, even though that is a valid code, because it's
1185 * not an alphabet, so shouldn't show up in a file.
1186 */
1187 int
esl_abc_ValidateType(int type)1188 esl_abc_ValidateType(int type)
1189 {
1190 if (type <= 0 || type > eslNONSTANDARD) return eslFAIL;
1191 else return eslOK;
1192 }
1193
1194
1195 /* Function: esl_abc_GuessAlphabet()
1196 * Synopsis: Guess alphabet type from residue composition.
1197 *
1198 * Purpose: Guess the alphabet type from a residue composition.
1199 * The input <ct[0..25]> array contains observed counts of
1200 * the letters A..Z, case-insensitive.
1201 *
1202 * The composition <ct> must contain more than 10 residues.
1203 *
1204 * If it contains $\geq$98\% ACGTN and all four of the
1205 * residues ACGT occur, call it <eslDNA>. (Analogously for
1206 * ACGUN, ACGU: call <eslRNA>.)
1207 *
1208 * If it contains any amino-specific residue (EFIJLPQZ),
1209 * call it <eslAMINO>.
1210 *
1211 * If it consists of $\geq$98\% canonical aa residues or X,
1212 * and at least 15 of the different 20 aa residues occur,
1213 * and the number of residues that are canonical aa/degenerate
1214 * nucleic (DHKMRSVWY) is greater than the number of canonicals
1215 * for both amino and nucleic (ACG); then call it <eslAMINO>.
1216 *
1217 * As a special case, if it consists entirely of N's, and
1218 * we have >2000 residues, call it <eslDNA>. This is a
1219 * special case that deals with genome sequence assemblies
1220 * that lead with a swath of N's.
1221 *
1222 * We aim to be very conservative, essentially never making
1223 * a false call; we err towards calling <eslUNKNOWN> if
1224 * unsure. Our test is to classify every individual
1225 * sequence in NCBI NR and NT (or equiv large messy
1226 * sequence database) with no false positives, only correct
1227 * calls or <eslUNKNOWN>.
1228 *
1229 * Returns: <eslOK> on success, and <*ret_type> is set to
1230 * <eslAMINO>, <eslRNA>, or <eslDNA>.
1231 *
1232 * Returns <eslENOALPHABET> if unable to determine the
1233 * alphabet type; in this case, <*ret_type> is set to
1234 * <eslUNKNOWN>.
1235 *
1236 * Notes: As of Jan 2011:
1237 * nr 10M seqs : 6999 unknown, 0 misclassified
1238 * Pfam full 13M seqs : 7930 unknown, 0 misclassified
1239 * Pfam seed 500K seqs: 366 unknown, 0 misclassified
1240 * trembl 14M seqs : 7748 unknown, 0 misclassified
1241 *
1242 * nt 10M seqs : 35620 unknown, 0 misclassified
1243 * Rfam full 3M seqs : 8146 unknown, 0 misclassified
1244 * Rfam seed 27K seqs : 49 unknown, 0 misclassified
1245 *
1246 * xref: esl_alphabet_example3 collects per-sequence classification
1247 * 2012/0201-easel-guess-alphabet
1248 * J1/62; 2007-0517-easel-guess-alphabet
1249 */
1250 int
esl_abc_GuessAlphabet(const int64_t * ct,int * ret_type)1251 esl_abc_GuessAlphabet(const int64_t *ct, int *ret_type)
1252 {
1253 int type = eslUNKNOWN;
1254 char aaonly[] = "EFIJLOPQZ";
1255 char allcanon[] = "ACG";
1256 char aacanon[] = "DHKMRSVWY";
1257 int64_t n1, n2, n3, nn, nt, nu, nx, n; /* n's are counts */
1258 int x1, x2, x3, xn, xt, xu; /* x's are how many different residues are represented */
1259 int i, x;
1260
1261 x1 = x2 = x3 = xn = xt = xu = 0;
1262 n1 = n2 = n3 = n = 0;
1263 for (i = 0; i < 26; i++) n += ct[i];
1264 for (i = 0; aaonly[i] != '\0'; i++) { x = ct[aaonly[i] - 'A']; if (x > 0) { n1 += x; x1++; } }
1265 for (i = 0; allcanon[i] != '\0'; i++) { x = ct[allcanon[i] - 'A']; if (x > 0) { n2 += x; x2++; } }
1266 for (i = 0; aacanon[i] != '\0'; i++) { x = ct[aacanon[i] - 'A']; if (x > 0) { n3 += x; x3++; } }
1267 nt = ct['T' - 'A']; xt = (nt ? 1 : 0);
1268 nu = ct['U' - 'A']; xu = (nu ? 1 : 0);
1269 nx = ct['X' - 'A'];
1270 nn = ct['N' - 'A']; xn = (nn ? 1 : 0);
1271
1272 if (n <= 10) type = eslUNKNOWN; // small sample, don't guess
1273 else if (n > 2000 && nn == n) type = eslDNA; // special case of many N's leading a genome assembly
1274 else if (n1 > 0) type = eslAMINO; // contains giveaway, aa-only chars
1275 else if (n-(n2+nt+nn) <= 0.02*n && x2+xt == 4) type = eslDNA; // nearly all DNA canon (or N), all four seen
1276 else if (n-(n2+nu+nn) <= 0.02*n && x2+xu == 4) type = eslRNA; // nearly all RNA canon (or N), all four seen
1277 else if (n-(n1+n2+n3+nn+nt+nx) <= 0.02*n && n3>n2 && x1+x2+x3+xn+xt >= 15) type = eslAMINO; // nearly all aa canon (or X); more aa canon than ambig; all 20 seen
1278
1279 *ret_type = type;
1280 if (type == eslUNKNOWN) return eslENOALPHABET;
1281 else return eslOK;
1282 }
1283
1284
1285
1286 /* Function: esl_abc_Match()
1287 * Synopsis: Returns the probability that two symbols match.
1288 *
1289 * Purpose: Given two digital symbols <x> and <y> in alphabet
1290 * <abc>, calculate and return the probability that
1291 * <x> and <y> match, taking degenerate residue codes
1292 * into account.
1293 *
1294 * If <p> residue probability vector is NULL, the
1295 * calculation is a simple average. For example, for DNA,
1296 * R/A gives 0.5, C/N gives 0.25, N/R gives 0.25, R/R gives
1297 * 0.5.
1298 *
1299 * If <p> residue probability vector is non-NULL, it gives
1300 * a 0..K-1 array of background frequencies, and the
1301 * returned match probability is an expectation (weighted
1302 * average) given those residue frequencies.
1303 *
1304 * <x> and <y> should only be residue codes. Any other
1305 * comparison, including comparisons involving gap or
1306 * missing data characters, or even comparisons involving
1307 * illegal digital codes, returns 0.0.
1308 *
1309 * Note that comparison of residues from "identical"
1310 * sequences (even a self-comparison) will not result in an
1311 * identity of 1.0, if the sequence(s) contain degenerate
1312 * residue codes.
1313 *
1314 * Args: abc - digtal alphabet to use
1315 * x,y - two symbols to compare
1316 * p - NULL, or background probabilities of the
1317 * canonical residues in this alphabet [0..K-1]
1318 *
1319 * Returns: the probability of an identity (match) between
1320 * residues <x> and <y>.
1321 */
1322 double
esl_abc_Match(const ESL_ALPHABET * abc,ESL_DSQ x,ESL_DSQ y,double * p)1323 esl_abc_Match(const ESL_ALPHABET *abc, ESL_DSQ x, ESL_DSQ y, double *p)
1324 {
1325 int i;
1326 double prob;
1327 double sx, sy;
1328
1329 /* Easy cases */
1330 if (esl_abc_XIsCanonical(abc, x) && esl_abc_XIsCanonical(abc, y))
1331 {
1332 if (x==y) return 1.0; else return 0.0;
1333 }
1334 if ( ! esl_abc_XIsResidue(abc, x) || ! esl_abc_XIsResidue(abc, x)) return 0.0;
1335
1336 /* Else, we have at least one degenerate residue, so calc an average or expectation.
1337 */
1338 if (p != NULL)
1339 {
1340 prob = sx = sy = 0.;
1341 for (i = 0; i < abc->K; i++)
1342 {
1343 if (abc->degen[(int)x][i]) sx += p[i];
1344 if (abc->degen[(int)y][i]) sy += p[i];
1345 if (abc->degen[(int)x][i] && abc->degen[(int)y][i]) prob += p[i] * p[i];
1346 }
1347 prob = prob / (sx*sy);
1348 }
1349 else
1350 {
1351 double uniformp = 1. / (double) abc->K;
1352 prob = sx = sy = 0.;
1353 for (i = 0; i < abc->K; i++)
1354 {
1355 if (abc->degen[(int)x][i]) sx += uniformp;
1356 if (abc->degen[(int)y][i]) sy += uniformp;
1357 if (abc->degen[(int)x][i] && abc->degen[(int)y][i]) prob += uniformp * uniformp;
1358 }
1359 prob = prob / (sx*sy);
1360 }
1361 return prob;
1362 }
1363
1364
1365
1366 /* Function: esl_abc_IAvgScore()
1367 * Synopsis: Returns average score for degenerate residue.
1368 *
1369 * Purpose: Given a residue code <x> in alphabet <a>, and an array of
1370 * integer scores <sc> for the residues in the base
1371 * alphabet, calculate and return the average score
1372 * (rounded to nearest integer).
1373 *
1374 * <x> would usually be a degeneracy code, but it
1375 * may also be a canonical residue. It must not
1376 * be a gap, missing data, or illegal symbol; if it
1377 * is, these functions return a score of 0 without
1378 * raising an error.
1379 *
1380 * <esl_abc_FAvgScore()> and <esl_abc_DAvgScore()> do the
1381 * same, but for float and double scores instead of integers
1382 * (and for real-valued scores, no rounding is done).
1383 *
1384 * Args: a - digital alphabet to use
1385 * x - a symbol to score
1386 * sc - score vector for canonical residues [0..K-1]
1387 *
1388 * Returns: average score for symbol <x>
1389 */
1390 int
esl_abc_IAvgScore(const ESL_ALPHABET * a,ESL_DSQ x,const int * sc)1391 esl_abc_IAvgScore(const ESL_ALPHABET *a, ESL_DSQ x, const int *sc)
1392 {
1393 float result = 0.;
1394 int i;
1395
1396 if (! esl_abc_XIsResidue(a, x)) return 0;
1397 for (i = 0; i < a->K; i++)
1398 if (a->degen[(int) x][i]) result += (float) sc[i];
1399 result /= (float) a->ndegen[(int) x];
1400 if (result < 0) return (int) (result - 0.5);
1401 else return (int) (result + 0.5);
1402 }
1403 float
esl_abc_FAvgScore(const ESL_ALPHABET * a,ESL_DSQ x,const float * sc)1404 esl_abc_FAvgScore(const ESL_ALPHABET *a, ESL_DSQ x, const float *sc)
1405 {
1406 float result = 0.;
1407 int i;
1408
1409 if (! esl_abc_XIsResidue(a, x)) return 0.;
1410 for (i = 0; i < a->K; i++)
1411 if (a->degen[(int) x][i]) result += sc[i];
1412 result /= (float) a->ndegen[(int) x];
1413 return result;
1414 }
1415 double
esl_abc_DAvgScore(const ESL_ALPHABET * a,ESL_DSQ x,const double * sc)1416 esl_abc_DAvgScore(const ESL_ALPHABET *a, ESL_DSQ x, const double *sc)
1417 {
1418 double result = 0.;
1419 int i;
1420
1421 if (! esl_abc_XIsResidue(a, x)) return 0.;
1422 for (i = 0; i < a->K; i++)
1423 if (a->degen[(int) x][i]) result += sc[i];
1424 result /= (double) a->ndegen[(int) x];
1425 return result;
1426 }
1427
1428
1429 /* Function: esl_abc_IExpectScore()
1430 * Synopsis: Returns expected score for degenerate residue.
1431 *
1432 * Purpose: Given a residue code <x> in alphabet <a>, an
1433 * array of integer scores <sc> for the residues in the base
1434 * alphabet, and background frequencies <p> for the
1435 * occurrence frequencies of the residues in the base
1436 * alphabet, calculate and return the expected score
1437 * (weighted by the occurrence frequencies <p>).
1438 *
1439 * <x> would usually be a degeneracy code, but it
1440 * may also be a canonical residue. It must not
1441 * be a gap, missing data, or illegal symbol; if it
1442 * is, these functions return a score of 0 without
1443 * raising an error.
1444 *
1445 * <esl_abc_FExpectScore()> and <esl_abc_DExpectScore()> do the
1446 * same, but for float and double scores instead of integers
1447 * (for real-valued scores, no rounding is done).
1448 *
1449 * Args: a - digital alphabet to use
1450 * x - a symbol to score
1451 * sc - score vector for canonical residues [0..K-1]
1452 * p - background prob's of canonicals [0..K-1]
1453 *
1454 * Returns: average score for symbol <x>
1455 */
1456 int
esl_abc_IExpectScore(const ESL_ALPHABET * a,ESL_DSQ x,const int * sc,const float * p)1457 esl_abc_IExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const int *sc, const float *p)
1458 {
1459 float result = 0.;
1460 float denom = 0.;
1461 int i;
1462
1463 if (! esl_abc_XIsResidue(a, x)) return 0;
1464 for (i = 0; i < a->K; i++)
1465 if (a->degen[(int) x][i]) {
1466 result += (float) sc[i] * p[i];
1467 denom += p[i];
1468 }
1469 result /= denom;
1470 if (result < 0) return (int) (result - 0.5);
1471 else return (int) (result + 0.5);
1472 }
1473 float
esl_abc_FExpectScore(const ESL_ALPHABET * a,ESL_DSQ x,const float * sc,const float * p)1474 esl_abc_FExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const float *sc, const float *p)
1475 {
1476 float result = 0.;
1477 float denom = 0.;
1478 int i;
1479
1480 if (! esl_abc_XIsResidue(a, x)) return 0.;
1481 for (i = 0; i < a->K; i++)
1482 if (a->degen[(int) x][i]) {
1483 result += sc[i] * p[i];
1484 denom += p[i];
1485 }
1486 result /= denom;
1487 return result;
1488 }
1489 double
esl_abc_DExpectScore(const ESL_ALPHABET * a,ESL_DSQ x,const double * sc,const double * p)1490 esl_abc_DExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const double *sc, const double *p)
1491 {
1492 double result = 0.;
1493 double denom = 0.;
1494 int i;
1495
1496 if (! esl_abc_XIsResidue(a, x)) return 0.;
1497 for (i = 0; i < a->K; i++)
1498 if (a->degen[(int) x][i]) {
1499 result += sc[i] * p[i];
1500 denom += p[i];
1501 }
1502 result /= denom;
1503 return result;
1504 }
1505
1506 /* Function: esl_abc_IAvgScVec()
1507 * Synopsis: Fill out score vector with average degenerate scores.
1508 *
1509 * Purpose: Given an alphabet <a> and a score vector <sc> of length
1510 * <a->Kp> that contains integer scores for the base
1511 * alphabet (<0..a->K-1>), fill out the rest of the score
1512 * vector, calculating average scores for
1513 * degenerate residues using <esl_abc_IAvgScore()>.
1514 *
1515 * The score, if any, for a gap character <K>, the
1516 * nonresidue <Kp-2>, and the missing data character <Kp-1>
1517 * are untouched by this function. Only the degenerate
1518 * scores <K+1..Kp-3> are filled in.
1519 *
1520 * <esl_abc_FAvgScVec()> and <esl_abc_DAvgScVec()> do
1521 * the same, but for score vectors of floats or doubles,
1522 * respectively.
1523 *
1524 * Returns: <eslOK> on success.
1525 */
1526 int
esl_abc_IAvgScVec(const ESL_ALPHABET * a,int * sc)1527 esl_abc_IAvgScVec(const ESL_ALPHABET *a, int *sc)
1528 {
1529 ESL_DSQ x;
1530 for (x = a->K+1; x <= a->Kp-3; x++)
1531 sc[x] = esl_abc_IAvgScore(a, x, sc);
1532 return eslOK;
1533 }
1534 int
esl_abc_FAvgScVec(const ESL_ALPHABET * a,float * sc)1535 esl_abc_FAvgScVec(const ESL_ALPHABET *a, float *sc)
1536 {
1537 ESL_DSQ x;
1538 for (x = a->K+1; x <= a->Kp-3; x++)
1539 sc[x] = esl_abc_FAvgScore(a, x, sc);
1540 return eslOK;
1541 }
1542 int
esl_abc_DAvgScVec(const ESL_ALPHABET * a,double * sc)1543 esl_abc_DAvgScVec(const ESL_ALPHABET *a, double *sc)
1544 {
1545 ESL_DSQ x;
1546 for (x = a->K+1; x <= a->Kp-3; x++)
1547 sc[x] = esl_abc_DAvgScore(a, x, sc);
1548 return eslOK;
1549 }
1550
1551 /* Function: esl_abc_IExpectScVec()
1552 * Synopsis: Fill out score vector with average expected scores.
1553 *
1554 * Purpose: Given an alphabet <a>, a score vector <sc> of length
1555 * <a->Kp> that contains integer scores for the base
1556 * alphabet (<0..a->K-1>), and residue occurrence probabilities
1557 * <p[0..a->K-1]>; fill in the scores for the
1558 * degenerate residues <K+1..Kp-3> using <esl_abc_IExpectScore()>.
1559 *
1560 * The score, if any, for a gap character <K>, the
1561 * nonresidue <Kp-2>, and the missing data character <Kp-1>
1562 * are untouched by this function. Only the degenerate
1563 * scores <K+1..Kp-3> are filled in.
1564 *
1565 * <esl_abc_FExpectScVec()> and <esl_abc_DExpectScVec()> do
1566 * the same, but for score vectors of floats or doubles,
1567 * respectively. The probabilities <p> are floats for the
1568 * integer and float versions, and doubles for the double
1569 * version.
1570 *
1571 * Returns: <eslOK> on success.
1572 */
1573 int
esl_abc_IExpectScVec(const ESL_ALPHABET * a,int * sc,const float * p)1574 esl_abc_IExpectScVec(const ESL_ALPHABET *a, int *sc, const float *p)
1575 {
1576 ESL_DSQ x;
1577 for (x = a->K+1; x <= a->Kp-3; x++)
1578 sc[x] = esl_abc_IExpectScore(a, x, sc, p);
1579 return eslOK;
1580 }
1581 int
esl_abc_FExpectScVec(const ESL_ALPHABET * a,float * sc,const float * p)1582 esl_abc_FExpectScVec(const ESL_ALPHABET *a, float *sc, const float *p)
1583 {
1584 ESL_DSQ x;
1585 for (x = a->K+1; x <= a->Kp-3; x++)
1586 sc[x] = esl_abc_FExpectScore(a, x, sc, p);
1587 return eslOK;
1588 }
1589 int
esl_abc_DExpectScVec(const ESL_ALPHABET * a,double * sc,const double * p)1590 esl_abc_DExpectScVec(const ESL_ALPHABET *a, double *sc, const double *p)
1591 {
1592 ESL_DSQ x;
1593 for (x = a->K+1; x <= a->Kp-3; x++)
1594 sc[x] = esl_abc_DExpectScore(a, x, sc, p);
1595 return eslOK;
1596 }
1597
1598
1599 /* Function: esl_abc_FCount()
1600 * Synopsis: Count a degenerate symbol into a count vector.
1601 *
1602 * Purpose: Count a possibly degenerate digital symbol <x> (0..Kp-1)
1603 * into a counts array <ct> for base symbols (0..K-1).
1604 * Assign the symbol a weight of <wt> (often just 1.0).
1605 * The count weight of a degenerate symbol is divided equally
1606 * across the possible base symbols.
1607 *
1608 * <x> can be a gap; if so, <ct> must be allocated 0..K,
1609 * not 0..K-1. If <x> is a missing data symbol, or a nonresidue
1610 * data symbol, nothing is done.
1611 *
1612 * <esl_abc_DCount()> does the same, but for double-precision
1613 * count vectors and weights.
1614 *
1615 * Returns: <eslOK> on success.
1616 */
1617 int
esl_abc_FCount(const ESL_ALPHABET * abc,float * ct,ESL_DSQ x,float wt)1618 esl_abc_FCount(const ESL_ALPHABET *abc, float *ct, ESL_DSQ x, float wt)
1619 {
1620 ESL_DSQ y;
1621
1622 if (esl_abc_XIsCanonical(abc, x) || esl_abc_XIsGap(abc, x))
1623 ct[x] += wt;
1624 else if (esl_abc_XIsMissing(abc, x) || esl_abc_XIsNonresidue(abc, x))
1625 return eslOK;
1626 else
1627 for (y = 0; y < abc->K; y++) {
1628 if (abc->degen[x][y])
1629 ct[y] += wt / (float) abc->ndegen[x];
1630 }
1631 return eslOK;
1632 }
1633 int
esl_abc_DCount(const ESL_ALPHABET * abc,double * ct,ESL_DSQ x,double wt)1634 esl_abc_DCount(const ESL_ALPHABET *abc, double *ct, ESL_DSQ x, double wt)
1635 {
1636 ESL_DSQ y;
1637
1638 if (esl_abc_XIsCanonical(abc, x) || esl_abc_XIsGap(abc, x))
1639 ct[x] += wt;
1640 else if (esl_abc_XIsMissing(abc, x) || esl_abc_XIsNonresidue(abc, x))
1641 return eslOK;
1642 else
1643 for (y = 0; y < abc->K; y++) {
1644 if (abc->degen[x][y])
1645 ct[y] += wt / (double) abc->ndegen[x];
1646 }
1647 return eslOK;
1648 }
1649
1650 /* Function: esl_abc_EncodeType()
1651 * Synopsis: Convert descriptive string to alphabet type code.
1652 *
1653 * Purpose: Convert a string like "amino" or "DNA" to the
1654 * corresponding Easel internal alphabet type code
1655 * such as <eslAMINO> or <eslDNA>; return the code.
1656 *
1657 * Returns: the code, such as <eslAMINO>; if <type> isn't
1658 * recognized, returns <eslUNKNOWN>.
1659 */
1660 int
esl_abc_EncodeType(char * type)1661 esl_abc_EncodeType(char *type)
1662 {
1663 if (strcasecmp(type, "amino") == 0) return eslAMINO;
1664 else if (strcasecmp(type, "rna") == 0) return eslRNA;
1665 else if (strcasecmp(type, "dna") == 0) return eslDNA;
1666 else if (strcasecmp(type, "coins") == 0) return eslCOINS;
1667 else if (strcasecmp(type, "dice") == 0) return eslDICE;
1668 else if (strcasecmp(type, "custom")== 0) return eslNONSTANDARD;
1669 else return eslUNKNOWN;
1670 }
1671
1672 /* Function: esl_abc_EncodeTypeMem()
1673 * Synopsis: Convert memory chunk to alphabet type code
1674 * Incept: SRE, Thu 02 Aug 2018
1675 *
1676 * Purpose: Same as <esl_abc_EncodeType()>, but for a
1677 * non-NUL terminated memory chunk <type> of
1678 * length <n>.
1679 */
1680 int
esl_abc_EncodeTypeMem(char * type,int n)1681 esl_abc_EncodeTypeMem(char *type, int n)
1682 {
1683 if (esl_memstrcmp_case(type, n, "amino")) return eslAMINO;
1684 else if (esl_memstrcmp_case(type, n, "rna")) return eslRNA;
1685 else if (esl_memstrcmp_case(type, n, "dna")) return eslDNA;
1686 else if (esl_memstrcmp_case(type, n, "coins")) return eslCOINS;
1687 else if (esl_memstrcmp_case(type, n, "dice")) return eslDICE;
1688 else if (esl_memstrcmp_case(type, n, "custom")) return eslNONSTANDARD;
1689 else return eslUNKNOWN;
1690 }
1691
1692
1693 /* Function: esl_abc_DecodeType()
1694 * Synopsis: Returns descriptive string for alphabet type code.
1695 *
1696 * Purpose: For diagnostics and other output: given an internal
1697 * alphabet code <type> (<eslRNA>, for example), return
1698 * pointer to an internal string ("RNA", for example).
1699 */
1700 char *
esl_abc_DecodeType(int type)1701 esl_abc_DecodeType(int type)
1702 {
1703 switch (type) {
1704 case eslUNKNOWN: return "unknown";
1705 case eslRNA: return "RNA";
1706 case eslDNA: return "DNA";
1707 case eslAMINO: return "amino";
1708 case eslCOINS: return "coins";
1709 case eslDICE: return "dice";
1710 case eslNONSTANDARD: return "custom";
1711 default: break;
1712 }
1713 esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "no such alphabet type code %d\n", type);
1714 return NULL;
1715 }
1716
1717
1718 /* Function: esl_abc_ValidateSeq()
1719 * Synopsis: Assure that a text sequence can be digitized.
1720 *
1721 * Purpose: Check that sequence <seq> of length <L> can be digitized
1722 * without error; all its symbols are valid in alphabet
1723 * <a>. If so, return <eslOK>. If not, return <eslEINVAL>.
1724 *
1725 * If <a> is <NULL>, we still validate that at least the
1726 * <seq> consists only of ASCII characters.
1727 *
1728 * <errbuf> is either passed as <NULL>, or a pointer to an
1729 * error string buffer allocated by the caller for
1730 * <eslERRBUFSIZE> characters. If <errbuf> is non-NULL, and
1731 * the sequence is invalid, an error message is placed in
1732 * <errbuf>.
1733 *
1734 * Args: a - digital alphabet (or NULL, if unavailable)
1735 * seq - sequence to validate [0..L-1]; NUL-termination unnecessary
1736 * L - length of <seq>
1737 * errbuf - NULL, or ptr to <eslERRBUFSIZE> chars of allocated space
1738 * for an error message.
1739 *
1740 * Returns: <eslOK> if <seq> is valid; <eslEINVAL> if not.
1741 *
1742 * Throws: (no abnormal error conditions).
1743 */
1744 int
esl_abc_ValidateSeq(const ESL_ALPHABET * a,const char * seq,int64_t L,char * errbuf)1745 esl_abc_ValidateSeq(const ESL_ALPHABET *a, const char *seq, int64_t L, char *errbuf)
1746 {
1747 int status;
1748 int64_t i;
1749 int64_t firstpos = -1;
1750 int64_t nbad = 0;
1751
1752 if (errbuf) *errbuf = 0;
1753
1754
1755 if (a) // If we have digital alphabet <a>, it has an <inmap> we can check against
1756 {
1757 for (i = 0; i < L; i++) {
1758 if (! esl_abc_CIsValid(a, seq[i])) {
1759 if (firstpos == -1) firstpos = i;
1760 nbad++;
1761 }
1762 }
1763 }
1764 else // Else, at least validate that the text string is an ASCII text string
1765 {
1766 for (i = 0; i < L; i++) {
1767 if (! isascii(seq[i])) {
1768 if (firstpos == -1) firstpos = i;
1769 nbad++;
1770 }
1771 }
1772 }
1773
1774 if (nbad == 1) ESL_XFAIL(eslEINVAL, errbuf, "invalid char %c at pos %" PRId64, seq[firstpos], firstpos+1);
1775 else if (nbad > 1) ESL_XFAIL(eslEINVAL, errbuf, "%" PRId64 " invalid chars (including %c at pos %" PRId64 ")", nbad, seq[firstpos], firstpos+1);
1776 return eslOK;
1777
1778 ERROR:
1779 return status;
1780 }
1781 /*---------------- end, other API functions ---------------------*/
1782
1783
1784
1785 /*****************************************************************
1786 * 4. Unit tests.
1787 *****************************************************************/
1788 #ifdef eslALPHABET_TESTDRIVE
1789 #include "esl_vectorops.h"
1790
1791 static int
utest_Create(void)1792 utest_Create(void)
1793 {
1794 char msg[] = "esl_alphabet_Create() unit test failed";
1795 int types[] = { eslDNA, eslRNA, eslAMINO, eslCOINS, eslDICE };
1796 int Karr[] = { 4, 4, 20, 2, 6 };
1797 int Kparr[] = { 18, 18, 29, 6, 10 };
1798 int i;
1799 ESL_ALPHABET *a;
1800 ESL_DSQ x;
1801
1802 for (i = 0; i < 3; i++)
1803 {
1804 if ((a = esl_alphabet_Create(types[i])) == NULL) esl_fatal(msg);
1805 if (a->type != types[i]) esl_fatal(msg);
1806 if (a->K != Karr[i]) esl_fatal(msg);
1807 if (a->Kp != Kparr[i]) esl_fatal(msg);
1808 if (strlen(a->sym) != a->Kp) esl_fatal(msg);
1809
1810 x = esl_abc_XGetGap(a);
1811 if (x != a->K) esl_fatal(msg);
1812 if (a->ndegen[x] != 0) esl_fatal(msg);
1813
1814 x = esl_abc_XGetUnknown(a);
1815 if (x != a->Kp-3) esl_fatal(msg);
1816 if (a->ndegen[x] != a->K) esl_fatal(msg);
1817
1818 x = esl_abc_XGetNonresidue(a);
1819 if (x != a->Kp-2) esl_fatal(msg);
1820 if (a->ndegen[x] != 0) esl_fatal(msg);
1821
1822 x = esl_abc_XGetMissing(a);
1823 if (x != a->Kp-1) esl_fatal(msg);
1824 if (a->ndegen[x] != 0) esl_fatal(msg);
1825
1826 esl_alphabet_Destroy(a);
1827 }
1828
1829 /* Thrown errors
1830 */
1831 #ifdef eslTEST_THROWING
1832 if (esl_alphabet_Create(-1) != NULL) esl_fatal(msg);
1833 if (esl_alphabet_Create(eslUNKNOWN) != NULL) esl_fatal(msg);
1834 if (esl_alphabet_Create(eslNONSTANDARD) != NULL) esl_fatal(msg);
1835 #endif
1836
1837 return eslOK;
1838 }
1839
1840 static int
utest_CreateCustom(void)1841 utest_CreateCustom(void)
1842 {
1843 char msg[] = "esl_alphabet_CreateCustom() unit test failed";
1844 ESL_ALPHABET *a;
1845 char *testseq = "AaU-~Z";
1846 ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 0, 15, 20, 26, 23, eslDSQ_SENTINEL };
1847 ESL_DSQ *dsq;
1848
1849 if ((a = esl_alphabet_CreateCustom("ACDEFGHIKLMNPQRSTVWY-BJZX*~", 20, 27)) == NULL) esl_fatal(msg);
1850 if (esl_alphabet_SetEquiv(a, 'O', 'K') != eslOK) esl_fatal(msg); /* read pyrrolysine O as lysine K */
1851 if (esl_alphabet_SetEquiv(a, 'U', 'S') != eslOK) esl_fatal(msg); /* read selenocys U as serine S */
1852 if (esl_alphabet_SetCaseInsensitive(a) != eslOK) esl_fatal(msg); /* allow lower case input */
1853 if (esl_alphabet_SetDegeneracy(a, 'Z', "QE") != eslOK) esl_fatal(msg);
1854
1855 if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
1856 if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
1857
1858 free(dsq);
1859 esl_alphabet_Destroy(a);
1860
1861 #ifdef eslTEST_THROWING
1862 if (esl_alphabet_CreateCustom("ACGT-RYMKSWHBVDN*~", 4, 21) != NULL) esl_fatal(msg); /* Kp mismatches string length */
1863 #endif
1864
1865 return eslOK;
1866 }
1867
1868 static int
utest_SetEquiv(void)1869 utest_SetEquiv(void)
1870 {
1871 char msg[] = "esl_alphabet_SetEquiv() unit test failed";
1872 ESL_ALPHABET *a;
1873 char *testseq = "a1&";
1874 ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 4, 7, eslDSQ_SENTINEL };
1875 ESL_DSQ *dsq;
1876
1877 if ((a = esl_alphabet_CreateCustom("ACGT-N*~", 4, 8)) == NULL) esl_fatal(msg);
1878 if (esl_alphabet_SetEquiv(a, 'a', 'A') != eslOK) esl_fatal(msg);
1879 if (esl_alphabet_SetEquiv(a, '1', '-') != eslOK) esl_fatal(msg);
1880 if (esl_alphabet_SetEquiv(a, '&', '~') != eslOK) esl_fatal(msg);
1881
1882 #ifdef eslTEST_THROWING
1883 if (esl_alphabet_SetEquiv(a, 'G', 'C') != eslEINVAL) esl_fatal(msg); /* sym is already in internal alphabet */
1884 if (esl_alphabet_SetEquiv(a, '2', 'V') != eslEINVAL) esl_fatal(msg); /* c is not in internal alphabet */
1885 #endif
1886
1887 if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
1888 if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
1889 free(dsq);
1890 esl_alphabet_Destroy(a);
1891 return eslOK;
1892 }
1893
1894 static int
utest_SetCaseInsensitive(void)1895 utest_SetCaseInsensitive(void)
1896 {
1897 char msg[] = "esl_alphabet_SetCaseInsensitive() unit test failed";
1898 ESL_ALPHABET *a;
1899 char *testseq = "ACGT";
1900 ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 1, 2, 3, eslDSQ_SENTINEL };
1901 ESL_DSQ *dsq;
1902
1903 if ((a = esl_alphabet_CreateCustom("acgt-n*~", 4, 8)) == NULL) esl_fatal(msg);
1904 if (esl_alphabet_SetCaseInsensitive(a) != eslOK) esl_fatal(msg);
1905 if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
1906 if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
1907 free(dsq);
1908 esl_alphabet_Destroy(a);
1909
1910 #ifdef TEST_THROWING
1911 if ((a = esl_alphabet_CreateCustom("acgt-n*~", 4, 8)) == NULL) esl_fatal(msg);
1912 if (esl_alphabet_SetEquiv(a, 'A', 'c') != eslOK) esl_fatal(msg); /* now input A maps to internal c */
1913 if (esl_alphabet_SetCaseInsensitive(a) != eslECORRUPT) esl_fatal(msg); /* and this fails, can't remap A */
1914 esl_alphabet_Destroy(a);
1915 #endif
1916
1917 return eslOK;
1918 }
1919
1920 static int
utest_SetDegeneracy(void)1921 utest_SetDegeneracy(void)
1922 {
1923 char msg[] = "esl_alphabet_SetDegeneracy() unit test failed";
1924 ESL_ALPHABET *a;
1925 char *testseq = "yrn";
1926 ESL_DSQ expect[] = { eslDSQ_SENTINEL, 6, 5, 7, eslDSQ_SENTINEL };
1927 ESL_DSQ *dsq;
1928 ESL_DSQ x;
1929
1930 if ((a = esl_alphabet_CreateCustom("ACGT-RYN*~", 4, 10)) == NULL) esl_fatal(msg);
1931 if (esl_alphabet_SetDegeneracy(a, 'R', "AG") != eslOK) esl_fatal(msg);
1932 if (esl_alphabet_SetDegeneracy(a, 'Y', "CT") != eslOK) esl_fatal(msg);
1933 if (esl_alphabet_SetCaseInsensitive(a) != eslOK) esl_fatal(msg);
1934
1935 if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
1936 if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
1937
1938 x = esl_abc_DigitizeSymbol(a, 'a'); if (a->ndegen[x] != 1) esl_fatal(msg);
1939 x = esl_abc_DigitizeSymbol(a, 'r'); if (a->ndegen[x] != 2) esl_fatal(msg);
1940 x = esl_abc_DigitizeSymbol(a, 'y'); if (a->ndegen[x] != 2) esl_fatal(msg);
1941 x = esl_abc_DigitizeSymbol(a, 'n'); if (a->ndegen[x] != 4) esl_fatal(msg);
1942
1943 free(dsq);
1944 esl_alphabet_Destroy(a);
1945
1946 #ifdef TEST_THROWING
1947 if ((a = esl_alphabet_CreateCustom("ACGT-RYN*~", 4, 10)) == NULL) esl_fatal(msg);
1948 if (esl_abc_SetDegeneracy(a, 'z', "AC") != eslEINVAL) esl_fatal(msg); /* can't map char not in alphabet */
1949 if (esl_abc_SetDegeneracy(a, 'N', "ACGT") != eslEINVAL) esl_fatal(msg); /* can't remap N */
1950 if (esl_abc_SetDegeneracy(a, 'A', "GT") != eslEINVAL) esl_fatal(msg); /* can't map a nondegen character */
1951 if (esl_abc_SetDegeneracy(a, '-', "GT") != eslEINVAL) esl_fatal(msg); /* ... or a gap... */
1952 if (esl_abc_SetDegeneracy(a, '*', "GT") != eslEINVAL) esl_fatal(msg); /* ... or nonresidues... */
1953 if (esl_abc_SetDegeneracy(a, '~', "GT") != eslEINVAL) esl_fatal(msg); /* ... or missing data. */
1954 if (esl_abc_SetDegeneracy(a, 'R', "XY") != eslEINVAL) esl_fatal(msg); /* can't map to unknown chars... */
1955 if (esl_abc_SetDegeneracy(a, 'R', "YN") != eslEINVAL) esl_fatal(msg); /* ... nor to noncanonical chars... */
1956 esl_alphabet_Destroy(a);
1957 #endif
1958 return eslOK;
1959 }
1960
1961 static int
utest_SetIgnored(void)1962 utest_SetIgnored(void)
1963 {
1964 char msg[] = "esl_alphabet_SetIgnored() unit test failed";
1965 ESL_ALPHABET *a;
1966 char *testseq = "y \trn";
1967 ESL_DSQ expect[] = { eslDSQ_SENTINEL, 6, 5, 15, eslDSQ_SENTINEL };
1968 int L = 5;
1969 ESL_DSQ *dsq;
1970
1971 if ((a = esl_alphabet_Create(eslRNA)) == NULL) esl_fatal(msg);
1972 if (esl_alphabet_SetIgnored(a, " \t") != eslOK) esl_fatal(msg);
1973
1974 if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
1975 if (memcmp(dsq, expect, sizeof(ESL_DSQ) * L) != 0) esl_fatal(msg);
1976 free(dsq);
1977 esl_alphabet_Destroy(a);
1978 return eslOK;
1979 }
1980
1981
1982 static int
utest_Destroy(void)1983 utest_Destroy(void)
1984 {
1985 char msg[] = "esl_alphabet_Destroy() unit test failed";
1986 ESL_ALPHABET *a;
1987
1988 if ((a = esl_alphabet_CreateCustom("ACGT-RYN*~", 4, 10)) == NULL) esl_fatal(msg);
1989 esl_alphabet_Destroy(a);
1990 esl_alphabet_Destroy(NULL); /* should be robust against NULL pointers */
1991 return eslOK;
1992 }
1993
1994 static int
utest_CreateDsq(void)1995 utest_CreateDsq(void)
1996 {
1997 char msg[] = "esl_abc_CreateDsq() unit test failed";
1998 ESL_ALPHABET *a;
1999 char goodseq[] = "ACDEF";
2000 char badseq[] = "1@%34";
2001 ESL_DSQ *dsq;
2002 ESL_DSQ x;
2003
2004 if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
2005
2006 if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
2007 if (dsq[1] != 0 || dsq[2] != 1) esl_fatal(msg); /* spot check */
2008 free(dsq);
2009
2010 if (esl_abc_CreateDsq(a, badseq, &dsq) != eslEINVAL) esl_fatal(msg);
2011 x = esl_abc_XGetUnknown(a);
2012 if (dsq[1] != x || dsq[2] != x) esl_fatal(msg); /* bad chars all X's now, upon failure */
2013 free(dsq);
2014
2015 if (esl_abc_CreateDsq(a, goodseq, NULL) != eslOK) esl_fatal(msg);
2016
2017 esl_alphabet_Destroy(a);
2018 return eslOK;
2019 }
2020
2021 static int
utest_Digitize(void)2022 utest_Digitize(void)
2023 {
2024 char msg[] = "esl_abc_Digitize() unit test failed";
2025 ESL_ALPHABET *a;
2026 char goodseq[] = "ACDEF";
2027 char badseq[] = "1@%34";
2028 ESL_DSQ *dsq;
2029 ESL_DSQ x;
2030 int status;
2031
2032 ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (strlen(goodseq)+2));
2033
2034 if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
2035 esl_abc_Digitize(a, goodseq, dsq);
2036 if (dsq[1] != 0 || dsq[2] != 1) esl_fatal(msg); /* spot check */
2037
2038 esl_abc_Digitize(a, badseq, dsq);
2039 x = esl_abc_XGetUnknown(a);
2040 if (dsq[1] != x || dsq[2] != x) esl_fatal(msg); /* bad chars all X's now, upon failure */
2041
2042 free(dsq);
2043 esl_alphabet_Destroy(a);
2044 return eslOK;
2045
2046 ERROR:
2047 esl_fatal(msg);
2048 return status;
2049 }
2050
2051 static int
utest_Textize(void)2052 utest_Textize(void)
2053 {
2054 char msg[] = "esl_abc_Textize() unit test failed";
2055 ESL_ALPHABET *a;
2056 char goodseq[] = "acdef";
2057 char *newseq;
2058 ESL_DSQ *dsq;
2059 int L;
2060 int status;
2061
2062 L = strlen(goodseq);
2063 ESL_ALLOC(newseq, sizeof(char) * (L+1));
2064 if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
2065 if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
2066 if (esl_abc_Textize(a, dsq, L, newseq ) != eslOK) esl_fatal(msg);
2067 if (strcmp(newseq, "ACDEF") != 0) esl_fatal(msg);
2068 free(dsq);
2069 free(newseq);
2070 esl_alphabet_Destroy(a);
2071 return eslOK;
2072
2073 ERROR:
2074 esl_fatal(msg);
2075 return status;
2076 }
2077
2078 static int
utest_TextizeN(void)2079 utest_TextizeN(void)
2080 {
2081 char msg[] = "esl_abc_TextizeN() unit test failed";
2082 ESL_ALPHABET *a;
2083 char goodseq[] = "acdefrynacdef";
2084 ESL_DSQ *dsq;
2085 ESL_DSQ *dptr;
2086 int W;
2087
2088 if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
2089 if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
2090
2091 dptr = dsq+6; /* points to "r", residue 6 */
2092 W = 5; /* copy/convert 5 residues "rynac" */
2093 if (esl_abc_TextizeN(a, dptr, W, goodseq) != eslOK) esl_fatal(msg);
2094 if (strcmp(goodseq, "RYNACrynacdef") != 0) esl_fatal(msg);
2095
2096 /* test a case where we hit eslDSQ_SENTINEL, and nul-terminate */
2097 dptr = dsq+10; /* points to "c", residue 10 */
2098 W = 20; /* copy/convert remaining residues "cdef" */
2099 if (esl_abc_TextizeN(a, dptr, W, goodseq) != eslOK) esl_fatal(msg);
2100 if (strcmp(goodseq, "CDEF") != 0) esl_fatal(msg);
2101
2102 free(dsq);
2103 esl_alphabet_Destroy(a);
2104 return eslOK;
2105 }
2106
2107 static int
utest_dsqdup(void)2108 utest_dsqdup(void)
2109 {
2110 char msg[] = "esl_abc_dsqdup() unit test failed";
2111 ESL_ALPHABET *a;
2112 char goodseq[] = "ACGt";
2113 ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 1, 2, 3, eslDSQ_SENTINEL };
2114 ESL_DSQ *d1, *d2;
2115 int L;
2116
2117 L = strlen(goodseq);
2118 if ((a = esl_alphabet_Create(eslRNA)) == NULL) esl_fatal(msg);
2119 if (esl_abc_CreateDsq(a, goodseq, &d1) != eslOK) esl_fatal(msg);
2120
2121 if (esl_abc_dsqdup(d1, -1, &d2) != eslOK) esl_fatal(msg);
2122 if (memcmp(d2, expect, sizeof(ESL_DSQ) * (L+2)) != 0) esl_fatal(msg);
2123 free(d2);
2124
2125 if (esl_abc_dsqdup(d1, L, &d2) != eslOK) esl_fatal(msg);
2126 if (memcmp(d2, expect, sizeof(ESL_DSQ) * (L+2)) != 0) esl_fatal(msg);
2127 free(d2);
2128
2129 free(d1);
2130 esl_alphabet_Destroy(a);
2131 return eslOK;
2132 }
2133
2134 static int
utest_dsqcat(void)2135 utest_dsqcat(void)
2136 {
2137 char msg[] = "esl_abc_dsqcat() unit test failed";
2138 ESL_ALPHABET *a;
2139 char goodseq[] = "ACGt";
2140 char addseq[] = "RYM KN";
2141 char badseq[] = "RYM K&";
2142 ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 1, 2, 3, 5, 6, 7, 8, 15, eslDSQ_SENTINEL };
2143 ESL_DSQ *dsq;
2144 int64_t L1;
2145 esl_pos_t L2;
2146
2147
2148 if ((a = esl_alphabet_Create(eslRNA)) == NULL) esl_fatal(msg);
2149 a->inmap[0] = esl_abc_XGetUnknown(a);
2150 a->inmap[' '] = eslDSQ_IGNORED;
2151
2152 L1 = strlen(goodseq);
2153 L2 = strlen(addseq);
2154 if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
2155 if (esl_abc_dsqcat(a->inmap, &dsq, &L1, addseq, L2) != eslOK) esl_fatal(msg);
2156 if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2)) != 0) esl_fatal(msg);
2157 free(dsq);
2158
2159 L1 = -1;
2160 L2 = -1;
2161 if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
2162 if (esl_abc_dsqcat(a->inmap, &dsq, &L1, addseq, L2) != eslOK) esl_fatal(msg);
2163 if (L1 != esl_abc_dsqlen(dsq)) esl_fatal(msg);
2164 if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2)) != 0) esl_fatal(msg);
2165 free(dsq);
2166
2167 L1 = 0;
2168 dsq = NULL;
2169 if (esl_abc_dsqcat(a->inmap, &dsq, &L1, goodseq, -1) != eslOK) esl_fatal(msg);
2170 if (esl_abc_dsqcat(a->inmap, &dsq, &L1, addseq, -1) != eslOK) esl_fatal(msg);
2171 if (L1 != esl_abc_dsqlen(dsq)) esl_fatal(msg);
2172 if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2)) != 0) esl_fatal(msg);
2173 free(dsq);
2174
2175 L1 = -1;
2176 L2 = strlen(badseq);
2177 if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
2178 if (esl_abc_dsqcat(a->inmap, &dsq, &L1, badseq, L2) != eslEINVAL) esl_fatal(msg);
2179 if (L1 != esl_abc_dsqlen(dsq)) esl_fatal(msg);
2180 if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2)) != 0) esl_fatal(msg);
2181 free(dsq);
2182
2183 esl_alphabet_Destroy(a);
2184 return eslOK;
2185 }
2186
2187 /* dsqlen unit test goes here */
2188 /* dsqrlen unit test goes here */
2189 /* utest_Match goes here */
2190
2191 /* This serves to unit test multiple functions:
2192 * esl_abc_IAvgScore()
2193 * esl_abc_IExpectScore()
2194 */
2195 static int
degeneracy_integer_scores(void)2196 degeneracy_integer_scores(void)
2197 {
2198 char *msg = "degeneracy_integer_scores unit test failed";
2199 ESL_ALPHABET *a;
2200 ESL_DSQ x;
2201 float p[] = {0.4, 0.1, 0.1, 0.4}; /* A/T biased background */
2202 int sc[] = { -1, -6, 6, 1};
2203 int val;
2204
2205 a = esl_alphabet_Create(eslDNA);
2206
2207 x = esl_abc_DigitizeSymbol(a, 'N'); /* any: A/C/G/T */
2208 val = esl_abc_IAvgScore(a, x, sc);
2209 /* average of -1,-6,6,1 = 0 */
2210 if (val != 0) esl_fatal(msg);
2211
2212 x = esl_abc_DigitizeSymbol(a, 'M'); /* M = A/C */
2213 val = esl_abc_IExpectScore(a, x, sc, p);
2214 /* expectation of -1,-6 given p = 0.4,0.1 = -2 */
2215 if (val != -2) esl_fatal(msg);
2216
2217 esl_alphabet_Destroy(a);
2218 return eslOK;
2219 }
2220
2221 /* This serves to unit test multiple functions:
2222 * esl_abc_FAvgScore()
2223 * esl_abc_FExpectScore()
2224 */
2225 static int
degeneracy_float_scores(void)2226 degeneracy_float_scores(void)
2227 {
2228 char *msg = "degeneracy_float_scores unit test failed";
2229 ESL_ALPHABET *a;
2230 ESL_DSQ x;
2231 float p[] = {0.4, 0.1, 0.1, 0.4}; /* A/T biased background */
2232 float sc[] = { -1., -6., 6., 1.};
2233 float val;
2234
2235 a = esl_alphabet_Create(eslRNA);
2236
2237 x = esl_abc_DigitizeSymbol(a, 'N'); /* any: A/C/G/T */
2238 val = esl_abc_FAvgScore(a, x, sc);
2239 /* average of -1,-6,6,1 = 0 */
2240 if (fabs(val - 0.) > 0.0001) esl_fatal(msg);
2241
2242 x = esl_abc_DigitizeSymbol(a, 'M'); /* M = A/C */
2243 val = esl_abc_FExpectScore(a, x, sc, p);
2244 /* expectation of -1,-6 given p = 0.4,0.1 = -2 */
2245 if (fabs(val + 2.) > 0.0001) esl_fatal(msg);
2246
2247 esl_alphabet_Destroy(a);
2248 return eslOK;
2249 }
2250
2251 /* This serves to unit test multiple functions:
2252 * esl_abc_DAvgScore()
2253 * esl_abc_DExpectScore()
2254 */
2255
2256 static int
degeneracy_double_scores(void)2257 degeneracy_double_scores(void)
2258 {
2259 char *msg = "degeneracy_double_scores unit test failed";
2260 ESL_ALPHABET *a;
2261 ESL_DSQ x;
2262 double p[] = {0.4, 0.1, 0.1, 0.4}; /* A/T biased background */
2263 double sc[] = { -1., -6., 6., 1.};
2264 double val;
2265
2266 a = esl_alphabet_Create(eslRNA);
2267
2268 x = esl_abc_DigitizeSymbol(a, 'N'); /* any: A/C/G/T */
2269 val = esl_abc_DAvgScore(a, x, sc);
2270 /* average of -1,-6,6,1 = 0 */
2271 if (fabs(val - 0.) > 0.0001) esl_fatal(msg);
2272
2273 x = esl_abc_DigitizeSymbol(a, 'M'); /* M = A/C */
2274 val = esl_abc_DExpectScore(a, x, sc, p);
2275 /* expectation of -1,-6 given p = 0.4,0.1 = -2 */
2276 if (fabs(val + 2.) > 0.0001) esl_fatal(msg);
2277
2278 esl_alphabet_Destroy(a);
2279 return eslOK;
2280 }
2281
2282 /* utest_IAvgScVec */
2283 /* utest_FAvgScVec */
2284 /* utest_DAvgScVec */
2285 /* utest_IExpectScVec */
2286 /* utest_FExpectScVec */
2287 /* utest_DExpectScVec */
2288
2289 static int
utest_FCount(void)2290 utest_FCount(void)
2291 {
2292 char *msg = "FCount unit test failure";
2293 ESL_ALPHABET *a = NULL;
2294 ESL_DSQ x;
2295 char *teststring = "X~-Z.UAX";
2296 char *s;
2297 int status;
2298
2299 /* 0.1 from 2 X's; U -> +1 C; A -> +1 A; Z-> +0.5 Q,E; ~ ignored; .,- -> +2 gaps */
2300 /* A C D E F G H I K L M N P Q R S T V W Y - */
2301 float expect[21] = { 1.1, 1.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 2.0 };
2302 float *vec;
2303
2304 a = esl_alphabet_Create(eslAMINO);
2305 ESL_ALLOC(vec, sizeof(float) * (a->K+1)); /* include gap char for this test */
2306 esl_vec_FSet(vec, a->K+1, 0.);
2307 for (s = teststring; *s != '\0'; s++)
2308 {
2309 x = esl_abc_DigitizeSymbol(a, *s);
2310 if (esl_abc_FCount(a, vec, x, 1.0) != eslOK) esl_fatal(msg);
2311 }
2312 if (esl_vec_FCompare(vec, expect, a->K+1, 0.0001) != eslOK) esl_fatal(msg);
2313
2314 esl_alphabet_Destroy(a);
2315 free(vec);
2316 return eslOK;
2317
2318 ERROR:
2319 esl_fatal("allocation failed");
2320 return status;
2321 }
2322
2323 static int
utest_DCount(void)2324 utest_DCount(void)
2325 {
2326 char *msg = "DCount unit test failure";
2327 ESL_ALPHABET *a = NULL;
2328 ESL_DSQ x;
2329 char *teststring = "X~-Z.UAX";
2330 char *s;
2331 int status;
2332
2333 /* 0.1 from 2 X's; U -> +1 C; A -> +1 A; Z-> +0.5 Q,E; ~ ignored; .,- -> +2 gaps */
2334 /* A C D E F G H I K L M N P Q R S T V W Y - */
2335 double expect[21] = { 1.1, 1.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 2.0 };
2336 double *vec;
2337
2338 a = esl_alphabet_Create(eslAMINO);
2339 ESL_ALLOC(vec, sizeof(double) * (a->K+1)); /* include gap char for this test */
2340 esl_vec_DSet(vec, a->K+1, 0.);
2341 for (s = teststring; *s != '\0'; s++)
2342 {
2343 x = esl_abc_DigitizeSymbol(a, *s);
2344 if (esl_abc_DCount(a, vec, x, 1.0) != eslOK) esl_fatal(msg);
2345 }
2346 if (esl_vec_DCompare(vec, expect, a->K+1, 0.0001) != eslOK) esl_fatal(msg);
2347
2348 esl_alphabet_Destroy(a);
2349 free(vec);
2350 return eslOK;
2351
2352 ERROR:
2353 esl_fatal("allocation failed");
2354 return status;
2355 }
2356 #endif /* eslALPHABET_TESTDRIVE*/
2357 /*-------------------- end, unit tests --------------------------*/
2358
2359
2360
2361
2362 /*****************************************************************
2363 * 5. Test driver.
2364 *****************************************************************/
2365
2366 /* gcc -g -Wall -std=gnu99 -I. -o esl_alphabet_utest -DeslALPHABET_TESTDRIVE esl_alphabet.c esl_vectorops.c easel.c -lm
2367 * gcc -g -Wall -std=gnu99 -I. -L. -o esl_alphabet_utest -DeslALPHABET_TESTDRIVE esl_alphabet.c -leasel
2368 * ./test
2369 * valgrind ./test
2370 */
2371 #ifdef eslALPHABET_TESTDRIVE
2372 static int basic_examples(void);
2373
2374 #include "easel.h"
2375 #include "esl_alphabet.h"
2376
2377 int
main(void)2378 main(void)
2379 {
2380 utest_Create();
2381 utest_CreateCustom();
2382 utest_SetEquiv();
2383 utest_SetCaseInsensitive();
2384 utest_SetDegeneracy();
2385 utest_SetIgnored();
2386 utest_Destroy();
2387
2388 utest_CreateDsq();
2389 utest_Digitize();
2390 utest_Textize();
2391 utest_TextizeN();
2392 utest_dsqdup();
2393 utest_dsqcat();
2394
2395 utest_FCount();
2396 utest_DCount();
2397
2398 basic_examples();
2399 degeneracy_integer_scores();
2400 degeneracy_float_scores();
2401 degeneracy_double_scores();
2402
2403 return eslOK;
2404 }
2405
2406 static int
basic_examples(void)2407 basic_examples(void)
2408 {
2409 char *msg = "basic alphabet example tests failed";
2410 ESL_ALPHABET *a1, *a2;
2411 char dnaseq[] = "GARYtcN";
2412 char aaseq[] = "EFILqzU";
2413 int L;
2414 ESL_DSQ *dsq, *dsq2;
2415 int i;
2416
2417 /* Example 1.
2418 * Create a DNA alphabet; digitize a DNA sequence.
2419 */
2420 if ((a1 = esl_alphabet_Create(eslDNA)) == NULL) esl_fatal(msg);
2421 L = strlen(dnaseq);
2422 if ((dsq = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL) esl_fatal(msg);
2423 if (esl_abc_Digitize(a1, dnaseq, dsq) != eslOK) esl_fatal(msg);
2424 if (esl_abc_dsqlen(dsq) != L) esl_fatal(msg);
2425 esl_alphabet_Destroy(a1);
2426
2427 /* Example 2.
2428 * Create an RNA alphabet; digitize the same DNA sequence;
2429 * make sure it is equal to the dsq above (so T=U were
2430 * correctly synonymous on input).
2431 */
2432 if ((a2 = esl_alphabet_Create(eslRNA)) == NULL) esl_fatal(msg);
2433 if ((dsq2 = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL) esl_fatal(msg);
2434 if (esl_abc_Digitize(a2, dnaseq, dsq2) != eslOK) esl_fatal(msg);
2435 for (i = 1; i <= L; i++)
2436 if (dsq[i] != dsq2[i]) esl_fatal(msg);
2437 esl_alphabet_Destroy(a2);
2438
2439 /* Example 3.
2440 * Create an amino alphabet; digitize a protein sequence,
2441 * while reusing memory already allocated in dsq.
2442 */
2443 if ((a1 = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
2444 if (esl_abc_Digitize(a1, aaseq, dsq) != eslOK) esl_fatal(msg);
2445
2446 /* Example 4.
2447 * Create a custom alphabet almost the same as the amino
2448 * acid alphabet; digitize the same protein seq, reusing
2449 * memory in dsq2; check that seqs are identical.
2450 */
2451 if ((a2 = esl_alphabet_CreateCustom("ACDEFGHIKLMNPQRSTVWY-BJZOUX~", 20, 28)) == NULL) esl_fatal(msg);
2452 if (esl_alphabet_SetCaseInsensitive(a2) != eslOK) esl_fatal(msg); /* allow lower case input */
2453 if (esl_alphabet_SetDegeneracy(a2, 'Z', "QE") != eslOK) esl_fatal(msg);
2454
2455 if (esl_abc_Digitize(a2, aaseq, dsq2) != eslOK) esl_fatal(msg);
2456 for (i = 1; i <= L; i++)
2457 if (dsq[i] != dsq2[i]) esl_fatal(msg);
2458
2459 /* clean up.
2460 */
2461 esl_alphabet_Destroy(a1);
2462 esl_alphabet_Destroy(a2);
2463 free(dsq);
2464 free(dsq2);
2465 return eslOK;
2466 }
2467
2468
2469 #endif /*eslALPHABET_TESTDRIVE*/
2470
2471 /*****************************************************************
2472 * 6. Examples.
2473 *****************************************************************/
2474
2475 /* gcc -g -Wall -o example -I. -DeslALPHABET_EXAMPLE esl_alphabet.c easel.c
2476 */
2477 #ifdef eslALPHABET_EXAMPLE
2478 /*::cexcerpt::alphabet_example::begin::*/
2479 #include "easel.h"
2480 #include "esl_alphabet.h"
main(void)2481 int main(void)
2482 {
2483 ESL_ALPHABET *a;
2484 char dnaseq[] = "GARYTC";
2485 int L = 6;
2486 ESL_DSQ *dsq;
2487
2488 a = esl_alphabet_Create(eslDNA);
2489
2490 if ((dsq = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL) esl_fatal("malloc failed");
2491 if (esl_abc_Digitize(a, dnaseq, dsq) != eslOK) esl_fatal("failed to digitize the sequence");
2492
2493 free(dsq);
2494 esl_alphabet_Destroy(a);
2495 return 0;
2496 }
2497 /*::cexcerpt::alphabet_example::end::*/
2498 #endif /*eslALPHABET_EXAMPLE*/
2499
2500
2501 /* gcc -g -Wall -o example -I. -DeslALPHABET_EXAMPLE2 esl_alphabet.c easel.c
2502 */
2503 #ifdef eslALPHABET_EXAMPLE2
2504 /*::cexcerpt::alphabet_example2::begin::*/
2505 #include "easel.h"
2506 #include "esl_alphabet.h"
main(void)2507 int main(void)
2508 {
2509 ESL_ALPHABET *a;
2510
2511 /* 1. Create the base alphabet structure. */
2512 a = esl_alphabet_CreateCustom("ACDEFGHIKLMNOPQRSTUVWY-BJZX~", 22, 28);
2513
2514 /* 2. Set your equivalences in the input map. */
2515 esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap character too */
2516
2517 /* 3. After all synonyms are set, (optionally) make map case-insensitive. */
2518 esl_alphabet_SetCaseInsensitive(a); /* allow lower case input too */
2519
2520 /* 4. Define your optional degeneracy codes in the alphabet, one at a time.
2521 * The 'any' character X was automatically set up. */
2522 esl_alphabet_SetDegeneracy(a, 'B', "DN"); /* read B as {D|N} */
2523 esl_alphabet_SetDegeneracy(a, 'J', "IL"); /* read B as {I|L} */
2524 esl_alphabet_SetDegeneracy(a, 'Z', "QE"); /* read Z as {Q|E} */
2525
2526 /* 5. (do your stuff) */
2527
2528 /* 6. Remember to free it when you're done with it. */
2529 esl_alphabet_Destroy(a);
2530 return 0;
2531 }
2532 /*::cexcerpt::alphabet_example2::end::*/
2533 #endif /*eslALPHABET_EXAMPLE2*/
2534
2535
2536
2537 #ifdef eslALPHABET_EXAMPLE3
2538 #include "easel.h"
2539 #include "esl_alphabet.h"
2540 #include "esl_sq.h"
2541 #include "esl_sqio.h"
2542
2543 int
main(int argc,char ** argv)2544 main(int argc, char **argv)
2545 {
2546 ESL_SQ *sq = esl_sq_Create();
2547 ESL_SQFILE *sqfp;
2548 int format = eslSQFILE_UNKNOWN;
2549 char *seqfile = argv[1];
2550 int type;
2551 int status;
2552
2553 status = esl_sqfile_Open(seqfile, format, NULL, &sqfp);
2554 if (status == eslENOTFOUND) esl_fatal("No such file.");
2555 else if (status == eslEFORMAT) esl_fatal("Format couldn't be determined.");
2556 else if (status != eslOK) esl_fatal("Open failed, code %d.", status);
2557
2558 while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
2559 {
2560 esl_sq_GuessAlphabet(sq, &type);
2561 printf("%-25s %s\n", sq->name, esl_abc_DecodeType(type));
2562 esl_sq_Reuse(sq);
2563 }
2564 if (status == eslEFORMAT) esl_fatal("Parse failed\n %s", esl_sqfile_GetErrorBuf(sqfp));
2565 else if (status != eslEOF) esl_fatal("Unexpected read error %d", status);
2566
2567 esl_sq_Destroy(sq);
2568 esl_sqfile_Close(sqfp);
2569 return 0;
2570 }
2571 #endif /*eslALPHABET_EXAMPLE3*/
2572
2573
2574
2575
2576