1 /* Implements the standard digitized alphabets for biosequences.
2  *
3  *    1. ESL_ALPHABET object for digital alphabets.
4  *    2. Digitized sequences (ESL_DSQ *).
5  *    3. Other routines in the API.
6  *    4. Unit tests.
7  *    5. Test driver.
8  *    6. Examples.
9  */
10 #include "esl_config.h"
11 
12 #include <stdlib.h>
13 #include <string.h>
14 #include <ctype.h>
15 #include <math.h>
16 #ifdef HAVE_STRINGS_H
17 #include <strings.h>		/* POSIX strcasecmp() */
18 #endif
19 
20 #include "easel.h"
21 #include "esl_mem.h"
22 
23 #include "esl_alphabet.h"
24 
25 
26 
27 /*****************************************************************
28  * 1. The ESL_ALPHABET object
29  *****************************************************************/
30 
31 static ESL_ALPHABET *create_rna(void);
32 static ESL_ALPHABET *create_dna(void);
33 static ESL_ALPHABET *create_amino(void);
34 static ESL_ALPHABET *create_coins(void);
35 static ESL_ALPHABET *create_dice(void);
36 
37 static int set_complementarity(ESL_ALPHABET *a);
38 
39 /* Function:  esl_alphabet_Create()
40  * Synopsis:  Create alphabet of a standard type.
41  *
42  * Purpose:   Creates one of the three standard bio alphabets:
43  *            <eslDNA>, <eslRNA>, or <eslAMINO>, and returns
44  *            a pointer to it.
45  *
46  * Args:      type  - <eslDNA>, <eslRNA>, or <eslAMINO>.
47  *
48  * Returns:   pointer to the new alphabet.
49  *
50  * Throws:    <NULL> if any allocation or initialization fails.
51  */
52 ESL_ALPHABET *
esl_alphabet_Create(int type)53 esl_alphabet_Create(int type)
54 {
55   ESL_ALPHABET *a = NULL;
56 
57   switch(type) {
58   case eslRNA:    a = create_rna();   break;
59   case eslDNA:    a = create_dna();   break;
60   case eslAMINO:  a = create_amino(); break;
61   case eslCOINS:  a = create_coins(); break;
62   case eslDICE:   a = create_dice();  break;
63   default:        esl_fatal("bad alphabet type: unrecognized");  // violation: must be a code error, not user.
64   }
65   return a;
66 }
67 
68 /* Function:  esl_alphabet_CreateCustom()
69  * Synopsis:  Create a custom alphabet.
70  *
71  * Purpose:   Creates a customized biosequence alphabet,
72  *            and returns a ptr to it. The alphabet type is set
73  *            to <eslNONSTANDARD>.
74  *
75  *            <alphabet> is the internal alphabet string;
76  *            <K> is the size of the base alphabet;
77  *            <Kp> is the total size of the alphabet string.
78  *
79  *            In the alphabet string, residues <0..K-1> are the base alphabet;
80  *            residue <K> is the canonical gap (indel) symbol;
81  *            residues <K+1..Kp-4> are additional degeneracy symbols (possibly 0 of them);
82  *            residue <Kp-3> is an "any" symbol (such as N or X);
83  *            residue <Kp-2> is a "nonresidue" symbol (such as *);
84  *            and residue <Kp-1> is a "missing data" gap symbol.
85  *
86  *            The two gap symbols, the nonresidue, and the "any"
87  *            symbol are mandatory even for nonstandard alphabets, so
88  *            <Kp> $\geq$ <K+4>.
89  *
90  * Args:      alphabet - internal alphabet; example "ACGT-RYMKSWHBVDN*~"
91  *            K        - base size; example 4
92  *            Kp       - total size, including gap, degeneracies; example 18
93  *
94  * Returns:   pointer to new <ESL_ALPHABET> structure.
95  *
96  * Throws:    <NULL> if any allocation or initialization fails.
97  */
98 ESL_ALPHABET *
esl_alphabet_CreateCustom(const char * alphabet,int K,int Kp)99 esl_alphabet_CreateCustom(const char *alphabet, int K, int Kp)
100 {
101   ESL_ALPHABET *a = NULL;
102   int           c,x,y;
103   int           status;
104 
105   /* Argument checks.
106    */
107   if (strlen(alphabet) != Kp) ESL_XEXCEPTION(eslEINVAL, "alphabet length != Kp");
108   if (Kp < K+4)               ESL_XEXCEPTION(eslEINVAL, "Kp too small in alphabet");
109 
110   /* Allocation/init, level 1.
111    */
112   ESL_ALLOC(a, sizeof(ESL_ALPHABET));
113   a->sym        = NULL;
114   a->degen      = NULL;
115   a->ndegen     = NULL;
116   a->complement = NULL;
117 
118   /* Allocation/init, level 2.
119    */
120   ESL_ALLOC(a->sym,    sizeof(char)   * (Kp+1));
121   ESL_ALLOC(a->ndegen, sizeof(int)    * Kp);
122   ESL_ALLOC(a->degen,  sizeof(char *) * Kp);
123   a->degen[0] = NULL;
124 
125   /* Allocation/init, level 3.
126    */
127   ESL_ALLOC(a->degen[0], sizeof(char) * (Kp*K));
128   for (x = 1; x < Kp; x++)
129     a->degen[x] = a->degen[0]+(K*x);
130 
131   /* Initialize the internal alphabet:
132    */
133   a->type = eslNONSTANDARD;
134   a->K    = K;
135   a->Kp   = Kp;
136   strcpy(a->sym, alphabet);
137 
138   /* Initialize the input map, mapping ASCII seq chars to digital codes,
139    * and eslDSQ_ILLEGAL for everything else.
140    */
141   for (c = 0; c < 128; c++)   a->inmap[c]               = eslDSQ_ILLEGAL;
142   for (x = 0; x < a->Kp; x++) a->inmap[(int) a->sym[x]] = x;
143 
144   /* Initialize the degeneracy map:
145    *  Base alphabet (first K syms) are automatically
146    *  mapped uniquely; (Kp-3) is assumed to be
147    *  the "any" character; other degen chars (K+1..Kp-4) are
148    *  unset; gap, nonresidue, missing character are unmapped (ndegen=0)
149    */
150   for (x = 0; x < a->Kp; x++)  	/* clear everything */
151     {
152       a->ndegen[x] = 0;
153       for (y = 0; y < a->K; y++) a->degen[x][y] = 0;
154     }
155   for (x = 0; x < a->K; x++) 	/* base alphabet */
156     {
157       a->ndegen[x]   = 1;
158       a->degen[x][x] = 1;
159     }
160                                 /* "any" character */
161   a->ndegen[Kp-3]  = K;
162   for (x = 0; x < a->K; x++) a->degen[Kp-3][x] = 1;
163 
164   return a;
165 
166  ERROR:
167   esl_alphabet_Destroy(a);
168   return NULL;
169 }
170 
171 
172 /* create_rna():
173  * Creates a standard RNA alphabet.
174  */
175 static ESL_ALPHABET *
create_rna(void)176 create_rna(void)
177 {
178   ESL_ALPHABET *a = NULL;
179   int           status;
180 
181   /* Create the fundamental alphabet
182    */
183   if ((a = esl_alphabet_CreateCustom("ACGU-RYMKSWHBVDN*~", 4, 18)) == NULL) return NULL;
184   a->type = eslRNA;
185 
186   /* Add desired synonyms in the input map.
187    */
188   esl_alphabet_SetEquiv(a, 'T', 'U');	    /* read T as a U */
189   esl_alphabet_SetEquiv(a, 'X', 'N');	    /* read X as an N (many seq maskers use X) */
190   esl_alphabet_SetEquiv(a, 'I', 'A');       /* Inosine is a deaminated Adenosine, appears in some RNACentral sequences */
191   esl_alphabet_SetEquiv(a, '_', '-');       /* allow _ as a gap too */
192   esl_alphabet_SetEquiv(a, '.', '-');       /* allow . as a gap too */
193   esl_alphabet_SetCaseInsensitive(a);       /* allow lower case input */
194 
195   /* Define degenerate symbols.
196    */
197   esl_alphabet_SetDegeneracy(a, 'R', "AG");
198   esl_alphabet_SetDegeneracy(a, 'Y', "CU");
199   esl_alphabet_SetDegeneracy(a, 'M', "AC");
200   esl_alphabet_SetDegeneracy(a, 'K', "GU");
201   esl_alphabet_SetDegeneracy(a, 'S', "CG");
202   esl_alphabet_SetDegeneracy(a, 'W', "AU");
203   esl_alphabet_SetDegeneracy(a, 'H', "ACU");
204   esl_alphabet_SetDegeneracy(a, 'B', "CGU");
205   esl_alphabet_SetDegeneracy(a, 'V', "ACG");
206   esl_alphabet_SetDegeneracy(a, 'D', "AGU");
207 
208   if ( (status = set_complementarity(a)) != eslOK) goto ERROR;
209 
210   return a;
211 
212  ERROR:
213   esl_alphabet_Destroy(a);
214   return NULL;
215 }
216 
217 
218 /* create_dna():
219  * creates and returns a standard DNA alphabet.
220  */
221 static ESL_ALPHABET *
create_dna(void)222 create_dna(void)
223 {
224   ESL_ALPHABET *a = NULL;
225   int           status;
226 
227   /* Create the fundamental alphabet.
228    */
229   if ((a = esl_alphabet_CreateCustom("ACGT-RYMKSWHBVDN*~", 4, 18)) == NULL) return NULL;
230   a->type = eslDNA;
231 
232   /* Add desired synonyms in the input map.
233    */
234   esl_alphabet_SetEquiv(a, 'U', 'T');	    /* read U as a T */
235   esl_alphabet_SetEquiv(a, 'X', 'N');	    /* read X as an N (many seq maskers use X) */
236   esl_alphabet_SetEquiv(a, 'I', 'A');       /* Inosine is a deaminated Adenosine, appears in some RNACentral sequences */
237   esl_alphabet_SetEquiv(a, '_', '-');       /* allow _ as a gap too */
238   esl_alphabet_SetEquiv(a, '.', '-');       /* allow . as a gap too */
239   esl_alphabet_SetCaseInsensitive(a);       /* allow lower case input */
240 
241   /* Define IUBMB degenerate symbols other than the N.
242    */
243   esl_alphabet_SetDegeneracy(a, 'R', "AG");
244   esl_alphabet_SetDegeneracy(a, 'Y', "CT");
245   esl_alphabet_SetDegeneracy(a, 'M', "AC");
246   esl_alphabet_SetDegeneracy(a, 'K', "GT");
247   esl_alphabet_SetDegeneracy(a, 'S', "CG");
248   esl_alphabet_SetDegeneracy(a, 'W', "AT");
249   esl_alphabet_SetDegeneracy(a, 'H', "ACT");
250   esl_alphabet_SetDegeneracy(a, 'B', "CGT");
251   esl_alphabet_SetDegeneracy(a, 'V', "ACG");
252   esl_alphabet_SetDegeneracy(a, 'D', "AGT");
253 
254   if ( (status = set_complementarity(a)) != eslOK) goto ERROR;
255   return a;
256 
257  ERROR:
258   esl_alphabet_Destroy(a);
259   return NULL;
260 }
261 
262 
263 /* create_amino():
264  * Creates a new standard amino acid alphabet.
265  */
266 static ESL_ALPHABET *
create_amino(void)267 create_amino(void)
268 {
269   ESL_ALPHABET *a = NULL;
270 
271   /* Create the internal alphabet
272    */
273   if ((a = esl_alphabet_CreateCustom("ACDEFGHIKLMNPQRSTVWY-BJZOUX*~", 20, 29)) == NULL) return NULL;
274   a->type = eslAMINO;
275 
276   /* Add desired synonyms in the input map.
277    */
278   esl_alphabet_SetEquiv(a, '_', '-');       /* allow _ as a gap too */
279   esl_alphabet_SetEquiv(a, '.', '-');       /* allow . as a gap too */
280   esl_alphabet_SetCaseInsensitive(a);       /* allow lower case input */
281 
282   /* Define IUPAC degenerate symbols other than the X.
283    */
284   esl_alphabet_SetDegeneracy(a, 'B', "ND");
285   esl_alphabet_SetDegeneracy(a, 'J', "IL");
286   esl_alphabet_SetDegeneracy(a, 'Z', "QE");
287 
288   /* Define unusual residues as one-to-one degeneracies.
289    */
290   esl_alphabet_SetDegeneracy(a, 'U', "C"); /* selenocysteine is scored as cysteine */
291   esl_alphabet_SetDegeneracy(a, 'O', "K"); /* pyrrolysine is scored as lysine      */
292 
293   return a;
294 }
295 
296 
297 /* create_coins():
298  * Creates a toy alphabet for coin examples
299  */
300 static ESL_ALPHABET *
create_coins(void)301 create_coins(void)
302 {
303   ESL_ALPHABET *a = NULL;
304 
305   /* Create the internal alphabet
306    */
307   if ((a = esl_alphabet_CreateCustom("HT-X*~", 2, 6)) == NULL) return NULL;
308   a->type = eslCOINS;
309 
310   /* Add desired synonyms in the input map.
311    */
312   esl_alphabet_SetEquiv(a, '_', '-');       /* allow _ as a gap too */
313   esl_alphabet_SetEquiv(a, '.', '-');       /* allow . as a gap too */
314   esl_alphabet_SetCaseInsensitive(a);       /* allow lower case input */
315 
316   /* There are no degeneracies in the coin alphabet. */
317 
318   return a;
319 }
320 
321 /* create_dice():
322  * Creates a toy alphabet for dice examples
323  */
324 static ESL_ALPHABET *
create_dice(void)325 create_dice(void)
326 {
327   ESL_ALPHABET *a = NULL;
328 
329   /* Create the internal alphabet
330    */
331   if ((a = esl_alphabet_CreateCustom("123456-X*~", 6, 10)) == NULL) return NULL;
332   a->type = eslCOINS;
333 
334   /* Add desired synonyms in the input map.
335    */
336   esl_alphabet_SetEquiv(a, '_', '-');       /* allow _ as a gap too */
337   esl_alphabet_SetEquiv(a, '.', '-');       /* allow . as a gap too */
338   esl_alphabet_SetCaseInsensitive(a);       /* allow lower case input */
339 
340   /* There are no degeneracies in the dice alphabet. */
341 
342   return a;
343 }
344 
345 /* set_complementarity()
346  * Builds the "complement" lookup table for DNA, RNA alphabets.
347  *
348  * Throws <eslEINVAL> if the alphabet isn't <eslDNA> or <eslRNA>.
349  */
350 static int
set_complementarity(ESL_ALPHABET * a)351 set_complementarity(ESL_ALPHABET *a)
352 {
353   int  status;
354 
355   if (a->type != eslRNA && a->type != eslDNA)
356     ESL_EXCEPTION(eslEINVAL, "alphabet isn't nucleic: no complementarity to set");
357 
358   /* We will assume that Kp=18 and sym="ACGT-RYMKSWHBVDN*~" (or RNA equiv).
359    * Bug #h108 happened because routine fell out of sync w/ a change in alphabet.
360    * Don't let that happen again.
361    */
362   ESL_DASSERT1((      a->Kp == 18  ));
363   ESL_DASSERT1(( a->sym[17] == '~' ));
364 
365   ESL_ALLOC(a->complement, sizeof(ESL_DSQ) * a->Kp);
366   a->complement[0] = 3;	   /* A->T */
367   a->complement[1] = 2;    /* C->G */
368   a->complement[2] = 1;    /* G->C */
369   a->complement[3] = 0;    /* T->A */
370   a->complement[4] = 4;    /* -  - */
371   a->complement[5] = 6;	   /* R->Y */
372   a->complement[6] = 5;    /* Y->R */
373   a->complement[7] = 8;    /* M->K */
374   a->complement[8] = 7;    /* K->M */
375   a->complement[9] = 9;    /* S  S */
376   a->complement[10]= 10;   /* W  W */
377   a->complement[11]= 14;   /* H->D */
378   a->complement[12]= 13;   /* B->V */
379   a->complement[13]= 12;   /* V->B */
380   a->complement[14]= 11;   /* D->H */
381   a->complement[15]= 15;   /* N  N */
382   a->complement[16]= 16;   /* *  * */
383   a->complement[17]= 17;   /* ~  ~ */
384 
385   return eslOK;
386 
387  ERROR:
388   return status;
389 }
390 
391 
392 /* Function:  esl_alphabet_SetEquiv()
393  * Synopsis:  Define an equivalent symbol.
394  *
395  * Purpose:   Maps an additional input alphabetic symbol <sym> to
396  *            an internal alphabet symbol <c>; for example,
397  *            we might map T to U for an RNA alphabet, so that we
398  *            allow for reading input DNA sequences.
399  *
400  * Args:      sym   - symbol to allow in the input alphabet; 'T' for example
401  *            c     - symbol to map <sym> to in the internal alphabet; 'U' for example
402  *
403  * Returns:   <eslOK> on success.
404  *
405  * Throws:    <eslEINVAL> if <c> is not in the internal alphabet, or if <sym> is.
406  */
407 int
esl_alphabet_SetEquiv(ESL_ALPHABET * a,char sym,char c)408 esl_alphabet_SetEquiv(ESL_ALPHABET *a, char sym, char c)
409 {
410   char    *sp = NULL;
411   ESL_DSQ  x;
412 
413   /* Contract checks */
414   if ((sp = strchr(a->sym, sym)) != NULL)
415     ESL_EXCEPTION(eslEINVAL, "symbol %c is already in internal alphabet, can't equivalence it", sym);
416   if ((sp = strchr(a->sym, c)) == NULL)
417     ESL_EXCEPTION(eslEINVAL, "char %c not in the alphabet, can't map to it", c);
418 
419   x = sp - a->sym;
420   a->inmap[(int) sym] = x;
421   return eslOK;
422 }
423 
424 /* Function:  esl_alphabet_SetCaseInsensitive()
425  * Synopsis:  Make an alphabet's input map case-insensitive.
426  *
427  * Purpose:   Given a custom alphabet <a>, with all equivalences set,
428  *            make the input map case-insensitive: for every
429  *            letter that is mapped in either lower or upper
430  *            case, map the other case to the same internal
431  *            residue.
432  *
433  *            For the standard alphabets, this is done automatically.
434  *
435  * Args:      a  - alphabet to make case-insensitive.
436  *
437  * Returns:   <eslOK> on success.
438  *
439  * Throws:    <eslECORRUPT> if any lower/uppercase symbol pairs
440  *            are already both mapped to different symbols.
441  */
442 int
esl_alphabet_SetCaseInsensitive(ESL_ALPHABET * a)443 esl_alphabet_SetCaseInsensitive(ESL_ALPHABET *a)
444 {
445   int lc, uc;
446 
447   for (lc = 'a'; lc <= 'z'; lc++)
448     {
449       uc = toupper(lc);
450 
451       if      (esl_abc_CIsValid(a, lc) && ! esl_abc_CIsValid(a, uc)) a->inmap[uc] = a->inmap[lc];
452       else if (esl_abc_CIsValid(a, uc) && ! esl_abc_CIsValid(a, lc)) a->inmap[lc] = a->inmap[uc];
453       else if (esl_abc_CIsValid(a, lc) && esl_abc_CIsValid(a, uc) && a->inmap[uc] != a->inmap[lc])
454 	ESL_EXCEPTION(eslECORRUPT, "symbols %c and %c map differently already (%c vs. %c)",
455 		  lc, uc, a->inmap[lc], a->inmap[uc]);
456     }
457   return eslOK;
458 }
459 
460 /* Function:  esl_alphabet_SetDegeneracy()
461  * Synopsis:  Define degenerate symbol in custom alphabet.
462  *
463  * Purpose:   Given an alphabet under construction,
464  *            define the degenerate character <c> to mean
465  *            any of the characters in the string <ds>.
466  *
467  *            <c> must exist in the digital alphabet, as
468  *            one of the optional degenerate residues (<K+1>..<Kp-3>).
469  *            All the characters in the <ds> string must exist
470  *            in the canonical alphabet (<0>..<K-1>).
471  *
472  *            You may not redefine the mandatory all-degenerate character
473  *            (typically <N> or <X>; <Kp-3> in the digital alphabet).
474  *            It is defined automatically in all alphabets.
475  *
476  * Args:      a   - an alphabet under construction.
477  *            c   - degenerate character code; example: 'R'
478  *            ds  - string of base characters for c; example: "AG"
479  *
480  * Returns:   <eslOK> on success.
481  *
482  * Throws:    <eslEINVAL> if <c> or <ds> arguments aren't valid.
483  */
484 int
esl_alphabet_SetDegeneracy(ESL_ALPHABET * a,char c,char * ds)485 esl_alphabet_SetDegeneracy(ESL_ALPHABET *a, char c, char *ds)
486 {
487   char   *sp;
488   ESL_DSQ x,y;
489 
490   if ((sp = strchr(a->sym, c)) == NULL)
491     ESL_EXCEPTION(eslEINVAL, "no such degenerate character");
492   x = sp - a->sym;
493 
494   /* A degenerate character must have code K+1..Kp-4.
495    * Kp-3, the all-degenerate character, is automatically
496    * created, and can't be remapped.
497    */
498   if (x == a->Kp-3)
499     ESL_EXCEPTION(eslEINVAL, "can't redefine all-degenerate char %c", c);
500   if (x < a->K+1 || x >= a->Kp-2)
501     ESL_EXCEPTION(eslEINVAL, "char %c isn't in expected position in alphabet", c);
502 
503   while (*ds != '\0') {
504     if ((sp = strchr(a->sym, *ds)) == NULL) ESL_EXCEPTION(eslEINVAL, "no such base character");
505     y = sp - a->sym;
506     if (! esl_abc_XIsCanonical(a, y))       ESL_EXCEPTION(eslEINVAL, "can't map degeneracy to noncanonical character");
507 
508     a->degen[x][y] = 1;
509     a->ndegen[x]++;
510     ds++;
511   }
512   return eslOK;
513 }
514 
515 
516 /* Function:  esl_alphabet_SetIgnored()
517  * Synopsis:  Define a set of characters to be ignored in input.
518  *
519  * Purpose:   Given an alphabet <a> (either standard or custom), define
520  *            all the characters in string <ignoredchars> to be
521  *            unmapped: valid, but ignored when converting input text.
522  *
523  *            By default, the standard alphabets do not define any
524  *            ignored characters.
525  *
526  *            The most common ignored characters would be space, tab,
527  *            and digits, to skip silently over whitespace and
528  *            sequence coordinates when parsing loosely-defined
529  *            sequence file formats.
530  *
531  * Args:      a             - alphabet to modify
532  *            ignoredchars  - string listing characters to ignore; i.e. " \t"
533  *
534  * Returns:   <eslOK> on success.
535  */
536 int
esl_alphabet_SetIgnored(ESL_ALPHABET * a,const char * ignoredchars)537 esl_alphabet_SetIgnored(ESL_ALPHABET *a, const char *ignoredchars)
538 {
539   int i;
540   for (i = 0; ignoredchars[i] != '\0'; i++) a->inmap[(int)ignoredchars[i]] = eslDSQ_IGNORED;
541   return eslOK;
542 }
543 
544 
545 /* Function:  esl_alphabet_Sizeof()
546  * Synopsis:  Returns size of an alphabet object, in bytes.
547  *
548  * Purpose:   Returns the size of alphabet <a> object, in bytes.
549  */
550 size_t
esl_alphabet_Sizeof(ESL_ALPHABET * a)551 esl_alphabet_Sizeof(ESL_ALPHABET *a)
552 {
553   size_t n = 0;
554   n += sizeof(ESL_ALPHABET);
555   n += sizeof(char) * a->Kp;	                   /* a->sym        */
556   n += sizeof(char *) * a->Kp;	                   /* a->degen      */
557   n += sizeof(char) * (a->Kp * a->K);              /* a->degen[][]  */
558   n += sizeof(int)  * a->Kp;	                   /* a->ndegen     */
559   if (a->complement) n += sizeof(ESL_DSQ) * a->Kp; /* a->complement */
560   return n;
561 }
562 
563 /* Function:  esl_alphabet_Destroy()
564  * Synopsis:  Frees an alphabet object.
565  *
566  * Purpose:   Free's an <ESL_ALPHABET> structure.
567  *
568  * Args:      a  - the <ESL_ALPHABET> to free.
569  *
570  * Returns:   (void).
571  */
572 void
esl_alphabet_Destroy(ESL_ALPHABET * a)573 esl_alphabet_Destroy(ESL_ALPHABET *a)
574 {
575   if (a)
576     {
577       if (a->sym)    free(a->sym);
578       if (a->ndegen) free(a->ndegen);
579       if (a->degen)
580 	{
581 	  if (a->degen[0]) free(a->degen[0]);
582 	  free(a->degen);
583 	}
584       if (a->complement) free(a->complement);
585       free(a);
586     }
587 }
588 /*--------------- end, ESL_ALPHABET object ----------------------*/
589 
590 
591 
592 
593 
594 /*****************************************************************
595  * 2. Digitized sequences (ESL_DSQ *)
596  *****************************************************************/
597 /* Design note:                 SRE, Mon Sep 18 09:11:41 2006
598  *
599  * An ESL_DSQ is considered to a special string type, equivalent to
600  * <char *>, and is not considered to be an Easel "object".  Thus it
601  * does not have a standard object API.  Rather, the caller deals with
602  * an ESL_DSQ directly: allocate for <(L+2)*sizeof(ESL_DSQ)> to leave
603  * room for sentinels at <0> and <L+1>.
604  *
605  * Additionally, an ESL_DSQ is considered to be "trusted"
606  * data: we're 'guaranteed' that anything in an ESL_DSQ is a valid
607  * symbol, so we don't need to error-check. Anything else is a programming
608  * error.
609  */
610 
611 /* Function:  esl_abc_CreateDsq()
612  * Synopsis:  Digitizes a sequence into new space.
613  *
614  * Purpose:   Given an alphabet <a> and an ASCII sequence <seq>,
615  *            digitize the sequence into newly allocated space, and
616  *            return a pointer to that space in <ret_dsq>.
617  *
618  * Args:      a       - internal alphabet
619  *            seq     - text sequence to be digitized
620  *            ret_dsq - RETURN: the new digital sequence
621  *
622  * Returns:   <eslOK> on success, and <ret_dsq> contains the digitized
623  *            sequence; caller is responsible for free'ing this
624  *            memory. Returns <eslEINVAL> if <seq> contains
625  *            one or more characters that are not in the input map of
626  *            alphabet <a>. If this happens, <ret_dsq> is still valid upon
627  *            return: invalid characters are replaced by full ambiguities
628  *            (typically X or N).
629  *
630  * Throws:    <eslEMEM> on allocation failure.
631  *
632  * Xref:      STL11/63
633  */
634 int
esl_abc_CreateDsq(const ESL_ALPHABET * a,const char * seq,ESL_DSQ ** ret_dsq)635 esl_abc_CreateDsq(const ESL_ALPHABET *a, const char *seq, ESL_DSQ **ret_dsq)
636 {
637   ESL_DSQ *dsq = NULL;
638   int      status;
639   int64_t  L;
640 
641   L = strlen(seq);
642   ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
643   status = esl_abc_Digitize(a, seq, dsq);
644 
645   if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);
646   return status;
647 
648  ERROR:
649   if (dsq != NULL)      free(dsq);
650   if (ret_dsq != NULL) *ret_dsq = NULL;
651   return status;
652 }
653 
654 
655 /* Function: esl_abc_Digitize()
656  * Synopsis: Digitizes a sequence into existing space.
657  *
658  * Purpose:  Given an alphabet <a> and a nul-terminated ASCII sequence
659  *           <seq>, digitize the sequence and put it in <dsq>. Caller
660  *           provides space in <dsq> allocated for at least <L+2>
661  *           <ESL_DSQ> residues, where <L> is the length of <seq>.
662  *
663  * Args:     a       - internal alphabet
664  *           seq     - text sequence to be digitized (\0-terminated)
665  *           dsq     - RETURN: the new digital sequence (caller allocates,
666  *                     at least <(L+2) * sizeof(ESL_DSQ)>).
667  *
668  * Returns:  <eslOK> on success.
669  *           Returns <eslEINVAL> if <seq> contains one or more characters
670  *           that are not recognized in the alphabet <a>. (This is classed
671  *           as a normal error, because the <seq> may be untrusted user input.)
672  *           If this happens, the digital sequence <dsq> is still valid upon
673  *           return; invalid ASCII characters are replaced by ambiguities
674  *           (X or N).
675  */
676 int
esl_abc_Digitize(const ESL_ALPHABET * a,const char * seq,ESL_DSQ * dsq)677 esl_abc_Digitize(const ESL_ALPHABET *a, const char *seq, ESL_DSQ *dsq)
678 {
679   int     status;
680   int64_t i;			/* position in seq */
681   int64_t j;			/* position in dsq */
682   ESL_DSQ x;
683 
684   status = eslOK;
685   dsq[0] = eslDSQ_SENTINEL;
686   for (i = 0, j = 1; seq[i] != '\0'; i++)
687     {
688       x = a->inmap[(int) seq[i]];
689       if      (esl_abc_XIsValid(a, x)) dsq[j] = x;
690       else if (x == eslDSQ_IGNORED) continue;
691       else {
692 	status   = eslEINVAL;
693 	dsq[j] = esl_abc_XGetUnknown(a);
694       }
695       j++;
696     }
697   dsq[j] = eslDSQ_SENTINEL;
698   return status;
699 }
700 
701 /* Function:  esl_abc_Textize()
702  * Synopsis:  Convert digital sequence to text.
703  *
704  * Purpose:   Make an ASCII sequence <seq> by converting a digital
705  *            sequence <dsq> of length <L> back to text, according to
706  *            the digital alphabet <a>.
707  *
708  *            Caller provides space in <seq> allocated for at least
709  *            <L+1> bytes (<(L+1) * sizeof(char)>).
710  *
711  * Args:      a   - internal alphabet
712  *            dsq - digital sequence to be converted (1..L)
713  *            L   - length of dsq
714  *            seq - RETURN: the new text sequence (caller allocated
715  *                  space, at least <(L+1) * sizeof(char)>).
716  *
717  * Returns:   <eslOK> on success.
718  */
719 int
esl_abc_Textize(const ESL_ALPHABET * a,const ESL_DSQ * dsq,int64_t L,char * seq)720 esl_abc_Textize(const ESL_ALPHABET *a, const ESL_DSQ *dsq, int64_t L, char *seq)
721 {
722   int64_t i;
723 
724   for (i = 0; i < L; i++)
725     seq[i] = a->sym[dsq[i+1]];
726   seq[i] = '\0';
727   return eslOK;
728 }
729 
730 
731 /* Function:  esl_abc_TextizeN()
732  * Synopsis:  Convert subsequence from digital to text.
733  *
734  * Purpose:   Similar in semantics to <strncpy()>, this procedure takes
735  *            a window of <L> residues in a digitized sequence
736  *            starting at the residue pointed to by <dptr>,
737  *            converts them to ASCII text representation, and
738  *            copies them into the buffer <buf>.
739  *
740  *            <buf> must be at least <L> residues long; <L+1>, if the
741  *            caller needs to NUL-terminate it.
742  *
743  *            If a sentinel byte is encountered in the digitized
744  *            sequence before <L> residues have been copied, <buf> is
745  *            NUL-terminated there. Otherwise, like <strncpy()>, <buf>
746  *            will not be NUL-terminated.
747  *
748  *            Note that because digital sequences are indexed <1..N>,
749  *            not <0..N-1>, the caller must be careful about
750  *            off-by-one errors in <dptr>. For example, to copy from
751  *            the first residue of a digital sequence <dsq>, you must
752  *            pass <dptr=dsq+1>, not <dptr=dsq>. The text in <buf>
753  *            on the other hand is a normal C string indexed <0..L-1>.
754  *
755  * Args:      a     - reference to an internal alphabet
756  *            dptr  - ptr to starting residue in a digital sequence
757  *            L     - number of residues to convert and copy
758  *            buf   - text buffer to store the <L> converted residues in
759  *
760  * Returns:   <eslOK> on success.
761  */
762 int
esl_abc_TextizeN(const ESL_ALPHABET * a,const ESL_DSQ * dptr,int64_t L,char * buf)763 esl_abc_TextizeN(const ESL_ALPHABET *a, const ESL_DSQ *dptr, int64_t L, char *buf)
764 {
765   int64_t i;
766 
767   for (i = 0; i < L; i++)
768     {
769       if (dptr[i] == eslDSQ_SENTINEL)
770 	{
771 	  buf[i] = '\0';
772 	  return eslOK;
773 	}
774       buf[i] = a->sym[dptr[i]];
775     }
776   return eslOK;
777 }
778 
779 
780 /* Function:  esl_abc_dsqcpy()
781  *
782  * Purpose:   Given a digital sequence <dsq> of length <L>,
783  *            make a copy of it in <dcopy>. Caller provides
784  *            storage in <dcopy> for at least <L+2> <ESL_DSQ>
785  *            residues.
786  *
787  * Returns:   <eslOK> on success.
788  */
789 int
esl_abc_dsqcpy(const ESL_DSQ * dsq,int64_t L,ESL_DSQ * dcopy)790 esl_abc_dsqcpy(const ESL_DSQ *dsq, int64_t L, ESL_DSQ *dcopy)
791 {
792   memcpy(dcopy, dsq, sizeof(ESL_DSQ) * (L+2));
793   return eslOK;
794 }
795 
796 
797 /* Function:  esl_abc_dsqdup()
798  * Synopsis:  Duplicate a digital sequence.
799  *
800  * Purpose:   Like <esl_strdup()>, but for digitized sequences:
801  *            make a duplicate of <dsq> and leave it in <ret_dup>.
802  *            Caller can pass the string length <L> if it's known, saving
803  *            some overhead; else pass <-1> and the length will be
804  *            determined for you.
805  *
806  *            Tolerates <dsq> being <NULL>; in which case, returns
807  *            <eslOK> with <*ret_dup> set to <NULL>.
808  *
809  * Args:      dsq     - digital sequence to duplicate (w/ sentinels at 0,L+1)
810  *            L       - length of dsq in residues, if known; -1 if unknown
811  *            ret_dup - RETURN: allocated duplicate of <dsq>, which caller will
812  *                      free.
813  *
814  * Returns:   <eslOK> on success, and leaves a pointer in <ret_dup>.
815  *
816  * Throws:    <eslEMEM> on allocation failure.
817  *
818  * Xref:      STL11/48
819  */
820 int
esl_abc_dsqdup(const ESL_DSQ * dsq,int64_t L,ESL_DSQ ** ret_dup)821 esl_abc_dsqdup(const ESL_DSQ *dsq, int64_t L, ESL_DSQ **ret_dup)
822 {
823   int      status;
824   ESL_DSQ *new = NULL;
825 
826   if (ret_dup == NULL) return eslOK; /* no-op. */
827 
828   *ret_dup = NULL;
829   if (dsq == NULL) return eslOK;
830   if (L < 0) L = esl_abc_dsqlen(dsq);
831 
832   ESL_ALLOC(new, sizeof(ESL_DSQ) * (L+2));
833   memcpy(new, dsq, sizeof(ESL_DSQ) * (L+2));
834 
835   *ret_dup = new;
836   return eslOK;
837 
838  ERROR:
839   if (new     != NULL)  free(new);
840   if (ret_dup != NULL) *ret_dup = NULL;
841   return status;
842 }
843 
844 
845 /* Function:  esl_abc_dsqcat()
846  * Synopsis:  Concatenate and map input chars to a digital sequence.
847  *
848  * Purpose:   Append the contents of string or memory line <s> of
849  *            length <n> to a digital sequence, after digitizing
850  *            each input character in <s> according to an Easel
851  *            <inmap>. The destination sequence and its length
852  *            are passed by reference, <*dsq> and <*L>, so that
853  *            the sequence may be reallocated and the length updated
854  *            upon return.
855  *
856  *            The input map <inmap> may map characters to
857  *            <eslDSQ_IGNORED> or <eslDSQ_ILLEGAL>, but not to <eslDSQ_EOL>,
858  *            <eslDSQ_EOD>, or <eslDSQ_SENTINEL> codes. <inmap[0]> is
859  *            special, and must be set to the code for the 'unknown'
860  *            residue (such as 'X' for proteins, 'N' for DNA) that
861  *            will be used to replace any invalid <eslDSQ_ILLEGAL>
862  *            characters.
863  *
864  *            If <*dsq> is properly terminated digital sequence and
865  *            the caller doesn't know its length, <*L> may be passed
866  *            as -1. Providing the length when it's known saves an
867  *            <esl_abc_dsqlen()> call. If <*dsq> is unterminated, <*L>
868  *            is mandatory. Essentially the same goes for <*s>, which
869  *            may be a NUL-terminated string (pass <n=-1> if length unknown),
870  *            or a memory line (<n> is mandatory).
871  *
872  *            <*dsq> may also be <NULL>, in which case it is allocated
873  *            and initialized here.
874  *
875  *            Caller should provide an <s> that is expected to be
876  *            essentially all appendable to <*dsq> except for a small
877  *            number of chars that map to <eslDSQ_IGNORE>, like an
878  *            input sequence data line from a file, for example. We're
879  *            going to reallocate <*dsq> to size <*L+n>; if <n> is an
880  *            entire large buffer or file, this reallocation will be
881  *            inefficient.
882  *
883  * Args:      inmap - an Easel input map, inmap[0..127];
884  *                    inmap[0] is special, set to the 'unknown' character
885  *                    to replace invalid input chars.
886  *            dsq   - reference to the current digital seq to append to
887  *                    (with sentinel bytes at 0,L+1); may be <NULL>.
888  *                    Upon return, this will probably have
889  *                    been reallocated, and it will contain the original
890  *                    <dsq> with <s> digitized and appended.
891  *            L    -  reference to the current length of <dsq> in residues;
892  *                    may be <-1> if unknown and if <*dsq> is a properly
893  *                    terminated digital sequence. Upon return, <L> is set to
894  *                    the new length of <dsq>, after <s> is appended.
895  *            s    -  ASCII text sequence to append. May
896  *                    contain ignored text characters (flagged with
897  *                    <eslDSQ_IGNORED> in the input map of alphabet <abc>).
898  *            n    -  Length of <s> in characters, if known; or <-1> if
899  *                    unknown and if <s> is a NUL-terminated string.
900  *
901  * Returns:   <eslOK> on success; <*dsq> contains the result of digitizing
902  *            and appending <s> to the original <*dsq>; and <*L> contains
903  *            the new length of the <dsq> result in residues.
904  *
905  *            If any of the characters in <s> are illegal in the
906  *            alphabet <abc>, these characters are digitized as
907  *            unknown residues (using <inmap[0]>) and
908  *            concatenation/digitization proceeds to completion, but
909  *            the function returns <eslEINVAL>. The caller might then
910  *            want to call <esl_abc_ValidateSeq()> on <s> if it wants
911  *            to figure out where digitization goes awry and get a
912  *            more informative error report. This is a normal error,
913  *            because the string <s> might be user input.
914  *
915  * Throws:    <eslEMEM> on allocation or reallocation failure;
916  *            <eslEINCONCEIVABLE> on coding error.
917  *
918  * Xref:      SRE:STL11/48; SRE:J7/145.
919  *
920  * Note:      This closely parallels a text mode version, <esl_strmapcat()>.
921  */
922 int
esl_abc_dsqcat(const ESL_DSQ * inmap,ESL_DSQ ** dsq,int64_t * L,const char * s,esl_pos_t n)923 esl_abc_dsqcat(const ESL_DSQ *inmap, ESL_DSQ **dsq, int64_t *L, const char *s, esl_pos_t n)
924 {
925   int       status = eslOK;
926 
927   if (*L < 0) *L = ((*dsq) ? esl_abc_dsqlen(*dsq) : 0);
928   if ( n < 0)  n = (   (s) ? strlen(s)            : 0);
929 
930   if (n == 0) { goto ERROR; } 	/* that'll return eslOK, leaving *dest untouched, and *ldest its length */
931 
932   if (*dsq == NULL) {		/* an entirely new dsq is allocated *and* initialized with left sentinel. */
933     ESL_ALLOC(*dsq, sizeof(ESL_DSQ)     * (n+2));
934     (*dsq)[0] = eslDSQ_SENTINEL;
935   } else			/* else, existing dsq is just reallocated; leftmost sentinel already in place. */
936     ESL_REALLOC(*dsq, sizeof(ESL_DSQ) * (*L+n+2)); /* most we'll need */
937 
938   return esl_abc_dsqcat_noalloc(inmap, *dsq, L, s, n);
939 
940  ERROR:
941   return status;
942 }
943 
944 /* Function:  esl_abc_dsqcat_noalloc()
945  * Synopsis:  Version of esl_abc_dsqcat() that does no reallocation.
946  *
947  * Purpose:   Same as <esl_abc_dsqcat()>, but with no reallocation of
948  *            <dsq>. The pointer to the destination string <dsq> is
949  *            passed by value not by reference, because it will not
950  *            be reallocated or moved. Caller has already allocated
951  *            at least <*L + n + 2> bytes in <dsq>. <*L> and <n> are
952  *            not optional; caller must know (and provide) the lengths
953  *            of both the old string and the new source.
954  *
955  * Note:      This version was needed in selex format parsing, where
956  *            we need to prepend and append some number of gaps on
957  *            each new line of each block of input; allocating once
958  *            then adding the gaps and the sequence seemed most efficient.
959  */
960 int
esl_abc_dsqcat_noalloc(const ESL_DSQ * inmap,ESL_DSQ * dsq,int64_t * L,const char * s,esl_pos_t n)961 esl_abc_dsqcat_noalloc(const ESL_DSQ *inmap, ESL_DSQ *dsq, int64_t *L, const char *s, esl_pos_t n)
962 {
963   int64_t   xpos;
964   esl_pos_t cpos;
965   ESL_DSQ   x;
966   int       status = eslOK;
967 
968   /* Watch these coords. Start in the 0..n-1 text string at 0;
969    * start in the 1..L dsq at L+1, overwriting its terminal
970    * sentinel byte.
971    */
972   for (xpos = *L+1, cpos = 0; cpos < n; cpos++)
973     {
974       if (! isascii(s[cpos])) { dsq[xpos++] = inmap[0]; status = eslEINVAL; continue; }
975 
976       x = inmap[(int) s[cpos]];
977 
978       if       (x <= 127)      dsq[xpos++] = x;
979       else switch (x) {
980 	case eslDSQ_SENTINEL:  ESL_EXCEPTION(eslEINCONCEIVABLE, "input char mapped to eslDSQ_SENTINEL"); break;
981 	case eslDSQ_ILLEGAL:   dsq[xpos++] = inmap[0]; status = eslEINVAL;                               break;
982 	case eslDSQ_IGNORED:   break;
983 	case eslDSQ_EOL:       ESL_EXCEPTION(eslEINCONCEIVABLE, "input char mapped to eslDSQ_EOL");      break;
984 	case eslDSQ_EOD:       ESL_EXCEPTION(eslEINCONCEIVABLE, "input char mapped to eslDSQ_EOD");      break;
985 	default:               ESL_EXCEPTION(eslEINCONCEIVABLE, "bad inmap, no such ESL_DSQ code");      break;
986 	}
987     }
988   dsq[xpos] = eslDSQ_SENTINEL;
989   *L = xpos-1;
990   return status;
991 }
992 
993 /* Function:  esl_abc_dsqlen()
994  * Synopsis:  Returns the length of a digital sequence.
995  *
996  * Purpose:   Returns the length of digitized sequence <dsq> in
997  *            positions (including gaps, if any). The <dsq> must be
998  *            properly terminated by a sentinel byte
999  *            (<eslDSQ_SENTINEL>).
1000  */
1001 int64_t
esl_abc_dsqlen(const ESL_DSQ * dsq)1002 esl_abc_dsqlen(const ESL_DSQ *dsq)
1003 {
1004   int64_t n = 0;
1005   while (dsq[n+1] != eslDSQ_SENTINEL) n++;
1006   return n;
1007 }
1008 
1009 /* Function:  esl_abc_dsqrlen()
1010  * Synopsis:  Returns the number of residues in a digital seq.
1011  *
1012  * Purpose:   Returns the unaligned length of digitized sequence
1013  *            <dsq>, in residues, not counting any gaps, nonresidues,
1014  *            or missing data symbols.
1015  */
1016 int64_t
esl_abc_dsqrlen(const ESL_ALPHABET * abc,const ESL_DSQ * dsq)1017 esl_abc_dsqrlen(const ESL_ALPHABET *abc, const ESL_DSQ *dsq)
1018 {
1019   int64_t n = 0;
1020   int64_t i;
1021 
1022   for (i = 1; dsq[i] != eslDSQ_SENTINEL; i++)
1023     if (esl_abc_XIsResidue(abc, dsq[i])) n++;
1024   return n;
1025 }
1026 
1027 /* Function:  esl_abc_CDealign()
1028  * Synopsis:  Dealigns a text string, using a reference digital aseq.
1029  *
1030  * Purpose:   Dealigns <s> in place by removing characters aligned to
1031  *            gaps (or missing data symbols) in the reference digital
1032  *            aligned sequence <ref_ax>. Gaps and missing data symbols
1033  *            in <ref_ax> are defined by its digital alphabet <abc>.
1034  *
1035  *            <s> is typically going to be some kind of textual
1036  *            annotation string (secondary structure, consensus, or
1037  *            surface accessibility).
1038  *
1039  *            Be supercareful of off-by-one errors here! The <ref_ax>
1040  *            is a digital sequence that is indexed <1..L>. The
1041  *            annotation string <s> is assumed to be <0..L-1> (a
1042  *            normal C string), off by one with respect to <ref_ax>.
1043  *            In a sequence object, ss annotation is actually stored
1044  *            <1..L> -- so if you're going to <esl_abc_CDealign()> a
1045  *            <sq->ss>, pass <sq->ss+1> as the argument <s>.
1046  *
1047  * Returns:   Returns <eslOK> on success; optionally returns the number
1048  *            of characters in the dealigned <s> in <*opt_rlen>.
1049  *
1050  * Throws:    (no abnormal error conditions)
1051  */
1052 int
esl_abc_CDealign(const ESL_ALPHABET * abc,char * s,const ESL_DSQ * ref_ax,int64_t * opt_rlen)1053 esl_abc_CDealign(const ESL_ALPHABET *abc, char *s, const ESL_DSQ *ref_ax, int64_t *opt_rlen)
1054 {
1055   int64_t apos;
1056   int64_t n = 0;
1057 
1058   if (s == NULL) return eslOK;
1059 
1060   for (n=0, apos=1; ref_ax[apos] != eslDSQ_SENTINEL; apos++)
1061     if (! esl_abc_XIsGap(abc, ref_ax[apos]) && ! esl_abc_XIsMissing(abc, ref_ax[apos]) )
1062       s[n++] = s[apos-1];	/* apos-1 because we assume s was 0..alen-1, whereas ref_ax was 1..alen */
1063   s[n] = '\0';
1064 
1065   if (opt_rlen != NULL) *opt_rlen = n;
1066   return eslOK;
1067 }
1068 
1069 /* Function:  esl_abc_XDealign()
1070  * Synopsis:  Dealigns a digital string, using a reference digital aseq.
1071  *
1072  * Purpose:   Dealigns <x> in place by removing characters aligned to
1073  *            gaps (or missing data) in the reference digital aligned
1074  *            sequence <ref_ax>. Gaps and missing data symbols in
1075  *            <ref_ax> are defined by its digital alphabet <abc>.
1076  *
1077  * Returns:   Returns <eslOK> on success; optionally returns the number
1078  *            of characters in the dealigned <x> in <*opt_rlen>.
1079  *
1080  * Throws:    (no abnormal error conditions)
1081  */
1082 int
esl_abc_XDealign(const ESL_ALPHABET * abc,ESL_DSQ * x,const ESL_DSQ * ref_ax,int64_t * opt_rlen)1083 esl_abc_XDealign(const ESL_ALPHABET *abc, ESL_DSQ *x, const ESL_DSQ *ref_ax, int64_t *opt_rlen)
1084 {
1085   int64_t apos;
1086   int64_t n = 0;
1087 
1088   if (x == NULL) return eslOK;
1089 
1090   x[0] = eslDSQ_SENTINEL;
1091   for (n=1, apos=1; ref_ax[apos] != eslDSQ_SENTINEL; apos++)
1092     if (! esl_abc_XIsGap(abc, ref_ax[apos]) && ! esl_abc_XIsMissing(abc, ref_ax[apos]) )
1093       x[n++] = x[apos];
1094   x[n] = eslDSQ_SENTINEL;
1095 
1096   if (opt_rlen != NULL) *opt_rlen = n-1;
1097   return eslOK;
1098 }
1099 
1100 /* Function:  esl_abc_ConvertDegen2X()
1101  * Synopsis:  Convert all degenerate residues to X or N.
1102  *
1103  * Purpose:   Convert all the degenerate residue codes in digital
1104  *            sequence <dsq> to the code for the maximally degenerate
1105  *            "unknown residue" code, as specified in digital alphabet
1106  *            <abc>. (For example, X for protein, N for nucleic acid.)
1107  *
1108  *            This comes in handy when you're dealing with some piece
1109  *            of software that can't deal with standard residue codes,
1110  *            and you want to massage your sequences into a form that
1111  *            can be accepted. For example, WU-BLAST can't deal with O
1112  *            (pyrrolysine) residues, but UniProt has O codes.
1113  *
1114  * Returns:   <eslOK> on success.
1115  *
1116  * Throws:    (no abnormal error conditions)
1117  */
1118 int
esl_abc_ConvertDegen2X(const ESL_ALPHABET * abc,ESL_DSQ * dsq)1119 esl_abc_ConvertDegen2X(const ESL_ALPHABET *abc, ESL_DSQ *dsq)
1120 {
1121   int64_t i;
1122 
1123   for (i = 1; dsq[i] != eslDSQ_SENTINEL; i++)
1124     if (esl_abc_XIsDegenerate(abc, dsq[i]))
1125       dsq[i] = esl_abc_XGetUnknown(abc);
1126   return eslOK;
1127 }
1128 
1129 
1130 /* Function:  esl_abc_revcomp()
1131  * Synopsis:  Reverse complement a digital sequence
1132  * Incept:    SRE, Wed Feb 10 11:54:48 2016 [JB251 BOS-MCO]
1133  *
1134  * Purpose:   Reverse complement <dsq>, in place, according to
1135  *            its digital alphabet <abc>.
1136  *
1137  * Args:      abc  - digital alphabet
1138  *            dsq  - digital sequence, 1..n
1139  *            n    - length of <dsq>
1140  *
1141  * Returns:   <eslOK> on success.
1142  *
1143  * Throws:    <eslEINCOMPAT> if alphabet <abc> can't be reverse complemented
1144  */
1145 int
esl_abc_revcomp(const ESL_ALPHABET * abc,ESL_DSQ * dsq,int n)1146 esl_abc_revcomp(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int n)
1147 {
1148   ESL_DSQ x;
1149   int     pos;
1150 
1151   if (abc->complement == NULL)
1152     ESL_EXCEPTION(eslEINCOMPAT, "tried to reverse complement using an alphabet that doesn't have one");
1153 
1154   for (pos = 1; pos <= n/2; pos++)
1155     {
1156       x            = abc->complement[dsq[n-pos+1]];
1157       dsq[n-pos+1] = abc->complement[dsq[pos]];
1158       dsq[pos]     = x;
1159     }
1160   if (n%2) dsq[pos] = abc->complement[dsq[pos]];
1161   return eslOK;
1162 }
1163 
1164 
1165 
1166 /*-------------- end, digital sequences (ESL_DSQ) ---------------*/
1167 
1168 
1169 /*****************************************************************
1170  * 3. Other routines in the API.
1171  *****************************************************************/
1172 
1173 /* Function:  esl_abc_ValidateType()
1174  * Synopsis:  Check that an alphabet is known and valid
1175  * Incept:    SRE, Thu Feb 11 15:48:23 2016
1176  *
1177  * Purpose:   Returns <eslOK> if <type> is a valid and known Easel
1178  *            alphabet type code.
1179  *
1180  *            Used to validate "user" input, where we're parsing a
1181  *            file format that has stored an Easel alphabet code.
1182  *
1183  *            Returns <eslFAIL> for the special <eslUNKNOWN> "unset"
1184  *            value, even though that is a valid code, because it's
1185  *            not an alphabet, so shouldn't show up in a file.
1186  */
1187 int
esl_abc_ValidateType(int type)1188 esl_abc_ValidateType(int type)
1189 {
1190   if (type <= 0 || type > eslNONSTANDARD) return eslFAIL;
1191   else                                    return eslOK;
1192 }
1193 
1194 
1195 /* Function:  esl_abc_GuessAlphabet()
1196  * Synopsis:  Guess alphabet type from residue composition.
1197  *
1198  * Purpose:   Guess the alphabet type from a residue composition.
1199  *            The input <ct[0..25]> array contains observed counts of
1200  *            the letters A..Z, case-insensitive.
1201  *
1202  *            The composition <ct> must contain more than 10 residues.
1203  *
1204  *            If it contains $\geq$98\% ACGTN and all four of the
1205  *            residues ACGT occur, call it <eslDNA>. (Analogously for
1206  *            ACGUN, ACGU: call <eslRNA>.)
1207  *
1208  *            If it contains any amino-specific residue (EFIJLPQZ),
1209  *            call it <eslAMINO>.
1210  *
1211  *            If it consists of $\geq$98\% canonical aa residues or X,
1212  *            and at least 15 of the different 20 aa residues occur,
1213  *            and the number of residues that are canonical aa/degenerate
1214  *            nucleic (DHKMRSVWY) is greater than the number of canonicals
1215  *            for both amino and nucleic (ACG); then call it <eslAMINO>.
1216  *
1217  *            As a special case, if it consists entirely of N's, and
1218  *            we have >2000 residues, call it <eslDNA>. This is a
1219  *            special case that deals with genome sequence assemblies
1220  *            that lead with a swath of N's.
1221  *
1222  *            We aim to be very conservative, essentially never making
1223  *            a false call; we err towards calling <eslUNKNOWN> if
1224  *            unsure. Our test is to classify every individual
1225  *            sequence in NCBI NR and NT (or equiv large messy
1226  *            sequence database) with no false positives, only correct
1227  *            calls or <eslUNKNOWN>.
1228  *
1229  * Returns:   <eslOK> on success, and <*ret_type> is set to
1230  *            <eslAMINO>, <eslRNA>, or <eslDNA>.
1231  *
1232  *            Returns <eslENOALPHABET> if unable to determine the
1233  *            alphabet type; in this case, <*ret_type> is set to
1234  *            <eslUNKNOWN>.
1235  *
1236  * Notes:     As of Jan 2011:
1237  *               nr         10M seqs :  6999 unknown,  0 misclassified
1238  *               Pfam full  13M seqs :  7930 unknown,  0 misclassified
1239  *               Pfam seed  500K seqs:   366 unknown,  0 misclassified
1240  *               trembl     14M seqs :  7748 unknown,  0 misclassified
1241  *
1242  *               nt         10M seqs : 35620 unknown,  0 misclassified
1243  *               Rfam full   3M seqs :  8146 unknown,  0 misclassified
1244  *               Rfam seed  27K seqs :    49 unknown,  0 misclassified
1245  *
1246  * xref:      esl_alphabet_example3 collects per-sequence classification
1247  *            2012/0201-easel-guess-alphabet
1248  *            J1/62; 2007-0517-easel-guess-alphabet
1249  */
1250 int
esl_abc_GuessAlphabet(const int64_t * ct,int * ret_type)1251 esl_abc_GuessAlphabet(const int64_t *ct, int *ret_type)
1252 {
1253   int      type = eslUNKNOWN;
1254   char     aaonly[]    = "EFIJLOPQZ";
1255   char     allcanon[]  = "ACG";
1256   char     aacanon[]   = "DHKMRSVWY";
1257   int64_t  n1, n2, n3, nn, nt, nu, nx, n; /* n's are counts */
1258   int      x1, x2, x3, xn, xt, xu;	  /* x's are how many different residues are represented */
1259   int      i, x;
1260 
1261   x1 = x2 = x3 = xn = xt = xu = 0;
1262   n1 = n2 = n3 = n = 0;
1263   for (i = 0; i < 26;                i++) n  += ct[i];
1264   for (i = 0; aaonly[i]   != '\0'; i++) { x = ct[aaonly[i]   - 'A']; if (x > 0) { n1 += x; x1++; } }
1265   for (i = 0; allcanon[i] != '\0'; i++) { x = ct[allcanon[i] - 'A']; if (x > 0) { n2 += x; x2++; } }
1266   for (i = 0; aacanon[i]  != '\0'; i++) { x = ct[aacanon[i]  - 'A']; if (x > 0) { n3 += x; x3++; } }
1267   nt = ct['T' - 'A']; xt = (nt ? 1 : 0);
1268   nu = ct['U' - 'A']; xu = (nu ? 1 : 0);
1269   nx = ct['X' - 'A'];
1270   nn = ct['N' - 'A']; xn = (nn ? 1 : 0);
1271 
1272   if      (n  <= 10)                                                type = eslUNKNOWN;        // small sample, don't guess
1273   else if (n  > 2000 && nn == n)                                    type = eslDNA;            // special case of many N's leading a genome assembly
1274   else if (n1 > 0)                                                  type = eslAMINO;          // contains giveaway, aa-only chars
1275   else if (n-(n2+nt+nn) <= 0.02*n && x2+xt == 4)                    type = eslDNA;            // nearly all DNA canon (or N), all four seen
1276   else if (n-(n2+nu+nn) <= 0.02*n && x2+xu == 4)                    type = eslRNA;            // nearly all RNA canon (or N), all four seen
1277   else if (n-(n1+n2+n3+nn+nt+nx) <= 0.02*n && n3>n2 && x1+x2+x3+xn+xt >= 15) type = eslAMINO; // nearly all aa canon (or X); more aa canon than ambig; all 20 seen
1278 
1279   *ret_type = type;
1280   if (type == eslUNKNOWN) return eslENOALPHABET;
1281   else                    return eslOK;
1282 }
1283 
1284 
1285 
1286 /* Function:  esl_abc_Match()
1287  * Synopsis:  Returns the probability that two symbols match.
1288  *
1289  * Purpose:   Given two digital symbols <x> and <y> in alphabet
1290  *            <abc>, calculate and return the probability that
1291  *            <x> and <y> match, taking degenerate residue codes
1292  *            into account.
1293  *
1294  *            If <p> residue probability vector is NULL, the
1295  *            calculation is a simple average. For example, for DNA,
1296  *            R/A gives 0.5, C/N gives 0.25, N/R gives 0.25, R/R gives
1297  *            0.5.
1298  *
1299  *            If <p> residue probability vector is non-NULL, it gives
1300  *            a 0..K-1 array of background frequencies, and the
1301  *            returned match probability is an expectation (weighted
1302  *            average) given those residue frequencies.
1303  *
1304  *            <x> and <y> should only be residue codes. Any other
1305  *            comparison, including comparisons involving gap or
1306  *            missing data characters, or even comparisons involving
1307  *            illegal digital codes, returns 0.0.
1308  *
1309  *            Note that comparison of residues from "identical"
1310  *            sequences (even a self-comparison) will not result in an
1311  *            identity of 1.0, if the sequence(s) contain degenerate
1312  *            residue codes.
1313  *
1314  * Args:      abc   - digtal alphabet to use
1315  *            x,y   - two symbols to compare
1316  *            p     - NULL, or background probabilities of the
1317  *                    canonical residues in this alphabet [0..K-1]
1318  *
1319  * Returns:   the probability of an identity (match) between
1320  *            residues <x> and <y>.
1321  */
1322 double
esl_abc_Match(const ESL_ALPHABET * abc,ESL_DSQ x,ESL_DSQ y,double * p)1323 esl_abc_Match(const ESL_ALPHABET *abc, ESL_DSQ x, ESL_DSQ y, double *p)
1324 {
1325   int    i;
1326   double prob;
1327   double sx, sy;
1328 
1329   /* Easy cases */
1330   if (esl_abc_XIsCanonical(abc, x) && esl_abc_XIsCanonical(abc, y))
1331     {
1332       if (x==y) return 1.0; else return 0.0;
1333     }
1334   if ( ! esl_abc_XIsResidue(abc, x) || ! esl_abc_XIsResidue(abc, x))  return 0.0;
1335 
1336   /* Else, we have at least one degenerate residue, so calc an average or expectation.
1337    */
1338   if (p != NULL)
1339     {
1340       prob = sx = sy = 0.;
1341       for (i = 0; i < abc->K; i++)
1342 	{
1343 	  if (abc->degen[(int)x][i])                            sx += p[i];
1344 	  if (abc->degen[(int)y][i])                            sy += p[i];
1345 	  if (abc->degen[(int)x][i] && abc->degen[(int)y][i]) prob += p[i] * p[i];
1346 	}
1347       prob = prob / (sx*sy);
1348     }
1349   else
1350     {
1351       double uniformp = 1. / (double) abc->K;
1352       prob = sx = sy = 0.;
1353       for (i = 0; i < abc->K; i++)
1354 	{
1355 	  if (abc->degen[(int)x][i])                            sx += uniformp;
1356 	  if (abc->degen[(int)y][i])                            sy += uniformp;
1357 	  if (abc->degen[(int)x][i] && abc->degen[(int)y][i]) prob += uniformp * uniformp;
1358 	}
1359       prob = prob / (sx*sy);
1360     }
1361   return prob;
1362 }
1363 
1364 
1365 
1366 /* Function:  esl_abc_IAvgScore()
1367  * Synopsis:  Returns average score for degenerate residue.
1368  *
1369  * Purpose:  Given a residue code <x> in alphabet <a>, and an array of
1370  *           integer scores <sc> for the residues in the base
1371  *           alphabet, calculate and return the average score
1372  *           (rounded to nearest integer).
1373  *
1374  *           <x> would usually be a degeneracy code, but it
1375  *           may also be a canonical residue. It must not
1376  *           be a gap, missing data, or illegal symbol; if it
1377  *           is, these functions return a score of 0 without
1378  *           raising an error.
1379  *
1380  *           <esl_abc_FAvgScore()> and <esl_abc_DAvgScore()> do the
1381  *           same, but for float and double scores instead of integers
1382  *           (and for real-valued scores, no rounding is done).
1383  *
1384  * Args:     a   - digital alphabet to use
1385  *           x   - a symbol to score
1386  *           sc  - score vector for canonical residues [0..K-1]
1387  *
1388  * Returns:  average score for symbol <x>
1389  */
1390 int
esl_abc_IAvgScore(const ESL_ALPHABET * a,ESL_DSQ x,const int * sc)1391 esl_abc_IAvgScore(const ESL_ALPHABET *a, ESL_DSQ x, const int *sc)
1392 {
1393   float result = 0.;
1394   int i;
1395 
1396   if (! esl_abc_XIsResidue(a, x)) return 0;
1397   for (i = 0; i < a->K; i++)
1398     if (a->degen[(int) x][i]) result += (float) sc[i];
1399   result /= (float) a->ndegen[(int) x];
1400   if (result < 0) return (int) (result - 0.5);
1401   else            return (int) (result + 0.5);
1402 }
1403 float
esl_abc_FAvgScore(const ESL_ALPHABET * a,ESL_DSQ x,const float * sc)1404 esl_abc_FAvgScore(const ESL_ALPHABET *a, ESL_DSQ x, const float *sc)
1405 {
1406   float result = 0.;
1407   int   i;
1408 
1409   if (! esl_abc_XIsResidue(a, x)) return 0.;
1410   for (i = 0; i < a->K; i++)
1411     if (a->degen[(int) x][i]) result += sc[i];
1412   result /= (float) a->ndegen[(int) x];
1413   return result;
1414 }
1415 double
esl_abc_DAvgScore(const ESL_ALPHABET * a,ESL_DSQ x,const double * sc)1416 esl_abc_DAvgScore(const ESL_ALPHABET *a, ESL_DSQ x, const double *sc)
1417 {
1418   double result = 0.;
1419   int    i;
1420 
1421   if (! esl_abc_XIsResidue(a, x)) return 0.;
1422   for (i = 0; i < a->K; i++)
1423     if (a->degen[(int) x][i]) result += sc[i];
1424   result /= (double) a->ndegen[(int) x];
1425   return result;
1426 }
1427 
1428 
1429 /* Function:  esl_abc_IExpectScore()
1430  * Synopsis:  Returns expected score for degenerate residue.
1431  *
1432  * Purpose:   Given a residue code <x> in alphabet <a>, an
1433  *            array of integer scores <sc> for the residues in the base
1434  *            alphabet, and background frequencies <p> for the
1435  *            occurrence frequencies of the residues in the base
1436  *            alphabet, calculate and return the expected score
1437  *            (weighted by the occurrence frequencies <p>).
1438  *
1439  *            <x> would usually be a degeneracy code, but it
1440  *            may also be a canonical residue. It must not
1441  *            be a gap, missing data, or illegal symbol; if it
1442  *            is, these functions return a score of 0 without
1443  *            raising an error.
1444  *
1445  *            <esl_abc_FExpectScore()> and <esl_abc_DExpectScore()> do the
1446  *            same, but for float and double scores instead of integers
1447  *            (for real-valued scores, no rounding is done).
1448  *
1449  * Args:     a   - digital alphabet to use
1450  *           x   - a symbol to score
1451  *           sc  - score vector for canonical residues [0..K-1]
1452  *           p   - background prob's of canonicals     [0..K-1]
1453  *
1454  * Returns:  average score for symbol <x>
1455  */
1456 int
esl_abc_IExpectScore(const ESL_ALPHABET * a,ESL_DSQ x,const int * sc,const float * p)1457 esl_abc_IExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const int *sc, const float *p)
1458 {
1459   float  result = 0.;
1460   float  denom  = 0.;
1461   int    i;
1462 
1463   if (! esl_abc_XIsResidue(a, x)) return 0;
1464   for (i = 0; i < a->K; i++)
1465     if (a->degen[(int) x][i]) {
1466       result += (float) sc[i] * p[i];
1467       denom  += p[i];
1468     }
1469   result /= denom;
1470   if (result < 0) return (int) (result - 0.5);
1471   else            return (int) (result + 0.5);
1472 }
1473 float
esl_abc_FExpectScore(const ESL_ALPHABET * a,ESL_DSQ x,const float * sc,const float * p)1474 esl_abc_FExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const float *sc, const float *p)
1475 {
1476   float  result = 0.;
1477   float  denom  = 0.;
1478   int    i;
1479 
1480   if (! esl_abc_XIsResidue(a, x)) return 0.;
1481   for (i = 0; i < a->K; i++)
1482     if (a->degen[(int) x][i]) {
1483       result += sc[i] * p[i];
1484       denom  += p[i];
1485     }
1486   result /= denom;
1487   return result;
1488 }
1489 double
esl_abc_DExpectScore(const ESL_ALPHABET * a,ESL_DSQ x,const double * sc,const double * p)1490 esl_abc_DExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const double *sc, const double *p)
1491 {
1492   double result = 0.;
1493   double denom  = 0.;
1494   int    i;
1495 
1496   if (! esl_abc_XIsResidue(a, x)) return 0.;
1497   for (i = 0; i < a->K; i++)
1498     if (a->degen[(int) x][i]) {
1499       result += sc[i] * p[i];
1500       denom  += p[i];
1501     }
1502   result /= denom;
1503   return result;
1504 }
1505 
1506 /* Function:  esl_abc_IAvgScVec()
1507  * Synopsis:  Fill out score vector with average degenerate scores.
1508  *
1509  * Purpose:   Given an alphabet <a> and a score vector <sc> of length
1510  *            <a->Kp> that contains integer scores for the base
1511  *            alphabet (<0..a->K-1>), fill out the rest of the score
1512  *            vector, calculating average scores for
1513  *            degenerate residues using <esl_abc_IAvgScore()>.
1514  *
1515  *            The score, if any, for a gap character <K>, the
1516  *            nonresidue <Kp-2>, and the missing data character <Kp-1>
1517  *            are untouched by this function. Only the degenerate
1518  *            scores <K+1..Kp-3> are filled in.
1519  *
1520  *            <esl_abc_FAvgScVec()> and <esl_abc_DAvgScVec()> do
1521  *            the same, but for score vectors of floats or doubles,
1522  *            respectively.
1523  *
1524  * Returns:   <eslOK> on success.
1525  */
1526 int
esl_abc_IAvgScVec(const ESL_ALPHABET * a,int * sc)1527 esl_abc_IAvgScVec(const ESL_ALPHABET *a, int *sc)
1528 {
1529   ESL_DSQ x;
1530   for (x = a->K+1; x <= a->Kp-3; x++)
1531     sc[x] = esl_abc_IAvgScore(a, x, sc);
1532   return eslOK;
1533 }
1534 int
esl_abc_FAvgScVec(const ESL_ALPHABET * a,float * sc)1535 esl_abc_FAvgScVec(const ESL_ALPHABET *a, float *sc)
1536 {
1537   ESL_DSQ x;
1538   for (x = a->K+1; x <= a->Kp-3; x++)
1539     sc[x] = esl_abc_FAvgScore(a, x, sc);
1540   return eslOK;
1541 }
1542 int
esl_abc_DAvgScVec(const ESL_ALPHABET * a,double * sc)1543 esl_abc_DAvgScVec(const ESL_ALPHABET *a, double *sc)
1544 {
1545   ESL_DSQ x;
1546   for (x = a->K+1; x <= a->Kp-3; x++)
1547     sc[x] = esl_abc_DAvgScore(a, x, sc);
1548   return eslOK;
1549 }
1550 
1551 /* Function:  esl_abc_IExpectScVec()
1552  * Synopsis:  Fill out score vector with average expected scores.
1553  *
1554  * Purpose:   Given an alphabet <a>, a score vector <sc> of length
1555  *            <a->Kp> that contains integer scores for the base
1556  *            alphabet (<0..a->K-1>), and residue occurrence probabilities
1557  *            <p[0..a->K-1]>; fill in the scores for the
1558  *            degenerate residues <K+1..Kp-3> using <esl_abc_IExpectScore()>.
1559  *
1560  *            The score, if any, for a gap character <K>, the
1561  *            nonresidue <Kp-2>, and the missing data character <Kp-1>
1562  *            are untouched by this function. Only the degenerate
1563  *            scores <K+1..Kp-3> are filled in.
1564  *
1565  *            <esl_abc_FExpectScVec()> and <esl_abc_DExpectScVec()> do
1566  *            the same, but for score vectors of floats or doubles,
1567  *            respectively. The probabilities <p> are floats for the
1568  *            integer and float versions, and doubles for the double
1569  *            version.
1570  *
1571  * Returns:   <eslOK> on success.
1572  */
1573 int
esl_abc_IExpectScVec(const ESL_ALPHABET * a,int * sc,const float * p)1574 esl_abc_IExpectScVec(const ESL_ALPHABET *a, int *sc, const float *p)
1575 {
1576   ESL_DSQ x;
1577   for (x = a->K+1; x <= a->Kp-3; x++)
1578     sc[x] = esl_abc_IExpectScore(a, x, sc, p);
1579   return eslOK;
1580 }
1581 int
esl_abc_FExpectScVec(const ESL_ALPHABET * a,float * sc,const float * p)1582 esl_abc_FExpectScVec(const ESL_ALPHABET *a, float *sc, const float *p)
1583 {
1584   ESL_DSQ x;
1585   for (x = a->K+1; x <= a->Kp-3; x++)
1586     sc[x] = esl_abc_FExpectScore(a, x, sc, p);
1587   return eslOK;
1588 }
1589 int
esl_abc_DExpectScVec(const ESL_ALPHABET * a,double * sc,const double * p)1590 esl_abc_DExpectScVec(const ESL_ALPHABET *a, double *sc, const double *p)
1591 {
1592   ESL_DSQ x;
1593   for (x = a->K+1; x <= a->Kp-3; x++)
1594     sc[x] = esl_abc_DExpectScore(a, x, sc, p);
1595   return eslOK;
1596 }
1597 
1598 
1599 /* Function:  esl_abc_FCount()
1600  * Synopsis:  Count a degenerate symbol into a count vector.
1601  *
1602  * Purpose:   Count a possibly degenerate digital symbol <x> (0..Kp-1)
1603  *            into a counts array <ct> for base symbols (0..K-1).
1604  *            Assign the symbol a weight of <wt> (often just 1.0).
1605  *            The count weight of a degenerate symbol is divided equally
1606  *            across the possible base symbols.
1607  *
1608  *            <x> can be a gap; if so, <ct> must be allocated 0..K,
1609  *            not 0..K-1. If <x> is a missing data symbol, or a nonresidue
1610  *            data symbol, nothing is done.
1611  *
1612  *            <esl_abc_DCount()> does the same, but for double-precision
1613  *            count vectors and weights.
1614  *
1615  * Returns:   <eslOK> on success.
1616  */
1617 int
esl_abc_FCount(const ESL_ALPHABET * abc,float * ct,ESL_DSQ x,float wt)1618 esl_abc_FCount(const ESL_ALPHABET *abc, float *ct, ESL_DSQ x, float wt)
1619 {
1620   ESL_DSQ y;
1621 
1622   if (esl_abc_XIsCanonical(abc, x) || esl_abc_XIsGap(abc, x))
1623     ct[x] += wt;
1624   else if (esl_abc_XIsMissing(abc, x) || esl_abc_XIsNonresidue(abc, x))
1625     return eslOK;
1626   else
1627     for (y = 0; y < abc->K; y++) {
1628       if (abc->degen[x][y])
1629 	ct[y] += wt / (float) abc->ndegen[x];
1630     }
1631   return eslOK;
1632 }
1633 int
esl_abc_DCount(const ESL_ALPHABET * abc,double * ct,ESL_DSQ x,double wt)1634 esl_abc_DCount(const ESL_ALPHABET *abc, double *ct, ESL_DSQ x, double wt)
1635 {
1636   ESL_DSQ y;
1637 
1638   if (esl_abc_XIsCanonical(abc, x) || esl_abc_XIsGap(abc, x))
1639     ct[x] += wt;
1640   else if (esl_abc_XIsMissing(abc, x) || esl_abc_XIsNonresidue(abc, x))
1641     return eslOK;
1642   else
1643     for (y = 0; y < abc->K; y++) {
1644       if (abc->degen[x][y])
1645 	ct[y] += wt / (double) abc->ndegen[x];
1646     }
1647   return eslOK;
1648 }
1649 
1650 /* Function:  esl_abc_EncodeType()
1651  * Synopsis:  Convert descriptive string to alphabet type code.
1652  *
1653  * Purpose:   Convert a string like "amino" or "DNA" to the
1654  *            corresponding Easel internal alphabet type code
1655  *            such as <eslAMINO> or <eslDNA>; return the code.
1656  *
1657  * Returns:   the code, such as <eslAMINO>; if <type> isn't
1658  *            recognized, returns <eslUNKNOWN>.
1659  */
1660 int
esl_abc_EncodeType(char * type)1661 esl_abc_EncodeType(char *type)
1662 {
1663   if      (strcasecmp(type, "amino") == 0) return eslAMINO;
1664   else if (strcasecmp(type, "rna")   == 0) return eslRNA;
1665   else if (strcasecmp(type, "dna")   == 0) return eslDNA;
1666   else if (strcasecmp(type, "coins") == 0) return eslCOINS;
1667   else if (strcasecmp(type, "dice")  == 0) return eslDICE;
1668   else if (strcasecmp(type, "custom")== 0) return eslNONSTANDARD;
1669   else                                     return eslUNKNOWN;
1670 }
1671 
1672 /* Function:  esl_abc_EncodeTypeMem()
1673  * Synopsis:  Convert memory chunk to alphabet type code
1674  * Incept:    SRE, Thu 02 Aug 2018
1675  *
1676  * Purpose:   Same as <esl_abc_EncodeType()>, but for a
1677  *            non-NUL terminated memory chunk <type> of
1678  *            length <n>.
1679  */
1680 int
esl_abc_EncodeTypeMem(char * type,int n)1681 esl_abc_EncodeTypeMem(char *type, int n)
1682 {
1683   if      (esl_memstrcmp_case(type, n, "amino"))  return eslAMINO;
1684   else if (esl_memstrcmp_case(type, n, "rna"))    return eslRNA;
1685   else if (esl_memstrcmp_case(type, n, "dna"))    return eslDNA;
1686   else if (esl_memstrcmp_case(type, n, "coins"))  return eslCOINS;
1687   else if (esl_memstrcmp_case(type, n, "dice"))   return eslDICE;
1688   else if (esl_memstrcmp_case(type, n, "custom")) return eslNONSTANDARD;
1689   else                                            return eslUNKNOWN;
1690 }
1691 
1692 
1693 /* Function:  esl_abc_DecodeType()
1694  * Synopsis:  Returns descriptive string for alphabet type code.
1695  *
1696  * Purpose:   For diagnostics and other output: given an internal
1697  *            alphabet code <type> (<eslRNA>, for example), return
1698  *            pointer to an internal string ("RNA", for example).
1699  */
1700 char *
esl_abc_DecodeType(int type)1701 esl_abc_DecodeType(int type)
1702 {
1703   switch (type) {
1704   case eslUNKNOWN:     return "unknown";
1705   case eslRNA:         return "RNA";
1706   case eslDNA:         return "DNA";
1707   case eslAMINO:       return "amino";
1708   case eslCOINS:       return "coins";
1709   case eslDICE:        return "dice";
1710   case eslNONSTANDARD: return "custom";
1711   default:             break;
1712   }
1713   esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "no such alphabet type code %d\n", type);
1714   return NULL;
1715 }
1716 
1717 
1718 /* Function:  esl_abc_ValidateSeq()
1719  * Synopsis:  Assure that a text sequence can be digitized.
1720  *
1721  * Purpose:   Check that sequence <seq> of length <L> can be digitized
1722  *            without error; all its symbols are valid in alphabet
1723  *            <a>. If so, return <eslOK>. If not, return <eslEINVAL>.
1724  *
1725  *            If <a> is <NULL>, we still validate that at least the
1726  *            <seq> consists only of ASCII characters.
1727  *
1728  *            <errbuf> is either passed as <NULL>, or a pointer to an
1729  *            error string buffer allocated by the caller for
1730  *            <eslERRBUFSIZE> characters. If <errbuf> is non-NULL, and
1731  *            the sequence is invalid, an error message is placed in
1732  *            <errbuf>.
1733  *
1734  * Args:      a      - digital alphabet (or NULL, if unavailable)
1735  *            seq    - sequence to validate [0..L-1]; NUL-termination unnecessary
1736  *            L      - length of <seq>
1737  *            errbuf - NULL, or ptr to <eslERRBUFSIZE> chars of allocated space
1738  *                     for an error message.
1739  *
1740  * Returns:   <eslOK> if <seq> is valid; <eslEINVAL> if not.
1741  *
1742  * Throws:    (no abnormal error conditions).
1743  */
1744 int
esl_abc_ValidateSeq(const ESL_ALPHABET * a,const char * seq,int64_t L,char * errbuf)1745 esl_abc_ValidateSeq(const ESL_ALPHABET *a, const char *seq, int64_t L, char *errbuf)
1746 {
1747   int     status;
1748   int64_t i;
1749   int64_t firstpos = -1;
1750   int64_t nbad     = 0;
1751 
1752   if (errbuf) *errbuf = 0;
1753 
1754 
1755   if (a)  // If we have digital alphabet <a>, it has an <inmap> we can check against
1756     {
1757       for (i = 0; i < L; i++) {
1758 	if (! esl_abc_CIsValid(a, seq[i])) {
1759 	  if (firstpos == -1) firstpos = i;
1760 	  nbad++;
1761 	}
1762       }
1763     }
1764   else  // Else, at least validate that the text string is an ASCII text string
1765     {
1766       for (i = 0; i < L; i++) {
1767 	if (! isascii(seq[i])) {
1768 	  if (firstpos == -1) firstpos = i;
1769 	  nbad++;
1770 	}
1771       }
1772     }
1773 
1774   if      (nbad == 1) ESL_XFAIL(eslEINVAL, errbuf, "invalid char %c at pos %" PRId64,                                   seq[firstpos], firstpos+1);
1775   else if (nbad >  1) ESL_XFAIL(eslEINVAL, errbuf, "%" PRId64 " invalid chars (including %c at pos %" PRId64 ")", nbad, seq[firstpos], firstpos+1);
1776   return eslOK;
1777 
1778  ERROR:
1779   return status;
1780 }
1781 /*---------------- end, other API functions ---------------------*/
1782 
1783 
1784 
1785 /*****************************************************************
1786  * 4. Unit tests.
1787  *****************************************************************/
1788 #ifdef eslALPHABET_TESTDRIVE
1789 #include "esl_vectorops.h"
1790 
1791 static int
utest_Create(void)1792 utest_Create(void)
1793 {
1794   char msg[]  = "esl_alphabet_Create() unit test failed";
1795   int  types[] = { eslDNA, eslRNA, eslAMINO, eslCOINS, eslDICE };
1796   int  Karr[]  = {      4,      4,       20,        2,       6 };
1797   int  Kparr[] = {     18,     18,       29,        6,      10 };
1798   int  i;
1799   ESL_ALPHABET *a;
1800   ESL_DSQ       x;
1801 
1802   for (i = 0; i < 3; i++)
1803     {
1804       if ((a = esl_alphabet_Create(types[i])) == NULL) esl_fatal(msg);
1805       if (a->type != types[i])       esl_fatal(msg);
1806       if (a->K    != Karr[i])        esl_fatal(msg);
1807       if (a->Kp   != Kparr[i])       esl_fatal(msg);
1808       if (strlen(a->sym) != a->Kp)   esl_fatal(msg);
1809 
1810       x = esl_abc_XGetGap(a);
1811       if (x            != a->K)    esl_fatal(msg);
1812       if (a->ndegen[x] != 0)       esl_fatal(msg);
1813 
1814       x = esl_abc_XGetUnknown(a);
1815       if (x            != a->Kp-3) esl_fatal(msg);
1816       if (a->ndegen[x] != a->K)    esl_fatal(msg);
1817 
1818       x = esl_abc_XGetNonresidue(a);
1819       if (x            != a->Kp-2) esl_fatal(msg);
1820       if (a->ndegen[x] != 0)       esl_fatal(msg);
1821 
1822       x = esl_abc_XGetMissing(a);
1823       if (x            != a->Kp-1) esl_fatal(msg);
1824       if (a->ndegen[x] != 0)       esl_fatal(msg);
1825 
1826       esl_alphabet_Destroy(a);
1827     }
1828 
1829   /* Thrown errors
1830    */
1831 #ifdef eslTEST_THROWING
1832   if (esl_alphabet_Create(-1)             != NULL) esl_fatal(msg);
1833   if (esl_alphabet_Create(eslUNKNOWN)     != NULL) esl_fatal(msg);
1834   if (esl_alphabet_Create(eslNONSTANDARD) != NULL) esl_fatal(msg);
1835 #endif
1836 
1837   return eslOK;
1838 }
1839 
1840 static int
utest_CreateCustom(void)1841 utest_CreateCustom(void)
1842 {
1843   char msg[]  = "esl_alphabet_CreateCustom() unit test failed";
1844   ESL_ALPHABET *a;
1845   char         *testseq = "AaU-~Z";
1846   ESL_DSQ      expect[] = { eslDSQ_SENTINEL, 0, 0, 15, 20, 26, 23, eslDSQ_SENTINEL };
1847   ESL_DSQ      *dsq;
1848 
1849   if ((a = esl_alphabet_CreateCustom("ACDEFGHIKLMNPQRSTVWY-BJZX*~", 20, 27)) == NULL) esl_fatal(msg);
1850   if (esl_alphabet_SetEquiv(a, 'O', 'K')       != eslOK) esl_fatal(msg);  /* read pyrrolysine O as lysine K */
1851   if (esl_alphabet_SetEquiv(a, 'U', 'S')       != eslOK) esl_fatal(msg);  /* read selenocys U as serine S */
1852   if (esl_alphabet_SetCaseInsensitive(a)       != eslOK) esl_fatal(msg);  /* allow lower case input */
1853   if (esl_alphabet_SetDegeneracy(a, 'Z', "QE") != eslOK) esl_fatal(msg);
1854 
1855   if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK)   esl_fatal(msg);
1856   if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
1857 
1858   free(dsq);
1859   esl_alphabet_Destroy(a);
1860 
1861 #ifdef eslTEST_THROWING
1862   if (esl_alphabet_CreateCustom("ACGT-RYMKSWHBVDN*~", 4, 21) != NULL) esl_fatal(msg); /* Kp mismatches string length */
1863 #endif
1864 
1865   return eslOK;
1866 }
1867 
1868 static int
utest_SetEquiv(void)1869 utest_SetEquiv(void)
1870 {
1871   char msg[]  = "esl_alphabet_SetEquiv() unit test failed";
1872   ESL_ALPHABET *a;
1873   char         *testseq = "a1&";
1874   ESL_DSQ       expect[] = { eslDSQ_SENTINEL, 0, 4, 7, eslDSQ_SENTINEL };
1875   ESL_DSQ      *dsq;
1876 
1877   if ((a = esl_alphabet_CreateCustom("ACGT-N*~", 4, 8)) == NULL) esl_fatal(msg);
1878   if (esl_alphabet_SetEquiv(a, 'a', 'A') != eslOK)               esl_fatal(msg);
1879   if (esl_alphabet_SetEquiv(a, '1', '-') != eslOK)               esl_fatal(msg);
1880   if (esl_alphabet_SetEquiv(a, '&', '~') != eslOK)               esl_fatal(msg);
1881 
1882 #ifdef eslTEST_THROWING
1883   if (esl_alphabet_SetEquiv(a, 'G', 'C') != eslEINVAL)           esl_fatal(msg); /* sym is already in internal alphabet */
1884   if (esl_alphabet_SetEquiv(a, '2', 'V') != eslEINVAL)           esl_fatal(msg); /* c is not in internal alphabet */
1885 #endif
1886 
1887   if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK)                    esl_fatal(msg);
1888   if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
1889   free(dsq);
1890   esl_alphabet_Destroy(a);
1891   return eslOK;
1892 }
1893 
1894 static int
utest_SetCaseInsensitive(void)1895 utest_SetCaseInsensitive(void)
1896 {
1897   char msg[]  = "esl_alphabet_SetCaseInsensitive() unit test failed";
1898   ESL_ALPHABET *a;
1899   char         *testseq = "ACGT";
1900   ESL_DSQ       expect[] = { eslDSQ_SENTINEL, 0, 1, 2, 3, eslDSQ_SENTINEL };
1901   ESL_DSQ      *dsq;
1902 
1903   if ((a = esl_alphabet_CreateCustom("acgt-n*~", 4, 8)) == NULL)       esl_fatal(msg);
1904   if (esl_alphabet_SetCaseInsensitive(a)  != eslOK)                    esl_fatal(msg);
1905   if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK)                    esl_fatal(msg);
1906   if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
1907   free(dsq);
1908   esl_alphabet_Destroy(a);
1909 
1910 #ifdef TEST_THROWING
1911   if ((a = esl_alphabet_CreateCustom("acgt-n*~", 4, 8)) == NULL)       esl_fatal(msg);
1912   if (esl_alphabet_SetEquiv(a, 'A', 'c')               != eslOK)       esl_fatal(msg); /* now input A maps to internal c */
1913   if (esl_alphabet_SetCaseInsensitive(a)               != eslECORRUPT) esl_fatal(msg); /* and this fails, can't remap A  */
1914   esl_alphabet_Destroy(a);
1915 #endif
1916 
1917   return eslOK;
1918 }
1919 
1920 static int
utest_SetDegeneracy(void)1921 utest_SetDegeneracy(void)
1922 {
1923   char msg[]  = "esl_alphabet_SetDegeneracy() unit test failed";
1924   ESL_ALPHABET *a;
1925   char         *testseq = "yrn";
1926   ESL_DSQ       expect[] = { eslDSQ_SENTINEL, 6, 5, 7, eslDSQ_SENTINEL };
1927   ESL_DSQ      *dsq;
1928   ESL_DSQ       x;
1929 
1930   if ((a = esl_alphabet_CreateCustom("ACGT-RYN*~", 4, 10)) == NULL) esl_fatal(msg);
1931   if (esl_alphabet_SetDegeneracy(a, 'R', "AG") != eslOK)            esl_fatal(msg);
1932   if (esl_alphabet_SetDegeneracy(a, 'Y', "CT") != eslOK)            esl_fatal(msg);
1933   if (esl_alphabet_SetCaseInsensitive(a)       != eslOK)            esl_fatal(msg);
1934 
1935   if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK)                    esl_fatal(msg);
1936   if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
1937 
1938   x = esl_abc_DigitizeSymbol(a, 'a');  if (a->ndegen[x] != 1) esl_fatal(msg);
1939   x = esl_abc_DigitizeSymbol(a, 'r');  if (a->ndegen[x] != 2) esl_fatal(msg);
1940   x = esl_abc_DigitizeSymbol(a, 'y');  if (a->ndegen[x] != 2) esl_fatal(msg);
1941   x = esl_abc_DigitizeSymbol(a, 'n');  if (a->ndegen[x] != 4) esl_fatal(msg);
1942 
1943   free(dsq);
1944   esl_alphabet_Destroy(a);
1945 
1946 #ifdef TEST_THROWING
1947   if ((a = esl_alphabet_CreateCustom("ACGT-RYN*~", 4, 10)) == NULL) esl_fatal(msg);
1948   if (esl_abc_SetDegeneracy(a, 'z', "AC")    != eslEINVAL)          esl_fatal(msg); /* can't map char not in alphabet */
1949   if (esl_abc_SetDegeneracy(a, 'N', "ACGT")  != eslEINVAL)          esl_fatal(msg); /* can't remap N */
1950   if (esl_abc_SetDegeneracy(a, 'A', "GT")    != eslEINVAL)          esl_fatal(msg); /* can't map a nondegen character */
1951   if (esl_abc_SetDegeneracy(a, '-', "GT")    != eslEINVAL)          esl_fatal(msg); /*   ... or a gap... */
1952   if (esl_abc_SetDegeneracy(a, '*', "GT")    != eslEINVAL)          esl_fatal(msg); /*   ... or nonresidues... */
1953   if (esl_abc_SetDegeneracy(a, '~', "GT")    != eslEINVAL)          esl_fatal(msg); /*   ... or missing data. */
1954   if (esl_abc_SetDegeneracy(a, 'R', "XY")    != eslEINVAL)          esl_fatal(msg); /* can't map to unknown chars... */
1955   if (esl_abc_SetDegeneracy(a, 'R', "YN")    != eslEINVAL)          esl_fatal(msg); /*   ... nor to noncanonical chars... */
1956   esl_alphabet_Destroy(a);
1957 #endif
1958   return eslOK;
1959 }
1960 
1961 static int
utest_SetIgnored(void)1962 utest_SetIgnored(void)
1963 {
1964   char msg[]  = "esl_alphabet_SetIgnored() unit test failed";
1965   ESL_ALPHABET *a;
1966   char         *testseq = "y \trn";
1967   ESL_DSQ       expect[] = { eslDSQ_SENTINEL, 6, 5, 15, eslDSQ_SENTINEL };
1968   int           L = 5;
1969   ESL_DSQ      *dsq;
1970 
1971   if ((a = esl_alphabet_Create(eslRNA)) == NULL)  esl_fatal(msg);
1972   if (esl_alphabet_SetIgnored(a, " \t") != eslOK) esl_fatal(msg);
1973 
1974   if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK)  esl_fatal(msg);
1975   if (memcmp(dsq, expect, sizeof(ESL_DSQ) * L) != 0) esl_fatal(msg);
1976   free(dsq);
1977   esl_alphabet_Destroy(a);
1978   return eslOK;
1979 }
1980 
1981 
1982 static int
utest_Destroy(void)1983 utest_Destroy(void)
1984 {
1985   char msg[]  = "esl_alphabet_Destroy() unit test failed";
1986   ESL_ALPHABET *a;
1987 
1988   if ((a = esl_alphabet_CreateCustom("ACGT-RYN*~", 4, 10)) == NULL) esl_fatal(msg);
1989   esl_alphabet_Destroy(a);
1990   esl_alphabet_Destroy(NULL);	/* should be robust against NULL pointers */
1991   return eslOK;
1992 }
1993 
1994 static int
utest_CreateDsq(void)1995 utest_CreateDsq(void)
1996 {
1997   char msg[]  = "esl_abc_CreateDsq() unit test failed";
1998   ESL_ALPHABET *a;
1999   char          goodseq[] = "ACDEF";
2000   char          badseq[]  = "1@%34";
2001   ESL_DSQ      *dsq;
2002   ESL_DSQ       x;
2003 
2004   if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
2005 
2006   if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
2007   if (dsq[1] != 0 || dsq[2] != 1) esl_fatal(msg); /* spot check */
2008   free(dsq);
2009 
2010   if (esl_abc_CreateDsq(a, badseq, &dsq) != eslEINVAL) esl_fatal(msg);
2011   x = esl_abc_XGetUnknown(a);
2012   if (dsq[1] != x || dsq[2] != x) esl_fatal(msg); /* bad chars all X's now, upon failure */
2013   free(dsq);
2014 
2015   if (esl_abc_CreateDsq(a, goodseq, NULL) != eslOK) esl_fatal(msg);
2016 
2017   esl_alphabet_Destroy(a);
2018   return eslOK;
2019 }
2020 
2021 static int
utest_Digitize(void)2022 utest_Digitize(void)
2023 {
2024   char msg[]  = "esl_abc_Digitize() unit test failed";
2025   ESL_ALPHABET *a;
2026   char          goodseq[] = "ACDEF";
2027   char          badseq[]  = "1@%34";
2028   ESL_DSQ      *dsq;
2029   ESL_DSQ       x;
2030   int           status;
2031 
2032   ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (strlen(goodseq)+2));
2033 
2034   if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
2035   esl_abc_Digitize(a, goodseq, dsq);
2036   if (dsq[1] != 0 || dsq[2] != 1) esl_fatal(msg); /* spot check */
2037 
2038   esl_abc_Digitize(a, badseq, dsq);
2039   x = esl_abc_XGetUnknown(a);
2040   if (dsq[1] != x || dsq[2] != x) esl_fatal(msg); /* bad chars all X's now, upon failure */
2041 
2042   free(dsq);
2043   esl_alphabet_Destroy(a);
2044   return eslOK;
2045 
2046  ERROR:
2047   esl_fatal(msg);
2048   return status;
2049 }
2050 
2051 static int
utest_Textize(void)2052 utest_Textize(void)
2053 {
2054   char msg[]  = "esl_abc_Textize() unit test failed";
2055   ESL_ALPHABET *a;
2056   char          goodseq[] = "acdef";
2057   char         *newseq;
2058   ESL_DSQ      *dsq;
2059   int           L;
2060   int           status;
2061 
2062   L = strlen(goodseq);
2063   ESL_ALLOC(newseq, sizeof(char) * (L+1));
2064   if ((a = esl_alphabet_Create(eslAMINO))    == NULL) esl_fatal(msg);
2065   if (esl_abc_CreateDsq(a, goodseq, &dsq)   != eslOK) esl_fatal(msg);
2066   if (esl_abc_Textize(a, dsq, L, newseq )   != eslOK) esl_fatal(msg);
2067   if (strcmp(newseq, "ACDEF")               != 0)     esl_fatal(msg);
2068   free(dsq);
2069   free(newseq);
2070   esl_alphabet_Destroy(a);
2071   return eslOK;
2072 
2073  ERROR:
2074   esl_fatal(msg);
2075   return status;
2076 }
2077 
2078 static int
utest_TextizeN(void)2079 utest_TextizeN(void)
2080 {
2081   char msg[]  = "esl_abc_TextizeN() unit test failed";
2082   ESL_ALPHABET *a;
2083   char          goodseq[] = "acdefrynacdef";
2084   ESL_DSQ      *dsq;
2085   ESL_DSQ      *dptr;
2086   int           W;
2087 
2088   if ((a = esl_alphabet_Create(eslAMINO)) == NULL)  esl_fatal(msg);
2089   if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
2090 
2091   dptr = dsq+6; 		/* points to "r", residue 6         */
2092   W    = 5;			/* copy/convert 5 residues "rynac"  */
2093   if (esl_abc_TextizeN(a, dptr, W, goodseq)  != eslOK) esl_fatal(msg);
2094   if (strcmp(goodseq, "RYNACrynacdef")       != 0)     esl_fatal(msg);
2095 
2096   /* test a case where we hit eslDSQ_SENTINEL, and nul-terminate */
2097   dptr = dsq+10; 		/* points to "c", residue 10        */
2098   W    = 20;			/* copy/convert remaining residues "cdef"  */
2099   if (esl_abc_TextizeN(a, dptr, W, goodseq)  != eslOK) esl_fatal(msg);
2100   if (strcmp(goodseq, "CDEF")                != 0)     esl_fatal(msg);
2101 
2102   free(dsq);
2103   esl_alphabet_Destroy(a);
2104   return eslOK;
2105 }
2106 
2107 static int
utest_dsqdup(void)2108 utest_dsqdup(void)
2109 {
2110   char msg[]  = "esl_abc_dsqdup() unit test failed";
2111   ESL_ALPHABET *a;
2112   char          goodseq[] = "ACGt";
2113   ESL_DSQ       expect[] = { eslDSQ_SENTINEL, 0, 1, 2, 3, eslDSQ_SENTINEL };
2114   ESL_DSQ      *d1, *d2;
2115   int           L;
2116 
2117   L = strlen(goodseq);
2118   if ((a = esl_alphabet_Create(eslRNA))           == NULL)  esl_fatal(msg);
2119   if (esl_abc_CreateDsq(a, goodseq, &d1)          != eslOK) esl_fatal(msg);
2120 
2121   if (esl_abc_dsqdup(d1, -1, &d2)                 != eslOK) esl_fatal(msg);
2122   if (memcmp(d2, expect, sizeof(ESL_DSQ) * (L+2)) != 0)     esl_fatal(msg);
2123   free(d2);
2124 
2125   if (esl_abc_dsqdup(d1, L, &d2)                  != eslOK) esl_fatal(msg);
2126   if (memcmp(d2, expect, sizeof(ESL_DSQ) * (L+2)) != 0)     esl_fatal(msg);
2127   free(d2);
2128 
2129   free(d1);
2130   esl_alphabet_Destroy(a);
2131   return eslOK;
2132 }
2133 
2134 static int
utest_dsqcat(void)2135 utest_dsqcat(void)
2136 {
2137   char msg[]  = "esl_abc_dsqcat() unit test failed";
2138   ESL_ALPHABET *a;
2139   char          goodseq[] = "ACGt";
2140   char          addseq[]  = "RYM KN";
2141   char          badseq[]  = "RYM K&";
2142   ESL_DSQ       expect[] = { eslDSQ_SENTINEL, 0, 1, 2, 3, 5, 6, 7, 8, 15, eslDSQ_SENTINEL };
2143   ESL_DSQ      *dsq;
2144   int64_t       L1;
2145   esl_pos_t     L2;
2146 
2147 
2148   if ((a = esl_alphabet_Create(eslRNA))             == NULL)  esl_fatal(msg);
2149   a->inmap[0]   = esl_abc_XGetUnknown(a);
2150   a->inmap[' '] = eslDSQ_IGNORED;
2151 
2152   L1 = strlen(goodseq);
2153   L2 = strlen(addseq);
2154   if (esl_abc_CreateDsq(a, goodseq, &dsq)              != eslOK) esl_fatal(msg);
2155   if (esl_abc_dsqcat(a->inmap, &dsq, &L1, addseq, L2)  != eslOK) esl_fatal(msg);
2156   if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2))    != 0)     esl_fatal(msg);
2157   free(dsq);
2158 
2159   L1 = -1;
2160   L2 = -1;
2161   if (esl_abc_CreateDsq(a, goodseq, &dsq)              != eslOK) esl_fatal(msg);
2162   if (esl_abc_dsqcat(a->inmap, &dsq, &L1, addseq, L2)  != eslOK) esl_fatal(msg);
2163   if (L1 != esl_abc_dsqlen(dsq))                                 esl_fatal(msg);
2164   if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2))    != 0)     esl_fatal(msg);
2165   free(dsq);
2166 
2167   L1  = 0;
2168   dsq = NULL;
2169   if (esl_abc_dsqcat(a->inmap, &dsq, &L1, goodseq, -1)    != eslOK) esl_fatal(msg);
2170   if (esl_abc_dsqcat(a->inmap, &dsq, &L1, addseq,  -1)    != eslOK) esl_fatal(msg);
2171   if (L1 != esl_abc_dsqlen(dsq))                                    esl_fatal(msg);
2172   if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2))       != 0)    esl_fatal(msg);
2173   free(dsq);
2174 
2175   L1 = -1;
2176   L2 = strlen(badseq);
2177   if (esl_abc_CreateDsq(a, goodseq, &dsq)                != eslOK)     esl_fatal(msg);
2178   if (esl_abc_dsqcat(a->inmap, &dsq, &L1, badseq,  L2)   != eslEINVAL) esl_fatal(msg);
2179   if (L1 != esl_abc_dsqlen(dsq))                                       esl_fatal(msg);
2180   if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2))      != 0)         esl_fatal(msg);
2181   free(dsq);
2182 
2183   esl_alphabet_Destroy(a);
2184   return eslOK;
2185 }
2186 
2187 /* dsqlen    unit test goes here */
2188 /* dsqrlen   unit test goes here */
2189 /* utest_Match goes here */
2190 
2191 /* This serves to unit test multiple functions:
2192  *    esl_abc_IAvgScore()
2193  *    esl_abc_IExpectScore()
2194  */
2195 static int
degeneracy_integer_scores(void)2196 degeneracy_integer_scores(void)
2197 {
2198   char *msg = "degeneracy_integer_scores unit test failed";
2199   ESL_ALPHABET *a;
2200   ESL_DSQ       x;
2201   float         p[]  = {0.4, 0.1, 0.1, 0.4}; /* A/T biased background */
2202   int           sc[] = { -1,  -6,   6,   1};
2203   int           val;
2204 
2205   a     = esl_alphabet_Create(eslDNA);
2206 
2207   x     = esl_abc_DigitizeSymbol(a, 'N'); /* any: A/C/G/T */
2208   val   = esl_abc_IAvgScore(a, x, sc);
2209   /* average of -1,-6,6,1 = 0 */
2210   if (val != 0) esl_fatal(msg);
2211 
2212   x     = esl_abc_DigitizeSymbol(a, 'M');     /* M = A/C */
2213   val   = esl_abc_IExpectScore(a, x, sc, p);
2214   /* expectation of -1,-6 given p = 0.4,0.1 = -2 */
2215   if (val != -2) esl_fatal(msg);
2216 
2217   esl_alphabet_Destroy(a);
2218   return eslOK;
2219 }
2220 
2221 /* This serves to unit test multiple functions:
2222  *    esl_abc_FAvgScore()
2223  *    esl_abc_FExpectScore()
2224  */
2225 static int
degeneracy_float_scores(void)2226 degeneracy_float_scores(void)
2227 {
2228   char *msg = "degeneracy_float_scores unit test failed";
2229   ESL_ALPHABET *a;
2230   ESL_DSQ       x;
2231   float         p[]  = {0.4, 0.1, 0.1, 0.4}; /* A/T biased background */
2232   float         sc[] = { -1., -6.,  6., 1.};
2233   float         val;
2234 
2235   a     = esl_alphabet_Create(eslRNA);
2236 
2237   x     = esl_abc_DigitizeSymbol(a, 'N'); /* any: A/C/G/T */
2238   val   = esl_abc_FAvgScore(a, x, sc);
2239   /* average of -1,-6,6,1 = 0 */
2240   if (fabs(val - 0.) > 0.0001) esl_fatal(msg);
2241 
2242   x     = esl_abc_DigitizeSymbol(a, 'M');     /* M = A/C */
2243   val   = esl_abc_FExpectScore(a, x, sc, p);
2244   /* expectation of -1,-6 given p = 0.4,0.1 = -2 */
2245   if (fabs(val + 2.) > 0.0001) esl_fatal(msg);
2246 
2247   esl_alphabet_Destroy(a);
2248   return eslOK;
2249 }
2250 
2251 /* This serves to unit test multiple functions:
2252  *    esl_abc_DAvgScore()
2253  *    esl_abc_DExpectScore()
2254  */
2255 
2256 static int
degeneracy_double_scores(void)2257 degeneracy_double_scores(void)
2258 {
2259   char *msg = "degeneracy_double_scores unit test failed";
2260   ESL_ALPHABET *a;
2261   ESL_DSQ       x;
2262   double        p[]  = {0.4, 0.1, 0.1, 0.4}; /* A/T biased background */
2263   double        sc[] = { -1., -6.,  6., 1.};
2264   double        val;
2265 
2266   a     = esl_alphabet_Create(eslRNA);
2267 
2268   x     = esl_abc_DigitizeSymbol(a, 'N'); /* any: A/C/G/T */
2269   val   = esl_abc_DAvgScore(a, x, sc);
2270   /* average of -1,-6,6,1 = 0 */
2271   if (fabs(val - 0.) > 0.0001) esl_fatal(msg);
2272 
2273   x     = esl_abc_DigitizeSymbol(a, 'M');     /* M = A/C */
2274   val   = esl_abc_DExpectScore(a, x, sc, p);
2275   /* expectation of -1,-6 given p = 0.4,0.1 = -2 */
2276   if (fabs(val + 2.) > 0.0001) esl_fatal(msg);
2277 
2278   esl_alphabet_Destroy(a);
2279   return eslOK;
2280 }
2281 
2282 /* utest_IAvgScVec */
2283 /* utest_FAvgScVec */
2284 /* utest_DAvgScVec */
2285 /* utest_IExpectScVec */
2286 /* utest_FExpectScVec */
2287 /* utest_DExpectScVec */
2288 
2289 static int
utest_FCount(void)2290 utest_FCount(void)
2291 {
2292   char         *msg = "FCount unit test failure";
2293   ESL_ALPHABET *a = NULL;
2294   ESL_DSQ       x;
2295   char         *teststring = "X~-Z.UAX";
2296   char         *s;
2297   int           status;
2298 
2299   /* 0.1 from 2 X's; U -> +1 C; A -> +1 A;  Z-> +0.5 Q,E; ~ ignored; .,- -> +2 gaps */
2300   /*                          A    C    D    E    F    G    H    I    K    L    M    N    P    Q    R    S    T    V    W    Y    - */
2301   float       expect[21] = { 1.1, 1.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 2.0 };
2302   float      *vec;
2303 
2304   a = esl_alphabet_Create(eslAMINO);
2305   ESL_ALLOC(vec, sizeof(float) * (a->K+1)); /* include gap char for this test */
2306   esl_vec_FSet(vec, a->K+1, 0.);
2307   for (s = teststring; *s != '\0'; s++)
2308     {
2309       x = esl_abc_DigitizeSymbol(a, *s);
2310       if (esl_abc_FCount(a, vec, x, 1.0) != eslOK) esl_fatal(msg);
2311     }
2312   if (esl_vec_FCompare(vec, expect, a->K+1, 0.0001) != eslOK) esl_fatal(msg);
2313 
2314   esl_alphabet_Destroy(a);
2315   free(vec);
2316   return eslOK;
2317 
2318  ERROR:
2319   esl_fatal("allocation failed");
2320   return status;
2321 }
2322 
2323 static int
utest_DCount(void)2324 utest_DCount(void)
2325 {
2326   char         *msg = "DCount unit test failure";
2327   ESL_ALPHABET *a = NULL;
2328   ESL_DSQ       x;
2329   char         *teststring = "X~-Z.UAX";
2330   char         *s;
2331   int           status;
2332 
2333   /* 0.1 from 2 X's; U -> +1 C; A -> +1 A;  Z-> +0.5 Q,E; ~ ignored; .,- -> +2 gaps */
2334   /*                          A    C    D    E    F    G    H    I    K    L    M    N    P    Q    R    S    T    V    W    Y    - */
2335   double      expect[21] = { 1.1, 1.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 2.0 };
2336   double      *vec;
2337 
2338   a = esl_alphabet_Create(eslAMINO);
2339   ESL_ALLOC(vec, sizeof(double) * (a->K+1)); /* include gap char for this test */
2340   esl_vec_DSet(vec, a->K+1, 0.);
2341   for (s = teststring; *s != '\0'; s++)
2342     {
2343       x = esl_abc_DigitizeSymbol(a, *s);
2344       if (esl_abc_DCount(a, vec, x, 1.0) != eslOK) esl_fatal(msg);
2345     }
2346   if (esl_vec_DCompare(vec, expect, a->K+1, 0.0001) != eslOK) esl_fatal(msg);
2347 
2348   esl_alphabet_Destroy(a);
2349   free(vec);
2350   return eslOK;
2351 
2352  ERROR:
2353   esl_fatal("allocation failed");
2354   return status;
2355 }
2356 #endif /* eslALPHABET_TESTDRIVE*/
2357 /*-------------------- end, unit tests --------------------------*/
2358 
2359 
2360 
2361 
2362 /*****************************************************************
2363  * 5. Test driver.
2364  *****************************************************************/
2365 
2366 /* gcc -g -Wall -std=gnu99 -I.     -o esl_alphabet_utest -DeslALPHABET_TESTDRIVE esl_alphabet.c esl_vectorops.c easel.c -lm
2367  * gcc -g -Wall -std=gnu99 -I. -L. -o esl_alphabet_utest -DeslALPHABET_TESTDRIVE esl_alphabet.c -leasel
2368  * ./test
2369  * valgrind ./test
2370  */
2371 #ifdef eslALPHABET_TESTDRIVE
2372 static int basic_examples(void);
2373 
2374 #include "easel.h"
2375 #include "esl_alphabet.h"
2376 
2377 int
main(void)2378 main(void)
2379 {
2380   utest_Create();
2381   utest_CreateCustom();
2382   utest_SetEquiv();
2383   utest_SetCaseInsensitive();
2384   utest_SetDegeneracy();
2385   utest_SetIgnored();
2386   utest_Destroy();
2387 
2388   utest_CreateDsq();
2389   utest_Digitize();
2390   utest_Textize();
2391   utest_TextizeN();
2392   utest_dsqdup();
2393   utest_dsqcat();
2394 
2395   utest_FCount();
2396   utest_DCount();
2397 
2398   basic_examples();
2399   degeneracy_integer_scores();
2400   degeneracy_float_scores();
2401   degeneracy_double_scores();
2402 
2403   return eslOK;
2404 }
2405 
2406 static int
basic_examples(void)2407 basic_examples(void)
2408 {
2409   char *msg = "basic alphabet example tests failed";
2410   ESL_ALPHABET  *a1, *a2;
2411   char           dnaseq[] = "GARYtcN";
2412   char           aaseq[]  = "EFILqzU";
2413   int            L;
2414   ESL_DSQ       *dsq, *dsq2;
2415   int            i;
2416 
2417   /* Example 1.
2418    * Create a DNA alphabet; digitize a DNA sequence.
2419    */
2420   if ((a1 = esl_alphabet_Create(eslDNA)) == NULL)      esl_fatal(msg);
2421   L  = strlen(dnaseq);
2422   if ((dsq = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL) esl_fatal(msg);
2423   if (esl_abc_Digitize(a1, dnaseq, dsq) != eslOK)      esl_fatal(msg);
2424   if (esl_abc_dsqlen(dsq) != L)                        esl_fatal(msg);
2425   esl_alphabet_Destroy(a1);
2426 
2427   /* Example 2.
2428    * Create an RNA alphabet; digitize the same DNA sequence;
2429    * make sure it is equal to the dsq above (so T=U were
2430    * correctly synonymous on input).
2431    */
2432   if ((a2 = esl_alphabet_Create(eslRNA)) == NULL)       esl_fatal(msg);
2433   if ((dsq2 = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL) esl_fatal(msg);
2434   if (esl_abc_Digitize(a2, dnaseq, dsq2) != eslOK)      esl_fatal(msg);
2435   for (i = 1; i <= L; i++)
2436     if (dsq[i] != dsq2[i]) esl_fatal(msg);
2437   esl_alphabet_Destroy(a2);
2438 
2439   /* Example 3.
2440    * Create an amino alphabet; digitize a protein sequence,
2441    * while reusing memory already allocated in dsq.
2442    */
2443   if ((a1 = esl_alphabet_Create(eslAMINO)) == NULL)  esl_fatal(msg);
2444   if (esl_abc_Digitize(a1, aaseq, dsq) != eslOK)     esl_fatal(msg);
2445 
2446   /* Example 4.
2447    * Create a custom alphabet almost the same as the amino
2448    * acid alphabet; digitize the same protein seq, reusing
2449    * memory in dsq2; check that seqs are identical.
2450    */
2451   if ((a2 = esl_alphabet_CreateCustom("ACDEFGHIKLMNPQRSTVWY-BJZOUX~", 20, 28)) == NULL) esl_fatal(msg);
2452   if (esl_alphabet_SetCaseInsensitive(a2)   != eslOK)     esl_fatal(msg);  /* allow lower case input */
2453   if (esl_alphabet_SetDegeneracy(a2, 'Z', "QE") != eslOK) esl_fatal(msg);
2454 
2455   if (esl_abc_Digitize(a2, aaseq, dsq2) != eslOK)      esl_fatal(msg);
2456   for (i = 1; i <= L; i++)
2457     if (dsq[i] != dsq2[i]) esl_fatal(msg);
2458 
2459   /* clean up.
2460    */
2461   esl_alphabet_Destroy(a1);
2462   esl_alphabet_Destroy(a2);
2463   free(dsq);
2464   free(dsq2);
2465   return eslOK;
2466 }
2467 
2468 
2469 #endif /*eslALPHABET_TESTDRIVE*/
2470 
2471 /*****************************************************************
2472  * 6. Examples.
2473  *****************************************************************/
2474 
2475 /*   gcc -g -Wall -o example -I. -DeslALPHABET_EXAMPLE esl_alphabet.c easel.c
2476  */
2477 #ifdef eslALPHABET_EXAMPLE
2478 /*::cexcerpt::alphabet_example::begin::*/
2479 #include "easel.h"
2480 #include "esl_alphabet.h"
main(void)2481 int main(void)
2482 {
2483   ESL_ALPHABET *a;
2484   char          dnaseq[] = "GARYTC";
2485   int           L        = 6;
2486   ESL_DSQ      *dsq;
2487 
2488   a = esl_alphabet_Create(eslDNA);
2489 
2490   if ((dsq = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL)  esl_fatal("malloc failed");
2491   if (esl_abc_Digitize(a, dnaseq, dsq)       != eslOK)  esl_fatal("failed to digitize the sequence");
2492 
2493   free(dsq);
2494   esl_alphabet_Destroy(a);
2495   return 0;
2496 }
2497 /*::cexcerpt::alphabet_example::end::*/
2498 #endif /*eslALPHABET_EXAMPLE*/
2499 
2500 
2501 /*   gcc -g -Wall -o example -I. -DeslALPHABET_EXAMPLE2 esl_alphabet.c easel.c
2502  */
2503 #ifdef eslALPHABET_EXAMPLE2
2504 /*::cexcerpt::alphabet_example2::begin::*/
2505 #include "easel.h"
2506 #include "esl_alphabet.h"
main(void)2507 int main(void)
2508 {
2509   ESL_ALPHABET *a;
2510 
2511   /* 1. Create the base alphabet structure. */
2512   a = esl_alphabet_CreateCustom("ACDEFGHIKLMNOPQRSTUVWY-BJZX~", 22, 28);
2513 
2514   /* 2. Set your equivalences in the input map.  */
2515   esl_alphabet_SetEquiv(a, '.', '-');     /* allow . as a gap character too */
2516 
2517   /* 3. After all synonyms are set, (optionally) make map case-insensitive. */
2518   esl_alphabet_SetCaseInsensitive(a);       /* allow lower case input too */
2519 
2520   /* 4. Define your optional degeneracy codes in the alphabet, one at a time.
2521    *    The 'any' character X was automatically set up.  */
2522   esl_alphabet_SetDegeneracy(a, 'B', "DN"); /* read B as {D|N} */
2523   esl_alphabet_SetDegeneracy(a, 'J', "IL"); /* read B as {I|L} */
2524   esl_alphabet_SetDegeneracy(a, 'Z', "QE"); /* read Z as {Q|E} */
2525 
2526   /* 5. (do your stuff) */
2527 
2528   /* 6. Remember to free it when you're done with it. */
2529   esl_alphabet_Destroy(a);
2530   return 0;
2531 }
2532 /*::cexcerpt::alphabet_example2::end::*/
2533 #endif /*eslALPHABET_EXAMPLE2*/
2534 
2535 
2536 
2537 #ifdef eslALPHABET_EXAMPLE3
2538 #include "easel.h"
2539 #include "esl_alphabet.h"
2540 #include "esl_sq.h"
2541 #include "esl_sqio.h"
2542 
2543 int
main(int argc,char ** argv)2544 main(int argc, char **argv)
2545 {
2546   ESL_SQ     *sq      = esl_sq_Create();
2547   ESL_SQFILE *sqfp;
2548   int         format  = eslSQFILE_UNKNOWN;
2549   char       *seqfile = argv[1];
2550   int         type;
2551   int         status;
2552 
2553   status = esl_sqfile_Open(seqfile, format, NULL, &sqfp);
2554   if      (status == eslENOTFOUND) esl_fatal("No such file.");
2555   else if (status == eslEFORMAT)   esl_fatal("Format couldn't be determined.");
2556   else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);
2557 
2558   while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
2559   {
2560     esl_sq_GuessAlphabet(sq, &type);
2561     printf("%-25s %s\n", sq->name, esl_abc_DecodeType(type));
2562     esl_sq_Reuse(sq);
2563   }
2564   if      (status == eslEFORMAT) esl_fatal("Parse failed\n  %s", esl_sqfile_GetErrorBuf(sqfp));
2565   else if (status != eslEOF)     esl_fatal("Unexpected read error %d", status);
2566 
2567   esl_sq_Destroy(sq);
2568   esl_sqfile_Close(sqfp);
2569   return 0;
2570 }
2571 #endif /*eslALPHABET_EXAMPLE3*/
2572 
2573 
2574 
2575 
2576