1 /* @source embnmer ************************************************************
2 **
3 ** General routines for n-mer alignment.
4 **
5 ** @author Copyright (c) 1999 Gary Williams
6 ** @version $Revision: 1.14 $
7 ** @modified $Date: 2011/11/08 15:12:52 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA  02110-1301,  USA.
24 **
25 ******************************************************************************/
26 
27 
28 #include "ajlib.h"
29 
30 #include "embnmer.h"
31 
32 #include <math.h>
33 #include <limits.h>
34 
35 
36 
37 
38 /*
39 ** This set of routines was written primarily for the compseq program.
40 ** Feel free to use it for anything you want.
41 **
42 ** compseq counts the frequency of n-mers (or words) in a sequence.
43 **
44 ** The easiest way to do this was to make a big unsigned ajlong ajint array
45 ** to hold the counts of each type of word.
46 **
47 ** I needed a way of converting a sequence word into an integer so that I
48 ** could increment the appropriate element of the array.
49 ** (embNmerNuc2int and embNmerProt2int)
50 **
51 ** I also needed a way of converting an integer back to a short sequence
52 ** so that I could display the word.
53 ** (embNmerInt2nuc and embNmerInt2prot)
54 **
55 ** embNmerGetNoElements returns the number of elements required to store the
56 ** results.
57 ** In other words it is the number of possible words of size n.
58 **
59 ** Gary Williams
60 */
61 
62 
63 
64 
65 /* @func embNmerNuc2int *******************************************************
66 **
67 ** Encodes a word of a nucleic sequence into a unsigned ajlong ajint number.
68 ** For a 4-byte unsigned ajlong ajint this will work if the word size
69 ** is less than 16, otherwise there aren't enough bits to hold the number.
70 **
71 ** @param [r] seq [const char *] the sequence to use
72 ** @param [r] wordsize [ajint] the size of word of sequence to convert to int
73 ** @param [r] offset [ajint] the offset into the sequence of the start of
74 **                         the word
75 ** @param [w] otherflag [AjBool *] set true if the sequence contains
76 **                                 anything except A,G,C or T
77 **
78 ** @return [ajulong] the encoded word
79 **
80 ** @release 1.0.0
81 ******************************************************************************/
82 
embNmerNuc2int(const char * seq,ajint wordsize,ajint offset,AjBool * otherflag)83 ajulong embNmerNuc2int(const char *seq, ajint wordsize, ajint offset,
84 		       AjBool *otherflag)
85 {
86     ajint i;
87     ajulong result = 0;
88     char c;
89 
90     *otherflag = ajFalse;
91 
92     for(i=0; i<wordsize; i++)
93     {
94 	c = seq[offset+i];
95 	result <<= 2;
96 
97 	if(c == 'A')
98 	{
99 	    /* result |= 0; */
100 	}
101 	else if(c == 'C')
102 	    result |= 1;
103 	else if(c == 'G')
104 	    result |= 2;
105 	else if(c == 'T')
106 	    result |= 3;
107 	else
108 	{
109 
110 	    *otherflag = ajTrue;
111 
112 	    return 0;
113 	}
114     }
115 
116     return result;
117 }
118 
119 
120 
121 
122 /* @func embNmerInt2nuc *******************************************************
123 **
124 ** Decodes a unsigned long int number into a word of a nucleic sequence.
125 ** The returned nucleic sequence is pre-pended to anything already in
126 ** the string 'seq'.
127 **
128 ** @param [w] seq [AjPStr *] the returned sequence
129 ** @param [r] wordsize [ajint] the size of word to produce
130 ** @param [r] value [ajulong] the number to decode
131 **
132 ** @return [ajint] not used
133 **
134 ** @release 1.0.0
135 ******************************************************************************/
136 
embNmerInt2nuc(AjPStr * seq,ajint wordsize,ajulong value)137 ajint embNmerInt2nuc(AjPStr *seq, ajint wordsize, ajulong value)
138 {
139     char bases[] = "ACGT";
140     char store[2];
141     ajint i;
142 
143     store[1] = '\0';   /* make a one-character NULL-teminated char* */
144 
145     for(i=0; i<wordsize; i++)
146     {
147 	store[0] = bases[value & 3];
148 	ajStrInsertC(seq, 0, store);
149 	value >>= 2;
150     }
151 
152     return 1;
153 }
154 
155 
156 
157 
158 /* @func embNmerProt2int ******************************************************
159 **
160 ** Encodes a word of a protein sequence into a unsigned ajlong ajint number.
161 ** For a 4-byte unsigned ajlong ajint this will work if the word size
162 ** is less than 8, otherwise there aren't enough bits to hold the number.
163 **
164 ** @param [r] seq [const char *] the sequence to use
165 ** @param [r] wordsize [ajint] the size of word of sequence to convert to int
166 ** @param [r] offset [ajint] the offset into the sequence of the start of
167 **                         the word
168 ** @param [w] otherflag [AjBool *] set true if the sequence contains
169 **                                 anything other than valid amino acid codes
170 ** @param [r] ignorebz [AjBool] true if B and Z are to be treated as
171 **                              non-valid residues
172 **
173 ** @return [ajulong] the encoded word
174 **
175 ** @release 1.0.0
176 ******************************************************************************/
177 
embNmerProt2int(const char * seq,ajint wordsize,ajint offset,AjBool * otherflag,AjBool ignorebz)178 ajulong embNmerProt2int(const char *seq, ajint wordsize, ajint offset,
179 			AjBool *otherflag, AjBool ignorebz)
180 {
181     ajint i;
182     ajulong result = 0;
183     ajint c;
184     ajint aa;
185     ajint noaa;
186     ajint *table;
187 
188     /*  The following tables represent a to z */
189 
190     ajint table21[] =
191     {
192  	0, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11, -1, 12, 13,
193 	14, 15, 16, 17, 18, 19, -1, 20, -1, -1, -1, -1, -1, -1
194     };
195 
196     ajint table23[] =
197     {
198 	0, 1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12, -1, 13, 14,
199 	15, 16, 17, 18, 19, 20, -1, 21, 22, -1, -1, -1, -1, -1
200     };
201 
202     /* Test for B and Z as acceptable amino acids */
203     if(ignorebz)
204     {
205 	table = table21;
206 	noaa  = 21;
207     }
208     else
209     {
210 	table = table23;
211 	noaa  = 23;
212     }
213 
214     *otherflag = ajFalse;
215 
216     for(i=0; i<wordsize; i++)
217     {
218 	c = (ajint) seq[offset+i];
219 	result *= noaa;
220 	c = c - (ajint) 'A';
221 
222 	if(c < 0 || c > 31)
223 	{
224 	    *otherflag = ajTrue;
225 
226 	    return 0;
227 	}
228 
229 	aa = table[c];
230 
231 	if(aa == -1)
232 	{
233 	    *otherflag = ajTrue;
234 
235 	    return 0;
236 	}
237 
238 	result += aa;
239     }
240 
241     return result;
242 }
243 
244 
245 
246 
247 /* @func embNmerInt2prot ******************************************************
248 **
249 ** Decodes a unsigned long int number into a word of a protein sequence.
250 ** The returned protein sequence is pre-pended to anything already in
251 ** the string 'seq'.
252 **
253 ** @param [w] seq [AjPStr *] the returned sequence
254 ** @param [r] wordsize [ajint] the size of word to produce
255 ** @param [r] value [ajulong] the number to decode
256 ** @param [r] ignorebz [AjBool] true if B and Z are to be treated
257 **                                as non-valid residues
258 **
259 ** @return [ajint] not used
260 **
261 ** @release 1.0.0
262 ******************************************************************************/
263 
embNmerInt2prot(AjPStr * seq,ajint wordsize,ajulong value,AjBool ignorebz)264 ajint embNmerInt2prot(AjPStr *seq, ajint wordsize, ajulong value,
265 		      AjBool ignorebz)
266 {
267 
268     char aas21[] = "ACDEFGHIKLMNPQRSTUVWY";
269     char aas23[] = "ABCDEFGHIKLMNPQRSTUVWYZ";
270     char store[2];
271     ajint i;
272     ajint noaa;
273     char *aas;
274 
275     /* Test for B and Z as acceptable amino acids */
276     if(ignorebz)
277     {
278 	aas  = aas21;
279 	noaa = 21;
280     }
281     else
282     {
283 	aas  = aas23;
284 	noaa = 23;
285     }
286 
287     store[1] = '\0';   /* make a one-character NULL-teminated char* */
288 
289     for(i=0; i<wordsize; i++)
290     {
291 	store[0] = aas[value % noaa];
292 	ajStrInsertC(seq, 0, store);
293 	value /= noaa;
294     }
295 
296     return 1;
297 }
298 
299 
300 
301 
302 /* @func embNmerGetNoElements *************************************************
303 **
304 ** Calculates the maximum number required to encode a sequence of size 'word'.
305 **
306 ** @param [w] no_elements [ajulong*] the returned number
307 ** @param [r] word [ajint] the size of word to produce
308 ** @param [r] seqisnuc [AjBool] True is the sequence is nucleic,
309 **                              False if protein
310 ** @param [r] ignorebz [AjBool] true if B and Z are to be treated as
311 **                                non-valid residues
312 **
313 ** @return [AjBool] True if the word is small enough to be encoded
314 **                   in an ajulong
315 **
316 ** @release 1.0.0
317 ******************************************************************************/
318 
embNmerGetNoElements(ajulong * no_elements,ajint word,AjBool seqisnuc,AjBool ignorebz)319 AjBool embNmerGetNoElements(ajulong *no_elements, ajint word,
320 			    AjBool seqisnuc, AjBool ignorebz)
321 {
322     double result;
323     float ccfix = (float) word;
324     static ajulong maxlong = UINT_MAX;
325 
326     /*
327     ** UINT_MAX used because the gcc compiler defines ULONG_MAX
328     ** as a strange expression including LONG_MAX which is signed
329     ** and this makes the whole conversion use signed values, and
330     ** maybe even int rather than long
331     */
332 
333     ajDebug("embNmerGetNoElements( %d, %b, %b )\n", word, seqisnuc, ignorebz);
334 
335     if(seqisnuc)
336 	result = pow(4.0, ccfix);
337     else
338     {
339 	if(ignorebz)
340 	    result =  pow(21.0, ccfix);
341 	else
342 	    result =  pow(23.0, ccfix);
343     }
344 
345     ajDebug("...result: %.3f max: %.3f\n", result, (double) maxlong);
346 
347     /* if the resulting number is too long for an ajulong, return false */
348     if(result-1 > (double)maxlong)
349 	return ajFalse;
350 
351     *no_elements = (ajulong)result;
352 
353     return ajTrue;
354 }
355