1 /* @source embnmer ************************************************************
2 **
3 ** General routines for n-mer alignment.
4 **
5 ** @author Copyright (c) 1999 Gary Williams
6 ** @version $Revision: 1.14 $
7 ** @modified $Date: 2011/11/08 15:12:52 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA 02110-1301, USA.
24 **
25 ******************************************************************************/
26
27
28 #include "ajlib.h"
29
30 #include "embnmer.h"
31
32 #include <math.h>
33 #include <limits.h>
34
35
36
37
38 /*
39 ** This set of routines was written primarily for the compseq program.
40 ** Feel free to use it for anything you want.
41 **
42 ** compseq counts the frequency of n-mers (or words) in a sequence.
43 **
44 ** The easiest way to do this was to make a big unsigned ajlong ajint array
45 ** to hold the counts of each type of word.
46 **
47 ** I needed a way of converting a sequence word into an integer so that I
48 ** could increment the appropriate element of the array.
49 ** (embNmerNuc2int and embNmerProt2int)
50 **
51 ** I also needed a way of converting an integer back to a short sequence
52 ** so that I could display the word.
53 ** (embNmerInt2nuc and embNmerInt2prot)
54 **
55 ** embNmerGetNoElements returns the number of elements required to store the
56 ** results.
57 ** In other words it is the number of possible words of size n.
58 **
59 ** Gary Williams
60 */
61
62
63
64
65 /* @func embNmerNuc2int *******************************************************
66 **
67 ** Encodes a word of a nucleic sequence into a unsigned ajlong ajint number.
68 ** For a 4-byte unsigned ajlong ajint this will work if the word size
69 ** is less than 16, otherwise there aren't enough bits to hold the number.
70 **
71 ** @param [r] seq [const char *] the sequence to use
72 ** @param [r] wordsize [ajint] the size of word of sequence to convert to int
73 ** @param [r] offset [ajint] the offset into the sequence of the start of
74 ** the word
75 ** @param [w] otherflag [AjBool *] set true if the sequence contains
76 ** anything except A,G,C or T
77 **
78 ** @return [ajulong] the encoded word
79 **
80 ** @release 1.0.0
81 ******************************************************************************/
82
embNmerNuc2int(const char * seq,ajint wordsize,ajint offset,AjBool * otherflag)83 ajulong embNmerNuc2int(const char *seq, ajint wordsize, ajint offset,
84 AjBool *otherflag)
85 {
86 ajint i;
87 ajulong result = 0;
88 char c;
89
90 *otherflag = ajFalse;
91
92 for(i=0; i<wordsize; i++)
93 {
94 c = seq[offset+i];
95 result <<= 2;
96
97 if(c == 'A')
98 {
99 /* result |= 0; */
100 }
101 else if(c == 'C')
102 result |= 1;
103 else if(c == 'G')
104 result |= 2;
105 else if(c == 'T')
106 result |= 3;
107 else
108 {
109
110 *otherflag = ajTrue;
111
112 return 0;
113 }
114 }
115
116 return result;
117 }
118
119
120
121
122 /* @func embNmerInt2nuc *******************************************************
123 **
124 ** Decodes a unsigned long int number into a word of a nucleic sequence.
125 ** The returned nucleic sequence is pre-pended to anything already in
126 ** the string 'seq'.
127 **
128 ** @param [w] seq [AjPStr *] the returned sequence
129 ** @param [r] wordsize [ajint] the size of word to produce
130 ** @param [r] value [ajulong] the number to decode
131 **
132 ** @return [ajint] not used
133 **
134 ** @release 1.0.0
135 ******************************************************************************/
136
embNmerInt2nuc(AjPStr * seq,ajint wordsize,ajulong value)137 ajint embNmerInt2nuc(AjPStr *seq, ajint wordsize, ajulong value)
138 {
139 char bases[] = "ACGT";
140 char store[2];
141 ajint i;
142
143 store[1] = '\0'; /* make a one-character NULL-teminated char* */
144
145 for(i=0; i<wordsize; i++)
146 {
147 store[0] = bases[value & 3];
148 ajStrInsertC(seq, 0, store);
149 value >>= 2;
150 }
151
152 return 1;
153 }
154
155
156
157
158 /* @func embNmerProt2int ******************************************************
159 **
160 ** Encodes a word of a protein sequence into a unsigned ajlong ajint number.
161 ** For a 4-byte unsigned ajlong ajint this will work if the word size
162 ** is less than 8, otherwise there aren't enough bits to hold the number.
163 **
164 ** @param [r] seq [const char *] the sequence to use
165 ** @param [r] wordsize [ajint] the size of word of sequence to convert to int
166 ** @param [r] offset [ajint] the offset into the sequence of the start of
167 ** the word
168 ** @param [w] otherflag [AjBool *] set true if the sequence contains
169 ** anything other than valid amino acid codes
170 ** @param [r] ignorebz [AjBool] true if B and Z are to be treated as
171 ** non-valid residues
172 **
173 ** @return [ajulong] the encoded word
174 **
175 ** @release 1.0.0
176 ******************************************************************************/
177
embNmerProt2int(const char * seq,ajint wordsize,ajint offset,AjBool * otherflag,AjBool ignorebz)178 ajulong embNmerProt2int(const char *seq, ajint wordsize, ajint offset,
179 AjBool *otherflag, AjBool ignorebz)
180 {
181 ajint i;
182 ajulong result = 0;
183 ajint c;
184 ajint aa;
185 ajint noaa;
186 ajint *table;
187
188 /* The following tables represent a to z */
189
190 ajint table21[] =
191 {
192 0, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11, -1, 12, 13,
193 14, 15, 16, 17, 18, 19, -1, 20, -1, -1, -1, -1, -1, -1
194 };
195
196 ajint table23[] =
197 {
198 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12, -1, 13, 14,
199 15, 16, 17, 18, 19, 20, -1, 21, 22, -1, -1, -1, -1, -1
200 };
201
202 /* Test for B and Z as acceptable amino acids */
203 if(ignorebz)
204 {
205 table = table21;
206 noaa = 21;
207 }
208 else
209 {
210 table = table23;
211 noaa = 23;
212 }
213
214 *otherflag = ajFalse;
215
216 for(i=0; i<wordsize; i++)
217 {
218 c = (ajint) seq[offset+i];
219 result *= noaa;
220 c = c - (ajint) 'A';
221
222 if(c < 0 || c > 31)
223 {
224 *otherflag = ajTrue;
225
226 return 0;
227 }
228
229 aa = table[c];
230
231 if(aa == -1)
232 {
233 *otherflag = ajTrue;
234
235 return 0;
236 }
237
238 result += aa;
239 }
240
241 return result;
242 }
243
244
245
246
247 /* @func embNmerInt2prot ******************************************************
248 **
249 ** Decodes a unsigned long int number into a word of a protein sequence.
250 ** The returned protein sequence is pre-pended to anything already in
251 ** the string 'seq'.
252 **
253 ** @param [w] seq [AjPStr *] the returned sequence
254 ** @param [r] wordsize [ajint] the size of word to produce
255 ** @param [r] value [ajulong] the number to decode
256 ** @param [r] ignorebz [AjBool] true if B and Z are to be treated
257 ** as non-valid residues
258 **
259 ** @return [ajint] not used
260 **
261 ** @release 1.0.0
262 ******************************************************************************/
263
embNmerInt2prot(AjPStr * seq,ajint wordsize,ajulong value,AjBool ignorebz)264 ajint embNmerInt2prot(AjPStr *seq, ajint wordsize, ajulong value,
265 AjBool ignorebz)
266 {
267
268 char aas21[] = "ACDEFGHIKLMNPQRSTUVWY";
269 char aas23[] = "ABCDEFGHIKLMNPQRSTUVWYZ";
270 char store[2];
271 ajint i;
272 ajint noaa;
273 char *aas;
274
275 /* Test for B and Z as acceptable amino acids */
276 if(ignorebz)
277 {
278 aas = aas21;
279 noaa = 21;
280 }
281 else
282 {
283 aas = aas23;
284 noaa = 23;
285 }
286
287 store[1] = '\0'; /* make a one-character NULL-teminated char* */
288
289 for(i=0; i<wordsize; i++)
290 {
291 store[0] = aas[value % noaa];
292 ajStrInsertC(seq, 0, store);
293 value /= noaa;
294 }
295
296 return 1;
297 }
298
299
300
301
302 /* @func embNmerGetNoElements *************************************************
303 **
304 ** Calculates the maximum number required to encode a sequence of size 'word'.
305 **
306 ** @param [w] no_elements [ajulong*] the returned number
307 ** @param [r] word [ajint] the size of word to produce
308 ** @param [r] seqisnuc [AjBool] True is the sequence is nucleic,
309 ** False if protein
310 ** @param [r] ignorebz [AjBool] true if B and Z are to be treated as
311 ** non-valid residues
312 **
313 ** @return [AjBool] True if the word is small enough to be encoded
314 ** in an ajulong
315 **
316 ** @release 1.0.0
317 ******************************************************************************/
318
embNmerGetNoElements(ajulong * no_elements,ajint word,AjBool seqisnuc,AjBool ignorebz)319 AjBool embNmerGetNoElements(ajulong *no_elements, ajint word,
320 AjBool seqisnuc, AjBool ignorebz)
321 {
322 double result;
323 float ccfix = (float) word;
324 static ajulong maxlong = UINT_MAX;
325
326 /*
327 ** UINT_MAX used because the gcc compiler defines ULONG_MAX
328 ** as a strange expression including LONG_MAX which is signed
329 ** and this makes the whole conversion use signed values, and
330 ** maybe even int rather than long
331 */
332
333 ajDebug("embNmerGetNoElements( %d, %b, %b )\n", word, seqisnuc, ignorebz);
334
335 if(seqisnuc)
336 result = pow(4.0, ccfix);
337 else
338 {
339 if(ignorebz)
340 result = pow(21.0, ccfix);
341 else
342 result = pow(23.0, ccfix);
343 }
344
345 ajDebug("...result: %.3f max: %.3f\n", result, (double) maxlong);
346
347 /* if the resulting number is too long for an ajulong, return false */
348 if(result-1 > (double)maxlong)
349 return ajFalse;
350
351 *no_elements = (ajulong)result;
352
353 return ajTrue;
354 }
355