1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
4 This file is part of Velvet.
5
6 Velvet is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 Velvet is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Velvet; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20 */
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdio.h>
24
25 #include "globals.h"
26 #include "tightString.h"
27 #include "utility.h"
28
29 typedef unsigned char Codon;
30
31 struct tString_st {
32 Codon *sequence;
33 IDnum length;
34 } ATTRIBUTE_PACKED;
35
36 static const Nucleotide Adenine = 0;
37 static const Nucleotide Cytosine = 1;
38 static const Nucleotide Guanine = 2;
39 static const Nucleotide Thymine = 3;
40
41 // Binary 11111100
42 static const Codon FILTER0 = ~((Codon) 3);
43 // Binary 11110011
44 static const Codon FILTER1 = ~(((Codon) 3) << 2);
45 // Binary 11001111
46 static const Codon FILTER2 = ~(((Codon) 3) << 4);
47 // Binary 00111111
48 static const Codon FILTER3 = (Codon) ~ ((Codon) 3 << 6);
49
50 //
51 // Adds a number into the Codon pointed to by codonPtr at the desired
52 // position (0, 1, 2, or 3);
53 //
54 void
writeNucleotideNumber(Nucleotide nucleotide,Codon * codonPtr,Coordinate position)55 writeNucleotideNumber(Nucleotide nucleotide, Codon * codonPtr,
56 Coordinate position)
57 {
58 if (position == 3) {
59 *codonPtr &= FILTER3;
60 *codonPtr += nucleotide << 6;
61 } else if (position == 2) {
62 *codonPtr &= FILTER2;
63 *codonPtr += nucleotide << 4;
64 } else if (position == 1) {
65 *codonPtr &= FILTER1;
66 *codonPtr += nucleotide << 2;
67 } else if (position == 0) {
68 *codonPtr &= FILTER0;
69 *codonPtr += nucleotide;
70 }
71 }
72
getTightStringInArray(TightString * tString,IDnum position)73 TightString *getTightStringInArray(TightString * tString,
74 IDnum position)
75 {
76 return tString + position;
77 }
78
79 //
80 // Adds a nucleotide into the Codon pointed to by codonPtr at the desired
81 // position (0, 1, 2, or 3);
82 //
writeNucleotide(Nucleotide nucleotide,Codon * codonPtr,int position)83 void writeNucleotide(Nucleotide nucleotide, Codon * codonPtr, int position)
84 {
85 int nucleotideNum;
86
87 switch (nucleotide) {
88 case 'A':
89 nucleotideNum = Adenine;
90 break;
91 case 'C':
92 nucleotideNum = Cytosine;
93 break;
94 case 'G':
95 nucleotideNum = Guanine;
96 break;
97 case 'T':
98 nucleotideNum = Thymine;
99 break;
100 case 'N':
101 nucleotideNum = Adenine;
102 break;
103 case 'a':
104 nucleotideNum = Adenine;
105 break;
106 case 'c':
107 nucleotideNum = Cytosine;
108 break;
109 case 'g':
110 nucleotideNum = Guanine;
111 break;
112 case 't':
113 nucleotideNum = Thymine;
114 break;
115 case 'n':
116 nucleotideNum = Adenine;
117 default:
118 nucleotideNum = Adenine;
119 }
120
121 writeNucleotideNumber(nucleotideNum, codonPtr, position);
122 }
123
fillTightStringWithString(TightString * tString,char * sequence,Codon * newSequence)124 static void fillTightStringWithString(TightString * tString,
125 char *sequence,
126 Codon *newSequence)
127 {
128 int index;
129
130 tString->sequence = newSequence;
131 for (index = 0; index < tString->length; index++)
132 writeNucleotide(sequence[index],
133 &(newSequence[index / 4]),
134 index % 4);
135 free(sequence);
136 }
137
138 //
139 // Creates a tightString from an array of normal strings
140 //
newTightStringArrayFromStringArray(char ** sequences,IDnum sequenceCount,char ** tSeqMem)141 TightString *newTightStringArrayFromStringArray(char **sequences,
142 IDnum sequenceCount,
143 char **tSeqMem)
144 {
145 IDnum sequenceIndex;
146 Codon *tmp;
147 TightString *tStringArray = mallocOrExit(sequenceCount, TightString);
148 Coordinate totalLength = 0;
149 int arrayLength;
150
151 for (sequenceIndex = 0; sequenceIndex < sequenceCount; sequenceIndex++)
152 {
153 tStringArray[sequenceIndex].length = strlen (sequences[sequenceIndex]);
154 arrayLength = (tStringArray[sequenceIndex].length + 3) / 4;
155 totalLength += arrayLength;
156 }
157 *tSeqMem = callocOrExit (totalLength, char);
158 tmp = (Codon*)*tSeqMem;
159 for (sequenceIndex = 0; sequenceIndex < sequenceCount; sequenceIndex++)
160 {
161 fillTightStringWithString (&tStringArray[sequenceIndex],
162 sequences[sequenceIndex],
163 tmp);
164 arrayLength = (tStringArray[sequenceIndex].length + 3) / 4;
165 tmp += arrayLength;
166 }
167
168 free(sequences);
169 return tStringArray;
170 }
171
readNucleotide(Nucleotide nucleotide)172 char readNucleotide(Nucleotide nucleotide)
173 {
174 switch (nucleotide) {
175 case 0:
176 return 'A';
177 case 1:
178 return 'C';
179 case 2:
180 return 'G';
181 case 3:
182 return 'T';
183 }
184
185 return '?';
186 }
187
readTightString(TightString * tString)188 char *readTightString(TightString * tString)
189 {
190 Coordinate index, index4;
191 char *string;
192 Codon codon;
193
194 if (tString == NULL || tString->length == 0) {
195 string = callocOrExit(5, char);
196 strcpy(string, "VOID");
197 return string;
198 }
199
200 string = callocOrExit(tString->length + 1, char);
201
202 for (index = 0; index < tString->length / 4; index++) {
203 index4 = index << 2;
204 codon = tString->sequence[index];
205 string[index4] = readNucleotide(codon & 3);
206 string[index4 + 1] = readNucleotide((codon & 12) >> 2);
207 string[index4 + 2] = readNucleotide((codon & 48) >> 4);
208 string[index4 + 3] = readNucleotide((codon & 192) >> 6);
209 }
210
211 index4 = index << 2;
212 codon = tString->sequence[index];
213
214 switch (tString->length % 4) {
215 case 3:
216 string[index4 + 3] = readNucleotide((codon & 192) >> 6);
217 case 2:
218 string[index4 + 2] = readNucleotide((codon & 48) >> 4);
219 case 1:
220 string[index4 + 1] = readNucleotide((codon & 12) >> 2);
221 case 0:
222 string[index4] = readNucleotide(codon & 3);
223 }
224
225 string[tString->length] = '\0';
226
227 return string;
228 }
229
getNucleotide(Coordinate nucleotideIndex,TightString * tString)230 Nucleotide getNucleotide(Coordinate nucleotideIndex, TightString * tString)
231 {
232 Codon codon = tString->sequence[nucleotideIndex / 4];
233
234 switch (nucleotideIndex % 4) {
235 case 3:
236 return (codon & 192) >> 6;
237 case 2:
238 return (codon & 48) >> 4;
239 case 1:
240 return (codon & 12) >> 2;
241 case 0:
242 return (codon & 3);
243 }
244
245 return '?';
246 }
247
readTightStringFragment(TightString * tString,Coordinate start,Coordinate finish,char * string)248 void readTightStringFragment(TightString * tString, Coordinate start,
249 Coordinate finish, char *string)
250 {
251 Coordinate index;
252 Coordinate inFinish = finish;
253
254 if (start >= tString->length) {
255 string[0] = '\0';
256 return;
257 }
258
259 if (inFinish > tString->length)
260 inFinish = tString->length;
261
262 for (index = start; index < inFinish; index++) {
263 string[index - start] =
264 readNucleotide(getNucleotide(index, tString));
265 }
266
267 string[inFinish - start] = '\0';
268 }
269
getNucleotideChar(Coordinate nucleotideIndex,TightString * tString)270 char getNucleotideChar(Coordinate nucleotideIndex, TightString * tString)
271 {
272 Codon codon;
273
274 codon = tString->sequence[nucleotideIndex / 4];
275
276 switch (nucleotideIndex % 4) {
277 case 3:
278 return readNucleotide((codon & 192) >> 6);
279 case 2:
280 return readNucleotide((codon & 48) >> 4);
281 case 1:
282 return readNucleotide((codon & 12) >> 2);
283 case 0:
284 return readNucleotide((codon & 3));
285 }
286
287 return '?';
288 }
289
newTightString(Coordinate length)290 TightString *newTightString(Coordinate length)
291 {
292 Coordinate arrayLength = length / 4;
293 Coordinate index;
294 TightString *newTString = mallocOrExit(1, TightString);
295 if (length % 4 > 0)
296 arrayLength++;
297
298 newTString->length = length;
299 newTString->sequence = callocOrExit(arrayLength, Codon);
300
301 for (index = 0; index < arrayLength; index++)
302 newTString->sequence[index] = 0;
303
304 return newTString;
305 }
306
307 void
writeNucleotideAtPosition(Nucleotide nucleotide,Coordinate position,TightString * tString)308 writeNucleotideAtPosition(Nucleotide nucleotide, Coordinate position,
309 TightString * tString)
310 {
311 if (position >= tString->length)
312 return;
313
314 writeNucleotideNumber(nucleotide,
315 &tString->sequence[position / 4],
316 position % 4);
317 }
318
getLength(TightString * tString)319 IDnum getLength(TightString * tString)
320 {
321 return tString->length;
322 }
323
destroyTightString(TightString * tString)324 void destroyTightString(TightString * tString)
325 {
326 free(tString->sequence);
327 free(tString);
328 }
329
setTightStringLength(TightString * tString,Coordinate length)330 void setTightStringLength(TightString * tString, Coordinate length)
331 {
332 Coordinate arrayLength = tString->length / 4;
333 Coordinate newArrayLength = length / 4;
334 if (tString->length % 4 > 0)
335 arrayLength++;
336 if (length % 4 > 0)
337 newArrayLength++;
338
339 if (newArrayLength > arrayLength) {
340 tString->sequence =
341 reallocOrExit(tString->sequence,
342 newArrayLength, Codon);
343 arrayLength = newArrayLength;
344 }
345
346 tString->length = length;
347 }
348
exportTightString(FILE * outfile,TightString * sequence,IDnum index)349 void exportTightString(FILE * outfile, TightString * sequence, IDnum index)
350 {
351 Coordinate start, finish;
352 char str[100];
353
354 if (sequence == NULL)
355 return;
356
357 velvetFprintf(outfile, ">SEQUENCE_%ld_length_%lld\n", (long) index,
358 (long long) getLength(sequence));
359
360 start = 0;
361 while (start <= getLength(sequence)) {
362 finish = start + 60;
363 readTightStringFragment(sequence, start, finish, str);
364 velvetFprintf(outfile, "%s\n", str);
365 start = finish;
366 }
367
368 fflush(outfile);
369 }
370