1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3 
4     This file is part of Velvet.
5 
6     Velvet is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10 
11     Velvet is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with Velvet; if not, write to the Free Software
18     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdio.h>
24 
25 #include "globals.h"
26 #include "tightString.h"
27 #include "utility.h"
28 
29 typedef unsigned char Codon;
30 
31 struct tString_st {
32 	Codon *sequence;
33 	IDnum length;
34 }  ATTRIBUTE_PACKED;
35 
36 static const Nucleotide Adenine = 0;
37 static const Nucleotide Cytosine = 1;
38 static const Nucleotide Guanine = 2;
39 static const Nucleotide Thymine = 3;
40 
41 // Binary 11111100
42 static const Codon FILTER0 = ~((Codon) 3);
43 // Binary 11110011
44 static const Codon FILTER1 = ~(((Codon) 3) << 2);
45 // Binary 11001111
46 static const Codon FILTER2 = ~(((Codon) 3) << 4);
47 // Binary 00111111
48 static const Codon FILTER3 = (Codon) ~ ((Codon) 3 << 6);
49 
50 //
51 // Adds a number into the Codon pointed to by codonPtr at the desired
52 // position (0, 1, 2, or 3);
53 //
54 void
writeNucleotideNumber(Nucleotide nucleotide,Codon * codonPtr,Coordinate position)55 writeNucleotideNumber(Nucleotide nucleotide, Codon * codonPtr,
56 		      Coordinate position)
57 {
58 	if (position == 3) {
59 		*codonPtr &= FILTER3;
60 		*codonPtr += nucleotide << 6;
61 	} else if (position == 2) {
62 		*codonPtr &= FILTER2;
63 		*codonPtr += nucleotide << 4;
64 	} else if (position == 1) {
65 		*codonPtr &= FILTER1;
66 		*codonPtr += nucleotide << 2;
67 	} else if (position == 0) {
68 		*codonPtr &= FILTER0;
69 		*codonPtr += nucleotide;
70 	}
71 }
72 
getTightStringInArray(TightString * tString,IDnum position)73 TightString *getTightStringInArray(TightString * tString,
74 				   IDnum	 position)
75 {
76 	return tString + position;
77 }
78 
79 //
80 // Adds a nucleotide into the Codon pointed to by codonPtr at the desired
81 // position (0, 1, 2, or 3);
82 //
writeNucleotide(Nucleotide nucleotide,Codon * codonPtr,int position)83 void writeNucleotide(Nucleotide nucleotide, Codon * codonPtr, int position)
84 {
85 	int nucleotideNum;
86 
87 	switch (nucleotide) {
88 	case 'A':
89 		nucleotideNum = Adenine;
90 		break;
91 	case 'C':
92 		nucleotideNum = Cytosine;
93 		break;
94 	case 'G':
95 		nucleotideNum = Guanine;
96 		break;
97 	case 'T':
98 		nucleotideNum = Thymine;
99 		break;
100 	case 'N':
101 		nucleotideNum = Adenine;
102 		break;
103 	case 'a':
104 		nucleotideNum = Adenine;
105 		break;
106 	case 'c':
107 		nucleotideNum = Cytosine;
108 		break;
109 	case 'g':
110 		nucleotideNum = Guanine;
111 		break;
112 	case 't':
113 		nucleotideNum = Thymine;
114 		break;
115 	case 'n':
116 		nucleotideNum = Adenine;
117 	default:
118 		nucleotideNum = Adenine;
119 	}
120 
121 	writeNucleotideNumber(nucleotideNum, codonPtr, position);
122 }
123 
fillTightStringWithString(TightString * tString,char * sequence,Codon * newSequence)124 static void fillTightStringWithString(TightString * tString,
125 				      char *sequence,
126 				      Codon *newSequence)
127 {
128 	int index;
129 
130 	tString->sequence = newSequence;
131 	for (index = 0; index < tString->length; index++)
132 		writeNucleotide(sequence[index],
133 				&(newSequence[index / 4]),
134 				index % 4);
135 	free(sequence);
136 }
137 
138 //
139 // Creates a tightString from an array of normal strings
140 //
newTightStringArrayFromStringArray(char ** sequences,IDnum sequenceCount,char ** tSeqMem)141 TightString *newTightStringArrayFromStringArray(char **sequences,
142 						IDnum sequenceCount,
143 						char **tSeqMem)
144 {
145 	IDnum sequenceIndex;
146 	Codon *tmp;
147 	TightString *tStringArray = mallocOrExit(sequenceCount, TightString);
148 	Coordinate totalLength = 0;
149 	int arrayLength;
150 
151 	for (sequenceIndex = 0; sequenceIndex < sequenceCount; sequenceIndex++)
152 	{
153 		tStringArray[sequenceIndex].length = strlen (sequences[sequenceIndex]);
154 		arrayLength = (tStringArray[sequenceIndex].length + 3) / 4;
155 		totalLength += arrayLength;
156 	}
157 	*tSeqMem = callocOrExit (totalLength, char);
158 	tmp = (Codon*)*tSeqMem;
159 	for (sequenceIndex = 0; sequenceIndex < sequenceCount; sequenceIndex++)
160 	{
161 		fillTightStringWithString (&tStringArray[sequenceIndex],
162 					   sequences[sequenceIndex],
163 					   tmp);
164 		arrayLength = (tStringArray[sequenceIndex].length + 3) / 4;
165 		tmp += arrayLength;
166 	}
167 
168 	free(sequences);
169 	return tStringArray;
170 }
171 
readNucleotide(Nucleotide nucleotide)172 char readNucleotide(Nucleotide nucleotide)
173 {
174 	switch (nucleotide) {
175 	case 0:
176 		return 'A';
177 	case 1:
178 		return 'C';
179 	case 2:
180 		return 'G';
181 	case 3:
182 		return 'T';
183 	}
184 
185 	return '?';
186 }
187 
readTightString(TightString * tString)188 char *readTightString(TightString * tString)
189 {
190 	Coordinate index, index4;
191 	char *string;
192 	Codon codon;
193 
194 	if (tString == NULL || tString->length == 0) {
195 		string = callocOrExit(5, char);
196 		strcpy(string, "VOID");
197 		return string;
198 	}
199 
200 	string = callocOrExit(tString->length + 1, char);
201 
202 	for (index = 0; index < tString->length / 4; index++) {
203 		index4 = index << 2;
204 		codon = tString->sequence[index];
205 		string[index4] = readNucleotide(codon & 3);
206 		string[index4 + 1] = readNucleotide((codon & 12) >> 2);
207 		string[index4 + 2] = readNucleotide((codon & 48) >> 4);
208 		string[index4 + 3] = readNucleotide((codon & 192) >> 6);
209 	}
210 
211 	index4 = index << 2;
212 	codon = tString->sequence[index];
213 
214 	switch (tString->length % 4) {
215 	case 3:
216 		string[index4 + 3] = readNucleotide((codon & 192) >> 6);
217 	case 2:
218 		string[index4 + 2] = readNucleotide((codon & 48) >> 4);
219 	case 1:
220 		string[index4 + 1] = readNucleotide((codon & 12) >> 2);
221 	case 0:
222 		string[index4] = readNucleotide(codon & 3);
223 	}
224 
225 	string[tString->length] = '\0';
226 
227 	return string;
228 }
229 
getNucleotide(Coordinate nucleotideIndex,TightString * tString)230 Nucleotide getNucleotide(Coordinate nucleotideIndex, TightString * tString)
231 {
232 	Codon codon = tString->sequence[nucleotideIndex / 4];
233 
234 	switch (nucleotideIndex % 4) {
235 	case 3:
236 		return (codon & 192) >> 6;
237 	case 2:
238 		return (codon & 48) >> 4;
239 	case 1:
240 		return (codon & 12) >> 2;
241 	case 0:
242 		return (codon & 3);
243 	}
244 
245 	return '?';
246 }
247 
readTightStringFragment(TightString * tString,Coordinate start,Coordinate finish,char * string)248 void readTightStringFragment(TightString * tString, Coordinate start,
249 			     Coordinate finish, char *string)
250 {
251 	Coordinate index;
252 	Coordinate inFinish = finish;
253 
254 	if (start >= tString->length) {
255 		string[0] = '\0';
256 		return;
257 	}
258 
259 	if (inFinish > tString->length)
260 		inFinish = tString->length;
261 
262 	for (index = start; index < inFinish; index++) {
263 		string[index - start] =
264 		    readNucleotide(getNucleotide(index, tString));
265 	}
266 
267 	string[inFinish - start] = '\0';
268 }
269 
getNucleotideChar(Coordinate nucleotideIndex,TightString * tString)270 char getNucleotideChar(Coordinate nucleotideIndex, TightString * tString)
271 {
272 	Codon codon;
273 
274 	codon = tString->sequence[nucleotideIndex / 4];
275 
276 	switch (nucleotideIndex % 4) {
277 	case 3:
278 		return readNucleotide((codon & 192) >> 6);
279 	case 2:
280 		return readNucleotide((codon & 48) >> 4);
281 	case 1:
282 		return readNucleotide((codon & 12) >> 2);
283 	case 0:
284 		return readNucleotide((codon & 3));
285 	}
286 
287 	return '?';
288 }
289 
newTightString(Coordinate length)290 TightString *newTightString(Coordinate length)
291 {
292 	Coordinate arrayLength = length / 4;
293 	Coordinate index;
294 	TightString *newTString = mallocOrExit(1, TightString);
295 	if (length % 4 > 0)
296 		arrayLength++;
297 
298 	newTString->length = length;
299 	newTString->sequence = callocOrExit(arrayLength, Codon);
300 
301 	for (index = 0; index < arrayLength; index++)
302 		newTString->sequence[index] = 0;
303 
304 	return newTString;
305 }
306 
307 void
writeNucleotideAtPosition(Nucleotide nucleotide,Coordinate position,TightString * tString)308 writeNucleotideAtPosition(Nucleotide nucleotide, Coordinate position,
309 			  TightString * tString)
310 {
311 	if (position >= tString->length)
312 		return;
313 
314 	writeNucleotideNumber(nucleotide,
315 			      &tString->sequence[position / 4],
316 			      position % 4);
317 }
318 
getLength(TightString * tString)319 IDnum getLength(TightString * tString)
320 {
321 	return tString->length;
322 }
323 
destroyTightString(TightString * tString)324 void destroyTightString(TightString * tString)
325 {
326 	free(tString->sequence);
327 	free(tString);
328 }
329 
setTightStringLength(TightString * tString,Coordinate length)330 void setTightStringLength(TightString * tString, Coordinate length)
331 {
332 	Coordinate arrayLength = tString->length / 4;
333 	Coordinate newArrayLength = length / 4;
334 	if (tString->length % 4 > 0)
335 		arrayLength++;
336 	if (length % 4 > 0)
337 		newArrayLength++;
338 
339 	if (newArrayLength > arrayLength) {
340 		tString->sequence =
341 		    reallocOrExit(tString->sequence,
342 			    newArrayLength, Codon);
343 		arrayLength = newArrayLength;
344 	}
345 
346 	tString->length = length;
347 }
348 
exportTightString(FILE * outfile,TightString * sequence,IDnum index)349 void exportTightString(FILE * outfile, TightString * sequence, IDnum index)
350 {
351 	Coordinate start, finish;
352 	char str[100];
353 
354 	if (sequence == NULL)
355 		return;
356 
357 	velvetFprintf(outfile, ">SEQUENCE_%ld_length_%lld\n", (long) index,
358 		(long long) getLength(sequence));
359 
360 	start = 0;
361 	while (start <= getLength(sequence)) {
362 		finish = start + 60;
363 		readTightStringFragment(sequence, start, finish, str);
364 		velvetFprintf(outfile, "%s\n", str);
365 		start = finish;
366 	}
367 
368 	fflush(outfile);
369 }
370