1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3 
4     This file is part of Velvet.
5 
6     Velvet is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10 
11     Velvet is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with Velvet; if not, write to the Free Software
18     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <string.h>
24 #include <math.h>
25 #include <time.h>
26 #include <limits.h>
27 #include <ctype.h>
28 
29 #include "globals.h"
30 #include "tightString.h"
31 #include "readSet.h"
32 #include "utility.h"
33 
34 #if !defined(BUNDLEDZLIB)
35 #include <zlib.h>
36 #elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
37 #include "../third-party/zlib-1.2.3/Win32/include/zlib.h"
38 #else
39 #include "../third-party/zlib-1.2.3/zlib.h"
40 #endif
41 
42 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
43 #  include <fcntl.h>
44 #  include <io.h>
45 #  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
46 #else
47 #  define SET_BINARY_MODE(file)
48 #endif
49 
newReadSet()50 ReadSet *newReadSet()
51 {
52 	ReadSet *rs = callocOrExit(1, ReadSet);
53 	return rs;
54 }
55 
56 //////////////////////////////////////////////////////////////////////////
57 // Reference identifiers
58 //////////////////////////////////////////////////////////////////////////
59 
60 typedef struct referenceCoordinate_st ReferenceCoordinate;
61 static Coordinate reference_coordinate_double_strand = true;
62 
63 struct referenceCoordinate_st {
64 	char * name;
65 	Coordinate start;
66 	Coordinate finish;
67 	IDnum referenceID;
68 	boolean positive_strand;
69 }  ATTRIBUTE_PACKED;
70 
compareRefCoords(const void * ptrA,const void * ptrB)71 static int compareRefCoords(const void * ptrA, const void * ptrB) {
72 	ReferenceCoordinate * A = (ReferenceCoordinate *) ptrA;
73 	ReferenceCoordinate * B = (ReferenceCoordinate *) ptrB;
74 	int comp = strcmp(A->name, B->name);
75 
76 	if (comp != 0)
77 		return comp;
78 	else if (!reference_coordinate_double_strand && A->positive_strand != B->positive_strand)
79 		return A->positive_strand > B->positive_strand;
80 	else {
81 		if (A->finish > -1 && A->finish < B->start)
82 			return -1;
83 		else if (B->finish > -1 && A->start > B->finish)
84 			return 1;
85 		else return 0;
86 	}
87 }
88 
89 typedef struct referenceCoordinateTable_st ReferenceCoordinateTable;
90 
91 struct referenceCoordinateTable_st {
92 	ReferenceCoordinate * array;
93 	IDnum arrayLength;
94 }  ATTRIBUTE_PACKED;
95 
newReferenceCoordinateTable()96 static ReferenceCoordinateTable * newReferenceCoordinateTable() {
97 	ReferenceCoordinateTable * table = callocOrExit(1, ReferenceCoordinateTable);
98 	table->array = NULL;
99 	table->arrayLength = 0;
100 	return table;
101 }
102 
destroyReferenceCoordinateTable(ReferenceCoordinateTable * table)103 static void destroyReferenceCoordinateTable(ReferenceCoordinateTable * table) {
104 	IDnum index;
105 
106 	if (table->array) {
107 		for (index = 0; index < table->arrayLength; index++)
108 			free(table->array[index].name);
109 		free(table->array);
110 	}
111 	free(table);
112 }
113 
resizeReferenceCoordinateTable(ReferenceCoordinateTable * table,IDnum extraLength)114 static void resizeReferenceCoordinateTable(ReferenceCoordinateTable * table, IDnum extraLength) {
115 	if (table->array == NULL)
116 		table->array = callocOrExit(extraLength, ReferenceCoordinate);
117 	else
118 		table->array = reallocOrExit(table->array, table->arrayLength + extraLength, ReferenceCoordinate);
119 }
120 
findReferenceCoordinate(ReferenceCoordinateTable * table,char * name,Coordinate start,Coordinate finish,boolean positive_strand)121 static ReferenceCoordinate * findReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
122 	ReferenceCoordinate * array = table->array;
123 	ReferenceCoordinate refCoord;
124 	Coordinate leftIndex = 0;
125 	Coordinate rightIndex = table->arrayLength - 1;
126 	Coordinate middleIndex;
127 
128 	refCoord.name = name;
129 	refCoord.start = start;
130 	refCoord.finish = finish;
131 	refCoord.referenceID = 0;
132 	refCoord.positive_strand = positive_strand;
133 
134 	while (true) {
135 		middleIndex = (rightIndex + leftIndex) / 2;
136 
137 		if (leftIndex > rightIndex)
138 			return NULL;
139 		else if (compareRefCoords(&(array[middleIndex]), &refCoord) == 0)
140 			return &(array[middleIndex]);
141 		else if (leftIndex == middleIndex)
142 			return NULL;
143 		else if (compareRefCoords(&(array[middleIndex]), &refCoord) > 0)
144 			rightIndex = middleIndex;
145 		else
146 			leftIndex = middleIndex;
147 	}
148 }
149 
addReferenceCoordinate(ReferenceCoordinateTable * table,char * name,Coordinate start,Coordinate finish,boolean positive_strand)150 static void addReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
151 	ReferenceCoordinate * refCoord;
152 
153 	if ((refCoord = findReferenceCoordinate(table, name, start, finish, positive_strand))) {
154 		velvetLog("Overlapping reference coordinates:\n");
155 		velvetLog("%s:%lli-%lli\n", name, (long long) start, (long long) finish);
156 		velvetLog("%s:%lli-%lli\n", refCoord->name, (long long) refCoord->start, (long long) refCoord->finish);
157 		velvetLog("Exiting...\n");
158 #ifdef DEBUG
159 		abort();
160 #endif
161 		exit(1);
162 	}
163 
164 	refCoord = &(table->array[table->arrayLength++]);
165 
166 	refCoord->name = name;
167 	refCoord->start = start;
168 	refCoord->finish = finish;
169 	refCoord->referenceID = table->arrayLength;
170 	refCoord->positive_strand = positive_strand;
171 }
172 
sortReferenceCoordinateTable(ReferenceCoordinateTable * table)173 static void sortReferenceCoordinateTable(ReferenceCoordinateTable * table) {
174 	qsort(table->array, table->arrayLength, sizeof(ReferenceCoordinate), compareRefCoords);
175 }
176 
177 //////////////////////////////////////////////////////////////////////////
178 // File reading
179 //////////////////////////////////////////////////////////////////////////
180 
velvetifySequence(char * str)181 static void velvetifySequence(char * str) {
182 	int i = strlen(str) - 1;
183 	char c;
184 
185 	for (i = strlen(str) - 1; i >= 0; i--) {
186 		c = str[i];
187 		switch (c) {
188 		case '\n':
189 		case '\r':
190 		case EOF:
191 			str[i] = '\0';
192 			break;
193 		case 'A':
194 		case 'a':
195 			str[i] = 'A';
196 			break;
197 		case 'C':
198 		case 'c':
199 			str[i] = 'C';
200 			break;
201 		case 'G':
202 		case 'g':
203 			str[i] = 'G';
204 			break;
205 		case 'T':
206 		case 't':
207 			str[i] = 'T';
208 			break;
209 		default:
210 			str[i] = 'N';
211 		}
212 	}
213 }
214 
reverseComplementSequence(char * str)215 static void reverseComplementSequence(char * str)
216 {
217 	size_t length = strlen(str);
218 	size_t i;
219 
220 	for (i = 0; i < length-1 - i; i++) {
221 		char c = str[i];
222 		str[i] = str[length-1 - i];
223 		str[length-1 - i] = c;
224 	}
225 
226 #ifndef COLOR
227 	for (i = 0; i < length; i++) {
228 		switch (str[i]) {
229 		case 'A':
230 		case 'a':
231 			str[i] = 'T';
232 			break;
233 		case 'C':
234 		case 'c':
235 			str[i] = 'G';
236 			break;
237 		case 'G':
238 		case 'g':
239 			str[i] = 'C';
240 			break;
241 		// As in velvetifySequence(), anything unusual ends up as 'A'
242 		default:
243 			str[i] = 'A';
244 			break;
245 		}
246 	}
247 #endif
248 }
249 
writeFastaSequence(FILE * outfile,const char * str)250 static void writeFastaSequence(FILE * outfile, const char * str)
251 {
252 	size_t length = strlen(str);
253 	size_t start;
254 	for (start = 0; start < length; start += 60)
255 		velvetFprintf(outfile, "%.60s\n", &str[start]);
256 }
257 
convertSequences(ReadSet * rs)258 void convertSequences(ReadSet * rs)
259 {
260 	rs->tSequences = newTightStringArrayFromStringArray(rs->sequences,
261 							    rs->readCount,
262 							    &rs->tSeqMem);
263 	rs->sequences = NULL;
264 }
265 
266 // Returns the value of a 32-bit little-endian-stored integer.
int32(const unsigned char * ptr)267 static int int32(const unsigned char * ptr)
268 {
269 	int x = ptr[3];
270 	x = (x << 8) | ptr[2];
271 	x = (x << 8) | ptr[1];
272 	x = (x << 8) | ptr[0];
273 	return x;
274 }
275 
276 // Imports sequences from a fastq file
277 // Memory space allocated within this function.
readSolexaFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)278 static void readSolexaFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
279 {
280 	FILE *file = fopen(filename, "r");
281 	IDnum counter = 0;
282 	const int maxline = 500;
283 	char line[500];
284 	char readName[500];
285 	char readSeq[500];
286 	char str[100];
287 	Coordinate start;
288 
289 	if (strcmp(filename, "-"))
290 		file = fopen(filename, "r");
291 	else
292 		file = stdin;
293 
294 	if (file != NULL)
295 		velvetLog("Reading Solexa file %s\n", filename);
296 	else
297 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
298 
299 	while (fgets(line, maxline, file) != NULL)
300 		if (strchr(line, '.') == NULL) {
301 			sscanf(line, "%s\t%*i\t%*i\t%*i\t%*c%[^\n]",
302 			       readName, readSeq);
303 			velvetFprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
304 			velvetifySequence(readSeq);
305 			start = 0;
306 			while (start <= (Coordinate)strlen(readSeq)) {
307 				strncpy(str, readSeq + start, 60);
308 				str[60] = '\0';
309 				velvetFprintf(outfile, "%s\n", str);
310 				start += 60;
311 			}
312 
313 			counter++;
314 		}
315 
316 	fclose(file);
317 
318 	velvetLog("%li sequences found\n", (long) counter);
319 	velvetLog("Done\n");
320 }
321 
readElandFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)322 static void readElandFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
323 {
324 	FILE *file = fopen(filename, "r");
325 	IDnum counter = 0;
326 	const int maxline = 5000;
327 	char line[5000];
328 	char readName[5000];
329 	char readSeq[5000];
330 	char str[100];
331 	Coordinate start;
332 
333 	if (strcmp(filename, "-"))
334 		file = fopen(filename, "r");
335 	else
336 		file = stdin;
337 
338 	if (file != NULL)
339 		velvetLog("Reading Solexa file %s\n", filename);
340 	else
341 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
342 
343 	// Reopen file and memorize line:
344 	while (fgets(line, maxline, file) != NULL) {
345 		sscanf(line, "%[^\t]\t%[^\t\n]",
346 		       readName, readSeq);
347 		velvetFprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
348 		velvetifySequence(readSeq);
349 		start = 0;
350 		while (start <= (Coordinate)strlen(readSeq)) {
351 			strncpy(str, readSeq + start, 60);
352 			str[60] = '\0';
353 			velvetFprintf(outfile, "%s\n", str);
354 			start += 60;
355 		}
356 
357 		counter++;
358 	}
359 
360 	fclose(file);
361 
362 	velvetLog("%li sequences found\n", (long) counter);
363 	velvetLog("Done\n");
364 }
365 
goToEndOfLine(char * line,FILE * file)366 void goToEndOfLine(char *line, FILE * file)
367 {
368 	size_t length = strlen(line);
369 	char c = line[length - 1];
370 
371 	while (c != '\n')
372 		c = fgetc(file);
373 }
374 
375 // Imports sequences from a fastq file
376 // Memory space allocated within this function.
readFastQFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)377 static void readFastQFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
378 {
379 	FILE *file;
380 	const int maxline = 5000;
381 	char line[5000];
382 	char str[100];
383 	IDnum counter = 0;
384 	Coordinate start, i;
385 	char c;
386 
387 	if (strcmp(filename, "-"))
388 		file = fopen(filename, "r");
389 	else
390 		file = stdin;
391 
392 	if (file != NULL)
393 		velvetLog("Reading FastQ file %s\n", filename);
394 	else
395 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
396 
397 	// Checking if FastQ
398 	c = getc(file);
399 	if (c != EOF && c != '@')
400 		exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
401 	ungetc(c, file);
402 
403 	while(fgets(line, maxline, file)) {
404 
405 		for (i = strlen(line) - 1;
406 		     i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
407 			line[i] = '\0';
408 		}
409 
410 		velvetFprintf(outfile,">%s\t%ld\t%d\n", line + 1, (long) ((*sequenceIndex)++), (int) cat);
411 		counter++;
412 
413 		if(!fgets(line, maxline, file))
414 			exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
415 
416 		velvetifySequence(line);
417 
418 		start = 0;
419 		while (start <= (Coordinate)strlen(line)) {
420 			strncpy(str, line + start, 60);
421 			str[60] = '\0';
422 			velvetFprintf(outfile, "%s\n", str);
423 			start += 60;
424 		}
425 
426 		if(!fgets(line, maxline, file))
427 			exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
428 		if(!fgets(line, maxline, file))
429 			exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
430 	}
431 
432 	fclose(file);
433 	velvetLog("%li reads found.\n", (long) counter);
434 	velvetLog("Done\n");
435 }
436 
437 // Imports sequences from a raw sequence file
438 // Memory space allocated within this function.
readRawFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)439 static void readRawFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
440 {
441 	FILE *file;
442 	const int maxline = 5000;
443 	char line[5000];
444 	char str[100];
445 	IDnum counter = 0;
446 	Coordinate start;
447 
448 	if (strcmp(filename, "-"))
449 		file = fopen(filename, "r");
450 	else
451 		file = stdin;
452 
453 	if (file != NULL)
454 		velvetLog("Reading raw file %s\n", filename);
455 	else
456 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
457 
458 	while(fgets(line, maxline, file)) {
459 		velvetFprintf(outfile,">RAW\t%ld\t%d\n", (long) ((*sequenceIndex)++), (int) cat);
460 		counter++;
461 
462 		if (strlen(line) >= maxline - 1) {
463 			velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
464 #ifdef DEBUG
465 			abort();
466 #endif
467 			exit(1);
468 		}
469 		velvetifySequence(line);
470 		start = 0;
471 		while (start <= (Coordinate)strlen(line)) {
472 			strncpy(str, line + start, 60);
473 			str[60] = '\0';
474 			velvetFprintf(outfile, "%s\n", str);
475 			start += 60;
476 		}
477 	}
478 
479 	fclose(file);
480 	velvetLog("%li reads found.\n", (long) counter);
481 	velvetLog("Done\n");
482 }
483 
484 // Imports sequences from a zipped rfastq file
485 // Memory space allocated within this function.
readFastQGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)486 static void readFastQGZFile(FILE * outfile, char *filename, Category cat, IDnum *sequenceIndex)
487 {
488 	gzFile file;
489 	const int maxline = 5000;
490 	char line[5000];
491 	char str[100];
492 	IDnum counter = 0;
493 	Coordinate start, i;
494 	char c;
495 
496 	if (strcmp(filename, "-"))
497 		file = gzopen(filename, "rb");
498 	else {
499 		file = gzdopen(fileno(stdin), "rb");
500 		SET_BINARY_MODE(stdin);
501 	}
502 
503 	if (file != NULL)
504 		velvetLog("Reading FastQ file %s\n", filename);
505 	else
506 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
507 
508 	// Checking if FastQ
509 	c = gzgetc(file);
510 	if (c != EOF && c != '@')
511 		exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
512 	gzungetc(c, file);
513 
514 	while (gzgets(file, line, maxline)) {
515 		for (i = strlen(line) - 1;
516 		     i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
517 			line[i] = '\0';
518 		}
519 
520 		velvetFprintf(outfile,">%s\t%ld\t%d\n", line + 1, (long) ((*sequenceIndex)++), (int) cat);
521 		counter++;
522 
523 		gzgets(file, line, maxline);
524 
525 		velvetifySequence(line);
526 
527 		start = 0;
528 		while (start <= (Coordinate)strlen(line)) {
529 			strncpy(str, line + start, 60);
530 			str[60] = '\0';
531 			velvetFprintf(outfile, "%s\n", str);
532 			start += 60;
533 		}
534 
535 		gzgets(file, line, maxline);
536 		gzgets(file, line, maxline);
537 	}
538 
539 	gzclose(file);
540 	velvetLog("%li reads found.\n", (long) counter);
541 	velvetLog("Done\n");
542 }
543 
544 // Imports sequences from a zipped raw file
545 // Memory space allocated within this function.
readRawGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)546 static void readRawGZFile(FILE * outfile, char *filename, Category cat, IDnum *sequenceIndex)
547 {
548 	gzFile file;
549 	const int maxline = 5000;
550 	char line[5000];
551 	char str[100];
552 	IDnum counter = 0;
553 	Coordinate start;
554 
555 	if (strcmp(filename, "-"))
556 		file = gzopen(filename, "rb");
557 	else {
558 		file = gzdopen(fileno(stdin), "rb");
559 		SET_BINARY_MODE(stdin);
560 	}
561 
562 	if (file != NULL)
563 		velvetLog("Reading zipped raw sequence file %s\n", filename);
564 	else
565 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
566 
567 	while (gzgets(file, line, maxline)) {
568 		velvetFprintf(outfile,">RAW\t%ld\t%d\n", (long) ((*sequenceIndex)++), (int) cat);
569 		counter++;
570 
571 		if (strlen(line) >= maxline - 1) {
572 			velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
573 #ifdef DEBUG
574 			abort();
575 #endif
576 			exit(1);
577 		}
578 
579 		velvetifySequence(line);
580 
581 		start = 0;
582 		while (start <= (Coordinate)strlen(line)) {
583 			strncpy(str, line + start, 60);
584 			str[60] = '\0';
585 			velvetFprintf(outfile, "%s\n", str);
586 			start += 60;
587 		}
588 	}
589 
590 	gzclose(file);
591 	velvetLog("%li reads found.\n", (long) counter);
592 	velvetLog("Done\n");
593 }
594 
fillReferenceCoordinateTable(char * filename,ReferenceCoordinateTable * refCoords,IDnum counter)595 static void fillReferenceCoordinateTable(char *filename, ReferenceCoordinateTable * refCoords, IDnum counter)
596 {
597 	FILE *file;
598 	const int maxline = 5000;
599 	char line[5000];
600 	char * name;
601 	long long start, finish;
602 	Coordinate i;
603 	IDnum index = 0;
604 
605 	if (strcmp(filename, "-"))
606 		file = fopen(filename, "r");
607 	else
608 		file = stdin;
609 
610 	if (counter == 0)
611 		return;
612 
613 	resizeReferenceCoordinateTable(refCoords,counter);
614 
615 	while (fgets(line, maxline, file) && index < counter) {
616 		if (line[0] == '>') {
617 			name = callocOrExit(strlen(line), char);
618 
619 			if (strchr(line, ':')) {
620 				sscanf(strtok(line, ":-\r\n"), ">%s", name);
621 				sscanf(strtok(NULL, ":-\r\n"), "%lli", &start);
622 				sscanf(strtok(NULL, ":-\r\n"), "%lli", &finish);
623 				if (start <= finish)
624 					addReferenceCoordinate(refCoords, name, start, finish, true);
625 				else
626 					addReferenceCoordinate(refCoords, name, finish, start, false);
627 			} else {
628 				for (i = strlen(line) - 1;
629 				     i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
630 					line[i] = '\0';
631 				}
632 
633 				strcpy(name, line + 1);
634 				addReferenceCoordinate(refCoords, name, 1, -1, true);
635 			}
636 
637 			index++;
638 		}
639 	}
640 
641 	sortReferenceCoordinateTable(refCoords);
642 }
643 
644 
645 // Imports sequences from a fasta file
646 // Memory is allocated within the function
readFastAFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex,ReferenceCoordinateTable * refCoords)647 static void readFastAFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex, ReferenceCoordinateTable * refCoords)
648 {
649 	FILE *file;
650 	const int maxline = 5000;
651 	char line[5000];
652 	char str[100];
653 	IDnum counter = 0;
654 	Coordinate i;
655 	char c;
656 	Coordinate start;
657 	int offset = 0;
658 
659 	if (strcmp(filename, "-"))
660 		file = fopen(filename, "r");
661 	else
662 		file = stdin;
663 
664 	if (file != NULL)
665 		velvetLog("Reading FastA file %s;\n", filename);
666 	else
667 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
668 
669 	// Checking if FastA
670 	c = getc(file);
671 	if (c != EOF && c != '>')
672 		exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
673 	ungetc(c, file);
674 
675 	while (fgets(line, maxline, file)) {
676 		if (line[0] == '>') {
677 			if (offset != 0) {
678 				velvetFprintf(outfile, "\n");
679 				offset = 0;
680 			}
681 
682 			for (i = strlen(line) - 1;
683 			     i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
684 				line[i] = '\0';
685 			}
686 
687 			velvetFprintf(outfile,"%s\t%ld\t%d\n", line, (long) ((*sequenceIndex)++), (int) cat);
688 			counter++;
689 		} else {
690 			velvetifySequence(line);
691 			start = 0;
692 			while (start < (Coordinate)strlen(line)) {
693 				strncpy(str, line + start, 60 - offset);
694 				str[60 - offset] = '\0';
695 				velvetFprintf(outfile, "%s", str);
696 				offset += strlen(str);
697 				if (offset >= 60) {
698 					velvetFprintf(outfile, "\n");
699 					offset = 0;
700 				}
701 				start += strlen(str);
702 			}
703 		}
704 	}
705 
706 	if (offset != 0)
707 		velvetFprintf(outfile, "\n");
708 	fclose(file);
709 
710 	if (cat == REFERENCE)
711 		fillReferenceCoordinateTable(filename, refCoords, counter);
712 
713 	velvetLog("%li sequences found\n", (long) counter);
714 	velvetLog("Done\n");
715 }
716 
717 // Imports sequences from a zipped fasta file
718 // Memory is allocated within the function
readFastAGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)719 static void readFastAGZFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
720 {
721 	gzFile file;
722 	const int maxline = 5000;
723 	char line[5000];
724 	char str[100];
725 	IDnum counter = 0;
726 	Coordinate i, start;
727 	char c;
728 	int offset = 0;
729 
730 	if (strcmp(filename, "-"))
731 		file = gzopen(filename, "rb");
732 	else {
733 		file = gzdopen(fileno(stdin), "rb");
734 		SET_BINARY_MODE(stdin);
735 	}
736 
737 	if (file != NULL)
738 		velvetLog("Reading zipped FastA file %s;\n", filename);
739 	else
740 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
741 
742 	// Checking if FastA
743 	c = gzgetc(file);
744 	if (c != EOF && c != '>')
745 		exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
746 	gzungetc(c, file);
747 
748 	while (gzgets(file, line, maxline)) {
749 		if (line[0] == '>') {
750 			if (offset != 0) {
751 				velvetFprintf(outfile, "\n");
752 				offset = 0;
753 			}
754 
755 			for (i = strlen(line) - 1;
756 			     i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
757 				line[i] = '\0';
758 			}
759 
760 			velvetFprintf(outfile, "%s\t%ld\t%d\n", line, (long) ((*sequenceIndex)++), (int) cat);
761 			counter++;
762 		} else {
763 			velvetifySequence(line);
764 
765 			start = 0;
766 			while (start < (Coordinate)strlen(line)) {
767 				strncpy(str, line + start, 60 - offset);
768 				str[60 - offset] = '\0';
769 				velvetFprintf(outfile, "%s", str);
770 				offset += strlen(str);
771 				if (offset >= 60) {
772 					velvetFprintf(outfile, "\n");
773 					offset = 0;
774 				}
775 				start += strlen(str);
776 			}
777 		}
778 	}
779 
780 	if (offset != 0)
781 		velvetFprintf(outfile, "\n");
782 	gzclose(file);
783 
784 	velvetLog("%li sequences found\n", (long) counter);
785 	velvetLog("Done\n");
786 }
787 
788 // Parser for new output
readMAQGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)789 static void readMAQGZFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
790 {
791 	gzFile file;
792 	const int maxline = 1000;
793 	char line[1000];
794 	IDnum counter = 0;
795 	char readName[500];
796 	char readSeq[500];
797 	char str[100];
798 	Coordinate start;
799 
800 	if (strcmp(filename, "-"))
801 		file = gzopen(filename, "rb");
802 	else {
803 		file = gzdopen(fileno(stdin), "rb");
804 		SET_BINARY_MODE(stdin);
805 	}
806 
807 	if (file != NULL)
808 		velvetLog("Reading zipped MAQ file %s\n", filename);
809 	else
810 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
811 
812 	// Reopen file and memorize line:
813 	while (gzgets(file, line, maxline)) {
814 		sscanf(line, "%s\t%*i\t%*i\t%*c\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%[^\t]",
815 		       readName, readSeq);
816 		velvetFprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
817 		velvetifySequence(readSeq);
818 		start = 0;
819 		while (start <= (Coordinate)strlen(readSeq)) {
820 			strncpy(str, readSeq + start, 60);
821 			str[60] = '\0';
822 			velvetFprintf(outfile, "%s\n", str);
823 			start += 60;
824 		}
825 
826 		counter++;
827 	}
828 
829 	gzclose(file);
830 
831 	velvetLog("%li sequences found\n", (long) counter);
832 	velvetLog("Done\n");
833 }
834 
readSAMFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex,ReferenceCoordinateTable * refCoords)835 static void readSAMFile(FILE *outfile, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
836 {
837 	char line[5000];
838 	unsigned long lineno, readCount;
839 	char previous_qname_pairing[10];
840 	char previous_qname[5000];
841 	char previous_seq[5000];
842 	char previous_rname[5000];
843 	char previous_cigar[5000];
844 	long long previous_pos = -1;
845 	int previous_orientation = 0;
846 	boolean previous_paired = false;
847 	ReferenceCoordinate * refCoord;
848 	char c;
849 	int cigar_index;
850 	long long cigar_num;
851 
852 	if (cat == REFERENCE) {
853 		velvetLog("SAM file %s cannot contain reference sequences.\n", filename);
854 		velvetLog("Please check the command line.\n");
855 #ifdef DEBUG
856 		abort();
857 #endif
858 		exit(1);
859 	}
860 
861 	FILE *file = (strcmp(filename, "-") != 0)? fopen(filename, "r") : stdin;
862 	if (file)
863 		velvetLog("Reading SAM file %s\n", filename);
864 	else
865 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
866 
867 	readCount = 0;
868 	for (lineno = 1; fgets(line, sizeof(line), file); lineno++)
869 		if (line[0] != '@') {
870 			char *qname, *flag, *seq, *rname, *cigar;
871 			long long pos;
872 			int orientation;
873 			int i;
874 
875 			qname = strtok(line, "\t");
876 			flag  = strtok(NULL, "\t");
877 			rname = strtok(NULL, "\t");
878 			sscanf(strtok(NULL, "\t"), "%lli", &pos);
879 			orientation = 1;
880 
881 			// Mapping scor
882 			(void) strtok(NULL, "\t");
883 			cigar = strtok(NULL, "\t");
884 
885 			// Columns 7,8,9 are paired name, position and score
886 			for (i = 7; i < 10; i++)
887 				(void) strtok(NULL, "\t");
888 			seq = strtok(NULL, "\t");
889 
890 			if (seq == NULL) {
891 				velvetFprintf(stderr,
892 					"Line #%lu: ignoring SAM record with too few fields\n",
893 					lineno);
894 			}
895 			else if (strcmp(seq, "*") == 0) {
896 				velvetFprintf(stderr,
897 					"Line #%lu: ignoring SAM record with omitted SEQ field\n",
898 					lineno);
899 			}
900 			else {
901 				// Accept flags represented in either decimal or hex:
902 				int flagbits = strtol(flag, NULL, 0);
903 
904 				if (flagbits & 0x4)
905 				    strcpy(rname, "");
906 
907 				const char *qname_pairing = "";
908 				if (flagbits & 0x40)
909 					qname_pairing = "/1";
910 				else if (flagbits & 0x80)
911 					qname_pairing = "/2";
912 
913 				if (flagbits & 0x10) {
914 					orientation = -1;
915 					reverseComplementSequence(seq);
916 				}
917 
918 				// Determine if paired to previous read
919 				if (readCount > 0) {
920 					if (cat % 2) {
921 						if (previous_paired) {
922 							// Last read paired to penultimate read
923 							velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
924 								(long) ((*sequenceIndex)++), (int) cat);
925 							writeFastaSequence(outfile, previous_seq);
926 							previous_paired = false;
927 						} else if (strcmp(qname, previous_qname) == 0 && strcmp(qname_pairing, previous_qname_pairing) == 0) {
928 							// New multi-mapping issue
929 							previous_paired = false;
930 						}  else if (strcmp(qname, previous_qname) == 0) {
931 							// Last read paired to current reads
932 							velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
933 								(long) ((*sequenceIndex)++), (int) cat);
934 							writeFastaSequence(outfile, previous_seq);
935 							previous_paired = true;
936 						} else {
937 							// Last read unpaired
938 							velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
939 								(long) ((*sequenceIndex)++), (int) cat - 1);
940 							writeFastaSequence(outfile, previous_seq);
941 							previous_paired = false;
942 						}
943 					} else {
944 						// Unpaired dataset
945 						velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
946 							(long) ((*sequenceIndex)++), (int) cat);
947 						writeFastaSequence(outfile, previous_seq);
948 					}
949 
950 					if ((refCoord = findReferenceCoordinate(refCoords, previous_rname, (Coordinate) previous_pos, (Coordinate) previous_pos + strlen(previous_seq) - 1, previous_orientation))) {
951 						if (strlen(previous_cigar) == 1 && previous_cigar[0] == '*')
952 							;
953 						else {
954 							cigar_num = 0;
955 							for (cigar_index = 0; cigar_index < (int)strlen(previous_cigar); cigar_index++) {
956 								c = previous_cigar[cigar_index];
957 								if (c == 'M' || c == '=' || c == 'X') {
958 									if (refCoord->finish < 0 || previous_pos < refCoord->finish) {
959 										if (refCoord->positive_strand) {
960 											velvetFprintf(outfile, "M\t%li\t%lli\n", (long) previous_orientation * refCoord->referenceID, (long long) (previous_pos - refCoord->start));
961 										} else
962 											velvetFprintf(outfile, "M\t%li\t%lli\n", (long) - previous_orientation * refCoord->referenceID, (long long) (refCoord->finish - previous_pos - strlen(previous_seq)));
963 									}
964 									cigar_num = 0;
965 								} else if (c == 'S' || c == 'I') {
966 									previous_pos -= cigar_num;
967 									cigar_num = 0;
968 								} else if (c == 'D' || c == 'N') {
969 									previous_pos += cigar_num;
970 									cigar_num = 0;
971 								} else if (c == 'H' || c == 'P') {
972 									cigar_num = 0;
973 								} else if (isdigit(c)) {
974 									cigar_num = 10 * cigar_num + (c - 48);
975 								} else {
976 									abort();
977 								}
978 							}
979 						}
980 					}
981 				}
982 
983 				strcpy(previous_qname, qname);
984 				strcpy(previous_qname_pairing, qname_pairing);
985 				strcpy(previous_seq, seq);
986 				strcpy(previous_rname, rname);
987 				strcpy(previous_cigar, cigar);
988 				previous_pos = pos;
989 				previous_orientation = orientation;
990 				velvetifySequence(previous_seq);
991 
992 				readCount++;
993 			}
994 		}
995 
996 	if (readCount) {
997 		if (cat % 2) {
998 			if (previous_paired) {
999 				// Last read paired to penultimate read
1000 				velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1001 					(long) ((*sequenceIndex)++), (int) cat);
1002 				writeFastaSequence(outfile, previous_seq);
1003 			} else {
1004 				// Last read unpaired
1005 				velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1006 					(long) ((*sequenceIndex)++), (int) cat - 1);
1007 				writeFastaSequence(outfile, previous_seq);
1008 			}
1009 		} else {
1010 			// Unpaired dataset
1011 			velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1012 				(long) ((*sequenceIndex)++), (int) cat);
1013 			writeFastaSequence(outfile, previous_seq);
1014 		}
1015 
1016 		if ((refCoord = findReferenceCoordinate(refCoords, previous_rname, (Coordinate) previous_pos, (Coordinate) previous_pos + strlen(previous_seq) - 1, previous_orientation))) {
1017 			if (strlen(previous_cigar) == 1 && previous_cigar[0] == '*')
1018 				;
1019 			else {
1020 				cigar_num = 0;
1021 				for (cigar_index = 0; cigar_index < (int)strlen(previous_cigar); cigar_index++) {
1022 					c = previous_cigar[cigar_index];
1023 					if (c == 'M' || c == '=' || c == 'X') {
1024 						if (refCoord->finish < 0 || previous_pos < refCoord->finish) {
1025 							if (refCoord->positive_strand) {
1026 								velvetFprintf(outfile, "M\t%li\t%lli\n", (long) previous_orientation * refCoord->referenceID, (long long) (previous_pos - refCoord->start));
1027 							} else
1028 								velvetFprintf(outfile, "M\t%li\t%lli\n", (long) - previous_orientation * refCoord->referenceID, (long long) (refCoord->finish - previous_pos - strlen(previous_seq)));
1029 						}
1030 						cigar_num = 0;
1031 					} else if (c == 'S' || c == 'I') {
1032 						previous_pos -= cigar_num;
1033 						cigar_num = 0;
1034 					} else if (c == 'D' || c == 'N') {
1035 						previous_pos += cigar_num;
1036 						cigar_num = 0;
1037 					} else if (c == 'H' || c == 'P') {
1038 						cigar_num = 0;
1039 					} else if (isdigit(c)) {
1040 						cigar_num = 10 * cigar_num + (c - 48);
1041 					} else {
1042 						abort();
1043 					}
1044 				}
1045 			}
1046 		}
1047 	}
1048 
1049 	fclose(file);
1050 	velvetLog("%lu reads found.\n", readCount);
1051 	velvetLog("Done\n");
1052 }
1053 
readBAMint32(gzFile file)1054 static int readBAMint32(gzFile file)
1055 {
1056 	unsigned char buffer[4];
1057 	if (gzread(file, buffer, 4) != 4)
1058 		exitErrorf(EXIT_FAILURE, false, "BAM file header truncated");
1059 
1060 	return int32(buffer);
1061 }
1062 
readBAMFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex,ReferenceCoordinateTable * refCoords)1063 static void readBAMFile(FILE *outfile, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
1064 {
1065 	size_t seqCapacity = 0;
1066 	char *seq = NULL;
1067 	char cigar[5000];
1068 	char cigar_buffer[5000];
1069 	size_t bufferCapacity = 4;
1070 	unsigned char *buffer = mallocOrExit(bufferCapacity, unsigned char);
1071 	unsigned long recno, readCount;
1072 	int i, refCount;
1073 	gzFile file;
1074 	char previous_qname_pairing[10];
1075 	char previous_qname[5000];
1076 	char previous_seq[5000];
1077 	char previous_cigar[5000];
1078 	int previous_rID = 0;
1079 	long long previous_pos = -1;
1080 	int previous_orientation = 0;
1081 	boolean previous_paired = false;
1082 	int previous_flagbits = 0;
1083 	char ** refNames;
1084 	ReferenceCoordinate * refCoord;
1085 	char c;
1086 	int cigar_index;
1087 	long long cigar_num;
1088 
1089 	if (cat == REFERENCE) {
1090 		velvetLog("BAM file %s cannot contain reference sequences.\n", filename);
1091 		velvetLog("Please check the command line.\n");
1092 #ifdef DEBUG
1093 		abort();
1094 #endif
1095 		exit(1);
1096 	}
1097 
1098 	if (strcmp(filename, "-") != 0)
1099 		file = gzopen(filename, "rb");
1100 	else {
1101 		file = gzdopen(fileno(stdin), "rb");
1102 		SET_BINARY_MODE(stdin);
1103 	}
1104 
1105 	if (file != NULL)
1106 		velvetLog("Reading BAM file %s\n", filename);
1107 	else
1108 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1109 
1110 	if (! (gzread(file, buffer, 4) == 4 && memcmp(buffer, "BAM\1", 4) == 0))
1111 		exitErrorf(EXIT_FAILURE, false, "%s is not in BAM format", filename);
1112 
1113 	// Skip header text
1114 	if (gzseek(file, readBAMint32(file), SEEK_CUR) == -1)
1115 		exitErrorf(EXIT_FAILURE, false, "gzseek failed");
1116 
1117 	// Skip header reference list
1118 	refCount = readBAMint32(file);
1119 	refNames = callocOrExit(refCount, char *);
1120 	for (i = 0; i < refCount; i++) {
1121 		int strLength;
1122 
1123 		if (gzread(file, buffer, 4) != 4)
1124 			exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1125 
1126 		strLength = int32(buffer);
1127 		refNames[i] = callocOrExit(strLength, char);
1128 
1129 		if (bufferCapacity < (size_t)(4 + strLength)) {
1130 			bufferCapacity = 4 + strLength + 4096;
1131 			buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1132 		}
1133 
1134 		if (gzread(file, buffer, 4 + strLength) != 4 + strLength)
1135 			exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1136 
1137 		strcpy(refNames[i], (char *) buffer);
1138 	}
1139 
1140 	readCount = 0;
1141 	for (recno = 1; gzread(file, buffer, 4) == 4; recno++) {
1142 		int blockSize = int32(buffer);
1143 		int readLength;
1144 
1145 		if (bufferCapacity < (size_t)(4 + blockSize)) {
1146 			bufferCapacity = 4 + blockSize + 4096;
1147 			buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1148 		}
1149 
1150 		if (gzread(file, &buffer[4], blockSize) != blockSize)
1151 			exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1152 
1153 		readLength = int32(&buffer[20]);
1154 		if (readLength == 0) {
1155 			velvetFprintf(stderr,
1156 				"Record #%lu: ignoring BAM record with omitted SEQ field\n",
1157 				recno);
1158 		}
1159 		else {
1160 			int readNameLength = buffer[12];
1161 			int flag_nc = int32(&buffer[16]);
1162 			int flagbits = flag_nc >> 16;
1163 			int cigarLength = flag_nc & 0xffff;
1164 			char *qname = (char *)&buffer[36];
1165 			uint32_t *rawcigar = (uint32_t *) &buffer[36 + readNameLength];
1166 			unsigned char *rawseq =
1167 					&buffer[36 + readNameLength + 4 * cigarLength];
1168 			int rID = int32(&buffer[4]);
1169 			// NOTE: BAM file coords are 0-based, not 1-based like SAM files
1170 			// No comment
1171 			long long pos = int32(&buffer[8]) + 1;
1172 			int orientation = 1;
1173 
1174 			const char *qname_pairing = "";
1175 			if (flagbits & 0x40)
1176 				qname_pairing = "/1";
1177 			else if (flagbits & 0x80)
1178 				qname_pairing = "/2";
1179 
1180 			strcpy(cigar, "");
1181 			for (i = 0; i < cigarLength; i++) {
1182 				static const char decode_ops[] = "MIDNSHP=X";
1183 				uint32_t packed = *(rawcigar++);
1184 				sprintf(cigar_buffer, "%i%c", packed >> 4, decode_ops[packed & 0xf]);
1185 				strcat(cigar, cigar_buffer);
1186 			}
1187 
1188 			if (seqCapacity < (size_t)(readLength + 1)) {
1189 				seqCapacity = readLength * 2 + 1;
1190 				seq = reallocOrExit(seq, seqCapacity, char);
1191 			}
1192 
1193 			for (i = 0; i < readLength; i += 2) {
1194 				static const char decode_bases[] = "=ACMGRSVTWYHKDBN";
1195 				unsigned int packed = *(rawseq++);
1196 				seq[i] = decode_bases[packed >> 4];
1197 				seq[i+1] = decode_bases[packed & 0xf];
1198 			}
1199 			seq[readLength] = '\0';
1200 
1201 			if (flagbits & 0x10) {
1202 				orientation = -1;
1203 				reverseComplementSequence(seq);
1204 			}
1205 
1206 			// Determine if paired to previous read
1207 			if (readCount > 0) {
1208 				if (cat % 2) {
1209 					 if (previous_paired) {
1210 						// Last read paired to penultimate read
1211 						velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1212 							(long) ((*sequenceIndex)++), (int) cat);
1213 						writeFastaSequence(outfile, previous_seq);
1214 						previous_paired = false;
1215 					} else if (strcmp(qname, previous_qname) == 0 && strcmp(qname_pairing, previous_qname_pairing) == 0) {
1216 						// New multi-mapping issue
1217 						previous_paired = false;
1218 					} else if (strcmp(qname, previous_qname) == 0) {
1219 						// Last read paired to current reads
1220 						velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1221 							(long) ((*sequenceIndex)++), (int) cat);
1222 						writeFastaSequence(outfile, previous_seq);
1223 						previous_paired = true;
1224 					} else {
1225 						// Last read unpaired
1226 						velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1227 							(long) ((*sequenceIndex)++), (int) cat - 1);
1228 						writeFastaSequence(outfile, previous_seq);
1229 						previous_paired = false;
1230 					}
1231 				} else {
1232 					// Unpaired dataset
1233 					velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1234 						(long) ((*sequenceIndex)++), (int) cat);
1235 					writeFastaSequence(outfile, previous_seq);
1236 				}
1237 
1238 				if (!(previous_flagbits & 0x4) && (refCoord = findReferenceCoordinate(refCoords, refNames[previous_rID], (Coordinate) previous_pos, (Coordinate) previous_pos + strlen(previous_seq) - 1, previous_orientation))) {
1239 					if (strlen(previous_cigar) == 1 && previous_cigar[0] == '*')
1240 						;
1241 					else {
1242 						cigar_num = 0;
1243 						for (cigar_index = 0; cigar_index < (int)strlen(previous_cigar); cigar_index++) {
1244 							c = previous_cigar[cigar_index];
1245 							if (c == 'M' || c == '=' || c == 'X') {
1246 								if (refCoord->finish < 0 || previous_pos < refCoord->finish) {
1247 									if (refCoord->positive_strand) {
1248 										velvetFprintf(outfile, "M\t%li\t%lli\n", (long) previous_orientation * refCoord->referenceID, (long long) (previous_pos - refCoord->start));
1249 									} else
1250 										velvetFprintf(outfile, "M\t%li\t%lli\n", (long) - previous_orientation * refCoord->referenceID, (long long) (refCoord->finish - previous_pos - strlen(previous_seq)));
1251 								}
1252 								cigar_num = 0;
1253 							} else if (c == 'S' || c == 'I') {
1254 								previous_pos -= cigar_num;
1255 								cigar_num = 0;
1256 							} else if (c == 'D' || c == 'N') {
1257 								previous_pos += cigar_num;
1258 								cigar_num = 0;
1259 							} else if (c == 'H' || c == 'P') {
1260 								cigar_num = 0;
1261 							} else if (isdigit(c)) {
1262 								cigar_num = 10 * cigar_num + (c - 48);
1263 							} else {
1264 								abort();
1265 							}
1266 						}
1267 					}
1268 				}
1269 			}
1270 
1271 			strcpy(previous_qname, qname);
1272 			strcpy(previous_qname_pairing, qname_pairing);
1273 			strcpy(previous_seq, seq);
1274 			strcpy(previous_cigar, cigar);
1275 			previous_rID = rID;
1276 			previous_pos = pos;
1277 			previous_orientation = orientation;
1278 			previous_flagbits = flagbits;
1279 			velvetifySequence(previous_seq);
1280 
1281 			readCount++;
1282 		}
1283 	}
1284 
1285 	if (readCount) {
1286 		if (cat % 2) {
1287 			if (previous_paired) {
1288 				// Last read paired to penultimate read
1289 				velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1290 					(long) ((*sequenceIndex)++), (int) cat);
1291 				writeFastaSequence(outfile, previous_seq);
1292 			} else {
1293 				// Last read unpaired
1294 				velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1295 					(long) ((*sequenceIndex)++), (int) cat - 1);
1296 				writeFastaSequence(outfile, previous_seq);
1297 			}
1298 		} else {
1299 			// Unpaired dataset
1300 			velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1301 				(long) ((*sequenceIndex)++), (int) cat);
1302 			writeFastaSequence(outfile, previous_seq);
1303 		}
1304 
1305 		if (!(previous_flagbits & 0x4) && (refCoord = findReferenceCoordinate(refCoords, refNames[previous_rID], (Coordinate) previous_pos, (Coordinate) previous_pos + strlen(previous_seq) - 1, previous_orientation))) {
1306 			if (strlen(previous_cigar) == 1 && previous_cigar[0] == '*')
1307 				;
1308 			else {
1309 				cigar_num = 0;
1310 				for (cigar_index = 0; cigar_index < (int)strlen(previous_cigar); cigar_index++) {
1311 					c = previous_cigar[cigar_index];
1312 					if (c == 'M' || c == '=' || c == 'X') {
1313 						if (refCoord->finish < 0 || previous_pos < refCoord->finish) {
1314 							if (refCoord->positive_strand) {
1315 								velvetFprintf(outfile, "M\t%li\t%lli\n", (long) previous_orientation * refCoord->referenceID, (long long) (previous_pos - refCoord->start));
1316 							} else
1317 								velvetFprintf(outfile, "M\t%li\t%lli\n", (long) - previous_orientation * refCoord->referenceID, (long long) (refCoord->finish - previous_pos - strlen(previous_seq)));
1318 						}
1319 						cigar_num = 0;
1320 					} else if (c == 'S' || c == 'I') {
1321 						previous_pos -= cigar_num;
1322 						cigar_num = 0;
1323 					} else if (c == 'D' || c == 'N') {
1324 						previous_pos += cigar_num;
1325 						cigar_num = 0;
1326 					} else if (c == 'H' || c == 'P') {
1327 						cigar_num = 0;
1328 					} else if (isdigit(c)) {
1329 						cigar_num = 10 * cigar_num + (c - 48);
1330 					} else {
1331 						abort();
1332 					}
1333 				}
1334 			}
1335 		}
1336 	}
1337 
1338 	free(seq);
1339 	free(buffer);
1340 
1341 	gzclose(file);
1342 	velvetLog("%lu reads found.\n", readCount);
1343 	velvetLog("Done\n");
1344 }
1345 
printUsage()1346 static void printUsage()
1347 {
1348 	puts("Usage:");
1349 	puts("./velveth directory hash_length {[-file_format][-read_type] filename} [options]");
1350 	puts("");
1351 	puts("\tdirectory\t\t: directory name for output files");
1352 	printf("\thash_length\t\t: odd integer (if even, it will be decremented) <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
1353 	puts("\tfilename\t\t: path to sequence file or - for standard input");
1354 	puts("");
1355 	puts("File format options:");
1356 	puts("\t-fasta");
1357 	puts("\t-fastq");
1358 	puts("\t-raw");
1359 	puts("\t-fasta.gz");
1360 	puts("\t-fastq.gz");
1361 	puts("\t-raw.gz");
1362 	puts("\t-sam");
1363 	puts("\t-bam");
1364 	puts("\t-eland");
1365 	puts("\t-gerald");
1366 	puts("");
1367 	puts("Read type options:");
1368 	puts("\t-short");
1369 	puts("\t-shortPaired");
1370 	puts("\t-short2");
1371 	puts("\t-shortPaired2");
1372 	puts("\t-long");
1373 	puts("\t-longPaired");
1374 	puts("\t-reference");
1375 	puts("");
1376 	puts("Options:");
1377 	puts("\t-strand_specific\t: for strand specific transcriptome sequencing data (default: off)");
1378 	puts("");
1379 	puts("Output:");
1380 	puts("\tdirectory/Roadmaps");
1381 	puts("\tdirectory/Sequences");
1382 	puts("\t\t[Both files are picked up by graph, so please leave them there]");
1383 }
1384 
1385 #define FASTQ 1
1386 #define FASTA 2
1387 #define GERALD 3
1388 #define ELAND 4
1389 #define FASTA_GZ 5
1390 #define FASTQ_GZ 6
1391 #define MAQ_GZ 7
1392 #define SAM 8
1393 #define BAM 9
1394 #define RAW 10
1395 #define RAW_GZ 11
1396 
1397 // General argument parser for most functions
1398 // Basically a reused portion of toplevel code dumped into here
parseDataAndReadFiles(char * filename,int argc,char ** argv,boolean * double_strand,boolean * noHash)1399 void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * double_strand, boolean * noHash)
1400 {
1401 	int argIndex = 1;
1402 	FILE *outfile;
1403 	int filetype = FASTA;
1404 	Category cat = 0;
1405 	IDnum sequenceIndex = 1;
1406 	short short_var;
1407 	ReferenceCoordinateTable * refCoords = newReferenceCoordinateTable();
1408 	boolean reuseSequences = false;
1409 
1410 	if (argc < 2) {
1411 		printUsage();
1412 #ifdef DEBUG
1413 		abort();
1414 #endif
1415 		exit(1);
1416 	}
1417 
1418 	for (argIndex = 1; argIndex < argc; argIndex++) {
1419 		if (strcmp(argv[argIndex], "-strand_specific") == 0) {
1420 			*double_strand = false;
1421 			reference_coordinate_double_strand = false;
1422 		} else if (strcmp(argv[argIndex], "-reuse_Sequences") == 0) {
1423 			reuseSequences = true;
1424 		} else if (strcmp(argv[argIndex], "-noHash") == 0) {
1425 			*noHash = true;
1426 		}
1427 	}
1428 
1429 	if (reuseSequences)
1430 		return;
1431 
1432 	outfile = fopen(filename, "w");
1433 
1434 	for (argIndex = 1; argIndex < argc; argIndex++) {
1435 		if (argv[argIndex][0] == '-' && strlen(argv[argIndex]) > 1) {
1436 
1437 			if (strcmp(argv[argIndex], "-fastq") == 0)
1438 				filetype = FASTQ;
1439 			else if (strcmp(argv[argIndex], "-fasta") == 0)
1440 				filetype = FASTA;
1441 			else if (strcmp(argv[argIndex], "-gerald") == 0)
1442 				filetype = GERALD;
1443 			else if (strcmp(argv[argIndex], "-eland") == 0)
1444 				filetype = ELAND;
1445 			else if (strcmp(argv[argIndex], "-fastq.gz") == 0)
1446 				filetype = FASTQ_GZ;
1447 			else if (strcmp(argv[argIndex], "-fasta.gz") == 0)
1448 				filetype = FASTA_GZ;
1449 			else if (strcmp(argv[argIndex], "-sam") == 0)
1450 				filetype = SAM;
1451 			else if (strcmp(argv[argIndex], "-bam") == 0)
1452 				filetype = BAM;
1453 			else if (strcmp(argv[argIndex], "-raw") == 0)
1454 				filetype = RAW;
1455 			else if (strcmp(argv[argIndex], "-raw.gz") == 0)
1456 				filetype = RAW_GZ;
1457 			else if (strcmp(argv[argIndex], "-maq.gz") == 0)
1458 				filetype = MAQ_GZ;
1459 			else if (strcmp(argv[argIndex], "-short") == 0)
1460 				cat = 0;
1461 			else if (strcmp(argv[argIndex], "-shortPaired") ==
1462 				 0)
1463 				cat = 1;
1464 			else if (strncmp
1465 				 (argv[argIndex], "-shortPaired",
1466 				  12) == 0) {
1467 				sscanf(argv[argIndex], "-shortPaired%hd", &short_var);
1468 				cat = (Category) short_var;
1469 				if (cat < 1 || cat > CATEGORIES) {
1470 					velvetLog("Unknown option: %s\n",
1471 					       argv[argIndex]);
1472 #ifdef DEBUG
1473 					abort();
1474 #endif
1475 					exit(1);
1476 				}
1477 				cat--;
1478 				cat *= 2;
1479 				cat++;
1480 			} else if (strncmp(argv[argIndex], "-short", 6) ==
1481 				   0) {
1482 				sscanf(argv[argIndex], "-short%hd", &short_var);
1483 				cat = (Category) short_var;
1484 				if (cat < 1 || cat > CATEGORIES) {
1485 					velvetLog("Unknown option: %s\n",
1486 					       argv[argIndex]);
1487 #ifdef DEBUG
1488 					abort();
1489 #endif
1490 					exit(1);
1491 				}
1492 				cat--;
1493 				cat *= 2;
1494 			} else if (strcmp(argv[argIndex], "-long") == 0)
1495 				cat = CATEGORIES * 2;
1496 			else if (strcmp(argv[argIndex], "-longPaired") ==
1497 				 0)
1498 				cat = CATEGORIES * 2 + 1;
1499 			else if (strcmp(argv[argIndex], "-reference") ==
1500 				 0)
1501 				cat = CATEGORIES * 2 + 2;
1502 			else if (strcmp(argv[argIndex], "-strand_specific") == 0) {
1503 				*double_strand = false;
1504 				reference_coordinate_double_strand = false;
1505 			} else if (strcmp(argv[argIndex], "-noHash") == 0) {
1506 				;
1507 			}
1508 			else {
1509 				velvetLog("Unknown option: %s\n",
1510 				       argv[argIndex]);
1511 #ifdef DEBUG
1512 				abort();
1513 #endif
1514 				exit(1);
1515 			}
1516 
1517 			continue;
1518 		}
1519 
1520 		if (cat == -1)
1521 			continue;
1522 
1523 		switch (filetype) {
1524 		case FASTA:
1525 			readFastAFile(outfile, argv[argIndex], cat, &sequenceIndex, refCoords);
1526 			break;
1527 		case FASTQ:
1528 			readFastQFile(outfile, argv[argIndex], cat, &sequenceIndex);
1529 			break;
1530 		case RAW:
1531 			readRawFile(outfile, argv[argIndex], cat, &sequenceIndex);
1532 			break;
1533 		case GERALD:
1534 			readSolexaFile(outfile, argv[argIndex], cat, &sequenceIndex);
1535 			break;
1536 		case ELAND:
1537 			readElandFile(outfile, argv[argIndex], cat, &sequenceIndex);
1538 			break;
1539 		case FASTA_GZ:
1540 			readFastAGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1541 			break;
1542 		case FASTQ_GZ:
1543 			readFastQGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1544 			break;
1545 		case RAW_GZ:
1546 			readRawGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1547 			break;
1548 		case SAM:
1549 			readSAMFile(outfile, argv[argIndex], cat, &sequenceIndex, refCoords);
1550 			break;
1551 		case BAM:
1552 			readBAMFile(outfile, argv[argIndex], cat, &sequenceIndex, refCoords);
1553 			break;
1554 		case MAQ_GZ:
1555 			readMAQGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1556 			break;
1557 		default:
1558 			velvetLog("Screw up in parser... exiting\n");
1559 #ifdef DEBUG
1560 			abort();
1561 #endif
1562 			exit(1);
1563 		}
1564 	}
1565 
1566 	destroyReferenceCoordinateTable(refCoords);
1567 	fclose(outfile);
1568 }
1569 
createReadPairingArray(ReadSet * reads)1570 void createReadPairingArray(ReadSet* reads)
1571 {
1572 	IDnum index;
1573 	IDnum *mateReads = mallocOrExit(reads->readCount, IDnum);
1574 	Category cat = 0;
1575 	int phase = 0;
1576 
1577 	for (index = 0; index < reads->readCount; index++)
1578 		mateReads[index] = -1;
1579 
1580 	reads->mateReads = mateReads;
1581 
1582 	for (index = 0; index < reads->readCount; index++)
1583 	{
1584 		// Paired category
1585 		if (cat & 1)
1586 		{
1587 			// Leaving the paired category
1588 			if (reads->categories[index] != cat)
1589 			{
1590 				if (phase == 1)
1591 				{
1592 					reads->mateReads[index - 1] = -1;
1593 					reads->categories[index - 1]--;
1594 					phase = 0;
1595 				}
1596 				cat = reads->categories[index];
1597 				// Into another paired category
1598 				if (cat & 1)
1599 				{
1600 					reads->mateReads[index] = index + 1;
1601 					phase = 1;
1602 				}
1603 			}
1604 			else if (phase == 0)
1605 			{
1606 				reads->mateReads[index] = index + 1;
1607 				phase = 1;
1608 			}
1609 			else
1610 			{
1611 				reads->mateReads[index] = index - 1;
1612 				phase = 0;
1613 			}
1614 		}
1615 		// Leaving an unpaired category
1616 		else if (reads->categories[index] != cat)
1617 		{
1618 			cat = reads->categories[index];
1619 			// Into a paired category
1620 			if (cat & 1)
1621 			{
1622 				reads->mateReads[index] = index + 1;
1623 				phase = 1;
1624 			}
1625 		}
1626 	}
1627 }
1628 
pairedCategories(ReadSet * reads)1629 int pairedCategories(ReadSet * reads)
1630 {
1631 	boolean pairedCat[CATEGORIES + 1];
1632 	int pairedCatCount = 0;
1633 	IDnum index;
1634 
1635 	for (index = 0; index <= CATEGORIES; index++)
1636 		pairedCat[index] = 0;
1637 
1638 	for (index = 0; index < reads->readCount; index++) {
1639 		if (reads->categories[index] & 1 && !pairedCat[reads->categories[index] / 2]) {
1640 			pairedCat[reads->categories[index] / 2] = true;
1641 			if (pairedCatCount++ == CATEGORIES)
1642 				break;
1643 		}
1644 	}
1645 
1646 	return pairedCatCount;
1647 }
1648 
isSecondInPair(ReadSet * reads,IDnum index)1649 boolean isSecondInPair(ReadSet * reads, IDnum index)
1650 {
1651 	return reads->secondInPair[index / 8] & (1 << (index & 7));
1652 }
1653 
1654 
computeSecondInPair(ReadSet * reads)1655 static void computeSecondInPair(ReadSet * reads)
1656 {
1657 	IDnum index;
1658 	Category currentCat = 0;
1659 	Category previousCat = 0;
1660 	int phase = 0;
1661 
1662 	if (reads->secondInPair)
1663 		free (reads->secondInPair);
1664 	reads->secondInPair = callocOrExit((reads->readCount + 7) / 8, unsigned char);
1665 
1666 	for (index = 0; index < reads->readCount; index++)
1667 	{
1668 		currentCat = reads->categories[index];
1669 		if (currentCat & 1)
1670 		{
1671 			if (previousCat == currentCat)
1672 			{
1673 				if (phase == 0)
1674 				{
1675 					phase = 1;
1676 				}
1677 				else
1678 				{
1679 					reads->secondInPair[index / 8] |= (1 << (index & 7));
1680 					phase = 0;
1681 				}
1682 			}
1683 			else
1684 				phase = 1;
1685 		}
1686 		previousCat = currentCat;
1687 	}
1688 }
1689 
detachDubiousReads(ReadSet * reads,boolean * dubiousReads)1690 void detachDubiousReads(ReadSet * reads, boolean * dubiousReads)
1691 {
1692 	IDnum index;
1693 	IDnum pairID;
1694 	IDnum sequenceCount = reads->readCount;
1695 	IDnum *mateReads = reads->mateReads;
1696 
1697 	if (dubiousReads == NULL || mateReads == NULL)
1698 		return;
1699 
1700 	for (index = 0; index < sequenceCount; index++) {
1701 		if (!dubiousReads[index] || reads->categories[index] % 2 == 0 )
1702 			continue;
1703 
1704 		if (isSecondInPair(reads, index))
1705 		    pairID = index - 1;
1706 		else
1707 		    pairID = index + 1;
1708 
1709 		reads->categories[index] = (reads->categories[index] / 2) * 2;
1710 		reads->categories[pairID] = (reads->categories[pairID] / 2) * 2;
1711 	}
1712 }
1713 
importReadSet(char * filename)1714 ReadSet *importReadSet(char *filename)
1715 {
1716 	FILE *file = fopen(filename, "r");
1717 	char *sequence = NULL;
1718 	Coordinate bpCount = 0;
1719 	const int maxline = 5000;
1720 	char line[5000];
1721 	IDnum sequenceCount, sequenceIndex;
1722 	ReadSet *reads;
1723 	short int temp_short;
1724 	int lineLength;
1725 
1726 	if (file != NULL)
1727 		velvetLog("Reading read set file %s;\n", filename);
1728 	else
1729 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1730 
1731 	reads = newReadSet();
1732 
1733 	// Count number of separate sequences
1734 	sequenceCount = 0;
1735 	while (fgets(line, maxline, file) != NULL)
1736 		if (line[0] == '>')
1737 			sequenceCount++;
1738 	fclose(file);
1739 	velvetLog("%li sequences found\n", (long) sequenceCount);
1740 
1741 	reads->readCount = sequenceCount;
1742 
1743 	if (reads->readCount == 0) {
1744 		reads->sequences = NULL;
1745 		reads->categories = NULL;
1746 		return reads;
1747 	}
1748 
1749 	reads->sequences = callocOrExit(sequenceCount, char *);
1750 	reads->categories = callocOrExit(sequenceCount, Category);
1751 	// Counting base pair length of each sequence:
1752 	file = fopen(filename, "r");
1753 	sequenceIndex = -1;
1754 	while (fgets(line, maxline, file) != NULL) {
1755 		if (line[0] == '>') {
1756 
1757 			// Reading category info
1758 			sscanf(line, "%*[^\t]\t%*[^\t]\t%hd",
1759 			       &temp_short);
1760 			reads->categories[sequenceIndex + 1] = (Category) temp_short;
1761 
1762 			if (sequenceIndex != -1)
1763 				reads->sequences[sequenceIndex] =
1764 				    mallocOrExit(bpCount + 1, char);
1765 			sequenceIndex++;
1766 			bpCount = 0;
1767 		} if (line[0] == 'M') {;
1768 			// Map line
1769 		} else {
1770 			bpCount += (Coordinate) strlen(line) - 1;
1771 
1772 			if (sizeof(ShortLength) == sizeof(int16_t) && bpCount > SHRT_MAX) {
1773 				velvetLog("Read %li of length %lli, longer than limit %i\n",
1774 				       (long) sequenceIndex + 1, (long long) bpCount, SHRT_MAX);
1775 				velvetLog("You should modify recompile with the LONGSEQUENCES option (cf. manual)\n");
1776 				exit(1);
1777 			}
1778 		}
1779 	}
1780 
1781 	//velvetLog("Sequence %d has length %d\n", sequenceIndex, bpCount);
1782 	reads->sequences[sequenceIndex] =
1783 	    mallocOrExit(bpCount + 1, char);
1784 	fclose(file);
1785 
1786 	// Reopen file and memorize line:
1787 	file = fopen(filename, "r");
1788 	sequenceIndex = -1;
1789 	while (fgets(line, maxline, file)) {
1790 		if (line[0] == '>') {
1791 			if (sequenceIndex != -1) {
1792 				sequence[bpCount] = '\0';
1793 			}
1794 			sequenceIndex++;
1795 			bpCount = 0;
1796 			//velvetLog("Starting to read sequence %d\n",
1797 			//       sequenceIndex);
1798 			sequence = reads->sequences[sequenceIndex];
1799 		} else if (line[0] == 'M') {;
1800 			// Map line
1801 		} else {
1802 			lineLength = strlen(line) - 1;
1803 			strncpy(sequence + bpCount, line, lineLength);
1804 			bpCount += (Coordinate) lineLength;
1805 		}
1806 	}
1807 
1808 	sequence[bpCount] = '\0';
1809 	fclose(file);
1810 	computeSecondInPair(reads);
1811 
1812 	velvetLog("Done\n");
1813 	return reads;
1814 
1815 }
1816 
logInstructions(int argc,char ** argv,char * directory)1817 void logInstructions(int argc, char **argv, char *directory)
1818 {
1819 	int index;
1820 	char *logFilename =
1821 	    mallocOrExit(strlen(directory) + 100, char);
1822 	FILE *logFile;
1823 	time_t date;
1824 	char *string;
1825 
1826 	time(&date);
1827 	string = ctime(&date);
1828 
1829 	strcpy(logFilename, directory);
1830 	strcat(logFilename, "/Log");
1831 	logFile = fopen(logFilename, "a");
1832 
1833 	if (logFile == NULL)
1834 		exitErrorf(EXIT_FAILURE, true, "Could not write to %s", logFilename);
1835 
1836 	velvetFprintf(logFile, "%s", string);
1837 
1838 	for (index = 0; index < argc; index++)
1839 		velvetFprintf(logFile, " %s", argv[index]);
1840 
1841 	velvetFprintf(logFile, "\n");
1842 
1843 	velvetFprintf(logFile, "Version %i.%i.%2.2i\n", VERSION_NUMBER,
1844 	       RELEASE_NUMBER, UPDATE_NUMBER);
1845 	velvetFprintf(logFile, "Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)\n");
1846 	velvetFprintf(logFile, "This is free software; see the source for copying conditions.  There is NO\n");
1847 	velvetFprintf(logFile, "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
1848 	velvetFprintf(logFile, "Compilation settings:\n");
1849 	velvetFprintf(logFile, "CATEGORIES = %i\n", CATEGORIES);
1850 	velvetFprintf(logFile, "MAXKMERLENGTH = %i\n", MAXKMERLENGTH);
1851 #ifdef _OPENMP
1852 	velvetFprintf(logFile, "OPENMP\n");
1853 #endif
1854 #ifdef LONGSEQUENCES
1855 	velvetFprintf(logFile, "LONGSEQUENCES\n");
1856 #endif
1857 #ifdef BIGASSEMBLY
1858 	velvetFprintf(logFile, "BIGASSEMBLY\n");
1859 #endif
1860 #ifdef COLOR
1861 	velvetFprintf(logFile, "COLOR\n");
1862 #endif
1863 #ifdef DEBUG
1864 	velvetFprintf(logFile, "DEBUG\n");
1865 #endif
1866 	velvetFprintf(logFile, "\n");
1867 
1868 	fclose(logFile);
1869 	free(logFilename);
1870 }
1871 
destroyReadSet(ReadSet * reads)1872 void destroyReadSet(ReadSet * reads)
1873 {
1874 	IDnum index;
1875 
1876 	if (reads == NULL)
1877 		return;
1878 
1879 	if (reads->sequences != NULL)
1880 	{
1881 		for (index = 0; index < reads->readCount; index++)
1882 			free(reads->sequences[index]);
1883 		free(reads->sequences);
1884 	}
1885 
1886 	if (reads->tSequences != NULL)
1887 		free (reads->tSequences);
1888 
1889 	if (reads->tSeqMem != NULL)
1890 		free (reads->tSeqMem);
1891 
1892 	if (reads->labels != NULL)
1893 		for (index = 0; index < reads->readCount; index++)
1894 			free(reads->labels[index]);
1895 
1896 	if (reads->confidenceScores != NULL)
1897 		for (index = 0; index < reads->readCount; index++)
1898 			free(reads->confidenceScores[index]);
1899 
1900 	if (reads->kmerProbabilities != NULL)
1901 		for (index = 0; index < reads->readCount; index++)
1902 			free(reads->kmerProbabilities[index]);
1903 
1904 	free(reads->labels);
1905 	free(reads->confidenceScores);
1906 	free(reads->kmerProbabilities);
1907 	free(reads->mateReads);
1908 	free(reads->categories);
1909 	free(reads->secondInPair);
1910 	free(reads);
1911 }
1912 
getSequenceLengths(ReadSet * reads,int wordLength)1913 ShortLength *getSequenceLengths(ReadSet * reads, int wordLength)
1914 {
1915 	ShortLength *lengths = callocOrExit(reads->readCount, ShortLength);
1916 	ShortLength index;
1917 	int lengthOffset = wordLength - 1;
1918 
1919 	for (index = 0; index < reads->readCount; index++)
1920 		lengths[index] =
1921 		    getLength(getTightStringInArray(reads->tSequences, index)) - lengthOffset;
1922 
1923 	return lengths;
1924 }
1925