1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3 
4     This file is part of Velvet.
5 
6     Velvet is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10 
11     Velvet is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with Velvet; if not, write to the Free Software
18     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <string.h>
24 #include <math.h>
25 #include <time.h>
26 
27 #include "globals.h"
28 #include "tightString.h"
29 #include "readSet.h"
30 #include "utility.h"
31 
32 #if defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
33 #include "../../third-party/zlib-1.2.3/Win32/include/zlib.h"
34 #else
35 #include "../../third-party/zlib-1.2.3/zlib.h"
36 #endif
37 
38 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
39 #  include <fcntl.h>
40 #  include <io.h>
41 #  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
42 #else
43 #  define SET_BINARY_MODE(file)
44 #endif
45 
newReadSet()46 ReadSet *newReadSet()
47 {
48 	ReadSet *rs = callocOrExit(1, ReadSet);
49 	return rs;
50 }
51 
velvetifySequence(char * str)52 static void velvetifySequence(char * str) {
53 	int i = strlen(str) - 1;
54 	char c;
55 
56 	for (i = strlen(str) - 1; i >= 0; i--) {
57 		c = str[i];
58 		switch (c) {
59 		case '\n':
60 		case '\r':
61 		case EOF:
62 			str[i] = '\0';
63 			break;
64 		case 'C':
65 		case 'c':
66 			str[i] = 'C';
67 			break;
68 		case 'G':
69 		case 'g':
70 			str[i] = 'G';
71 			break;
72 		case 'T':
73 		case 't':
74 			str[i] = 'T';
75 			break;
76 		default:
77 			str[i] = 'A';
78 		}
79 	}
80 }
81 
reverseComplementSequence(char * str)82 static void reverseComplementSequence(char * str)
83 {
84        size_t length = strlen(str);
85        size_t i;
86 
87        for (i = 0; i < length-1 - i; i++) {
88 	       char c = str[i];
89 	       str[i] = str[length-1 - i];
90 	       str[length-1 - i] = c;
91        }
92 
93        for (i = 0; i < length; i++) {
94 	       switch (str[i]) {
95 	       case 'A':
96 	       case 'a':
97 		       str[i] = 'T';
98 		       break;
99 	       case 'C':
100 	       case 'c':
101 		       str[i] = 'G';
102 		       break;
103 	       case 'G':
104 	       case 'g':
105 		       str[i] = 'C';
106 		       break;
107 	       // As in velvetifySequence(), anything unusual ends up as 'A'
108 	       default:
109 		       str[i] = 'A';
110 		       break;
111 	       }
112        }
113 }
114 
writeFastaSequence(FILE * outfile,const char * str)115 static void writeFastaSequence(FILE * outfile, const char * str)
116 {
117        size_t length = strlen(str);
118        size_t start;
119        for (start = 0; start < length; start += 60)
120 	       fprintf(outfile, "%.60s\n", &str[start]);
121 }
122 
newReadSetAroundTightStringArray(TightString ** array,IDnum length)123 ReadSet *newReadSetAroundTightStringArray(TightString ** array,
124 					  IDnum length)
125 {
126 	ReadSet *rs = newReadSet();
127 	rs->tSequences = array;
128 	rs->readCount = length;
129 	return rs;
130 }
131 
concatenateReadSets(ReadSet * A,ReadSet * B)132 void concatenateReadSets(ReadSet * A, ReadSet * B)
133 {
134 	ReadSet tmp;
135 	IDnum index;
136 
137 	// Read count:
138 	tmp.readCount = A->readCount + B->readCount;
139 
140 	// Sequences
141 	if (A->sequences != NULL || B->sequences != NULL) {
142 		tmp.sequences = mallocOrExit(tmp.readCount, char *);
143 		if (A->sequences != NULL) {
144 			for (index = 0; index < A->readCount; index++)
145 				tmp.sequences[index] = A->sequences[index];
146 			free(A->sequences);
147 		} else
148 			for (index = 0; index < A->readCount; index++)
149 				tmp.sequences[index] = NULL;
150 
151 		if (B->sequences != NULL) {
152 			for (index = 0; index < B->readCount; index++)
153 				tmp.sequences[A->readCount + index] =
154 				    B->sequences[index];
155 			free(B->sequences);
156 		} else
157 			for (index = 0; index < B->readCount; index++)
158 				tmp.sequences[A->readCount + index] = NULL;
159 	} else
160 		tmp.sequences = NULL;
161 
162 	// tSequences
163 	if (A->tSequences != NULL || B->tSequences != NULL) {
164 		tmp.tSequences =
165 		    mallocOrExit(tmp.readCount, TightString *);
166 
167 		if (A->tSequences != NULL) {
168 			for (index = 0; index < A->readCount; index++)
169 				tmp.tSequences[index] =
170 				    A->tSequences[index];
171 			free(A->tSequences);
172 		} else
173 			for (index = 0; index < A->readCount; index++)
174 				tmp.tSequences[index] = NULL;
175 
176 		if (B->tSequences != NULL) {
177 			for (index = 0; index < B->readCount; index++)
178 				tmp.tSequences[A->readCount + index] =
179 				    B->tSequences[index];
180 			free(B->tSequences);
181 		} else
182 			for (index = 0; index < B->readCount; index++)
183 				tmp.tSequences[A->readCount + index] =
184 				    NULL;
185 	} else
186 		tmp.tSequences = NULL;
187 
188 	// Labels
189 	if (A->labels != NULL || B->labels != NULL) {
190 		tmp.labels = mallocOrExit(tmp.readCount, char *);
191 
192 		if (A->labels != NULL) {
193 			for (index = 0; index < A->readCount; index++)
194 				tmp.labels[index] = A->labels[index];
195 			free(A->labels);
196 		} else
197 			for (index = 0; index < A->readCount; index++)
198 				tmp.labels[index] = NULL;
199 
200 		if (B->labels != NULL) {
201 			for (index = 0; index < B->readCount; index++)
202 				tmp.labels[A->readCount + index] =
203 				    B->labels[index];
204 			free(B->labels);
205 		} else
206 			for (index = 0; index < B->readCount; index++)
207 				tmp.labels[A->readCount + index] = NULL;
208 	} else
209 		tmp.labels = NULL;
210 
211 
212 	// Confidence scores
213 	if (A->confidenceScores != NULL || B->confidenceScores != NULL) {
214 		tmp.confidenceScores =
215 		    mallocOrExit(tmp.readCount, Quality *);
216 
217 		if (A->confidenceScores != NULL) {
218 			for (index = 0; index < A->readCount; index++)
219 				tmp.confidenceScores[index] =
220 				    A->confidenceScores[index];
221 			free(A->confidenceScores);
222 		} else
223 			for (index = 0; index < A->readCount; index++)
224 				tmp.confidenceScores[index] = NULL;
225 
226 		if (B->confidenceScores != NULL) {
227 			for (index = 0; index < B->readCount; index++)
228 				tmp.confidenceScores[A->readCount +
229 						     index] =
230 				    B->confidenceScores[index];
231 			free(B->confidenceScores);
232 		} else
233 			for (index = 0; index < B->readCount; index++)
234 				tmp.confidenceScores[A->readCount +
235 						     index] = NULL;
236 	} else
237 		tmp.confidenceScores = NULL;
238 
239 	// Kmer probabilities
240 	if (A->kmerProbabilities != NULL || B->kmerProbabilities != NULL) {
241 		tmp.kmerProbabilities =
242 		    mallocOrExit(tmp.readCount, Quality *);
243 
244 		if (A->kmerProbabilities != NULL) {
245 			for (index = 0; index < A->readCount; index++)
246 				tmp.kmerProbabilities[index] =
247 				    A->kmerProbabilities[index];
248 			free(A->kmerProbabilities);
249 		} else
250 			for (index = 0; index < A->readCount; index++)
251 				tmp.kmerProbabilities[index] = NULL;
252 
253 		if (B->kmerProbabilities != NULL) {
254 			for (index = 0; index < B->readCount; index++)
255 				tmp.kmerProbabilities[A->readCount +
256 						      index] =
257 				    B->kmerProbabilities[index];
258 			free(B->kmerProbabilities);
259 		} else
260 			for (index = 0; index < B->readCount; index++)
261 				tmp.kmerProbabilities[A->readCount +
262 						      index] = NULL;
263 	} else
264 		tmp.kmerProbabilities = NULL;
265 
266 	// Mate reads
267 	if (A->mateReads != NULL || B->mateReads != NULL) {
268 		tmp.mateReads = mallocOrExit(tmp.readCount, IDnum);
269 
270 		if (A->mateReads != NULL) {
271 			for (index = 0; index < A->readCount; index++)
272 				tmp.mateReads[index] = A->mateReads[index];
273 			free(A->mateReads);
274 		} else
275 			for (index = 0; index < A->readCount; index++)
276 				tmp.mateReads[index] = 0;
277 
278 		if (B->mateReads != NULL) {
279 			for (index = 0; index < B->readCount; index++)
280 				tmp.mateReads[A->readCount + index] =
281 				    B->mateReads[index];
282 			free(B->mateReads);
283 		} else
284 			for (index = 0; index < B->readCount; index++)
285 				tmp.mateReads[A->readCount + index] = 0;
286 	} else
287 		tmp.mateReads = NULL;
288 
289 	// Categories
290 	if (A->categories != NULL || B->categories != NULL) {
291 		tmp.categories = mallocOrExit(tmp.readCount, Quality *);
292 
293 		if (A->categories != NULL) {
294 			for (index = 0; index < A->readCount; index++)
295 				tmp.categories[index] =
296 				    A->categories[index];
297 			free(A->categories);
298 		} else
299 			for (index = 0; index < A->readCount; index++)
300 				tmp.categories[index] = CATEGORIES;
301 
302 		if (B->categories != NULL) {
303 			for (index = 0; index < B->readCount; index++)
304 				tmp.categories[A->readCount + index] =
305 				    B->categories[index];
306 			free(B->categories);
307 		} else
308 			for (index = 0; index < B->readCount; index++)
309 				tmp.categories[A->readCount + index] =
310 				    CATEGORIES;
311 	} else
312 		tmp.categories = NULL;
313 
314 	// Put everything back into A
315 	A->readCount = tmp.readCount;
316 	A->sequences = tmp.sequences;
317 	A->tSequences = tmp.tSequences;
318 	A->labels = tmp.labels;
319 	A->confidenceScores = tmp.confidenceScores;
320 	A->kmerProbabilities = tmp.kmerProbabilities;
321 	A->mateReads = tmp.mateReads;
322 	A->categories = tmp.categories;
323 
324 	// Deallocate
325 	free(B);
326 }
327 
convertSequences(ReadSet * rs)328 void convertSequences(ReadSet * rs)
329 {
330 	rs->tSequences =
331 	    newTightStringArrayFromStringArray(rs->sequences,
332 					       rs->readCount);
333 	rs->sequences = NULL;
334 }
335 
convertQualityScore(Quality score)336 static Probability convertQualityScore(Quality score)
337 {
338 	return (Probability) 1 - pow(10, -score / ((double) 10));
339 }
340 
convertConfidenceScores(ReadSet * rs,int WORDLENGTH)341 void convertConfidenceScores(ReadSet * rs, int WORDLENGTH)
342 {
343 	Quality *baseCallerScores;
344 	Probability *kmerProbabilities;
345 	IDnum index;
346 	Coordinate position;
347 	Probability proba;
348 
349 	rs->kmerProbabilities =
350 	    mallocOrExit(rs->readCount, Probability *);
351 
352 	for (index = 0; index < rs->readCount; index++) {
353 		rs->kmerProbabilities[index] =
354 		    mallocOrExit(getLength(rs->tSequences[index]) - WORDLENGTH +
355 			    1, Probability);
356 		kmerProbabilities = rs->kmerProbabilities[index];
357 		baseCallerScores = rs->confidenceScores[index];
358 
359 		proba = 1;
360 		for (position = 0;
361 		     position < getLength(rs->tSequences[index]);
362 		     position++) {
363 			proba *=
364 			    convertQualityScore(baseCallerScores
365 						[position]);
366 			if (position < WORDLENGTH)
367 				continue;
368 
369 			proba /=
370 			    convertQualityScore(baseCallerScores
371 						[position - WORDLENGTH]);
372 			kmerProbabilities[position - WORDLENGTH + 1] =
373 			    proba;
374 		}
375 
376 		rs->confidenceScores[index] = NULL;
377 		free(baseCallerScores);
378 	}
379 
380 	free(rs->confidenceScores);
381 	rs->confidenceScores = NULL;
382 }
383 
categorizeReads(ReadSet * readSet,Category category)384 void categorizeReads(ReadSet * readSet, Category category)
385 {
386 	IDnum index;
387 
388 	if (readSet->categories == NULL)
389 		readSet->categories =
390 		    mallocOrExit(readSet->readCount, Category);
391 
392 	for (index = 0; index < readSet->readCount; index++)
393 		readSet->categories[index] = category;
394 }
395 
simplifyReads(ReadSet * readSet)396 void simplifyReads(ReadSet * readSet)
397 {
398 	IDnum index;
399 
400 	if (readSet->categories == NULL)
401 		readSet->categories =
402 		    mallocOrExit(readSet->readCount, Category);
403 
404 	for (index = 0; index < readSet->readCount; index++) {
405 		if (readSet->categories[index] < CATEGORIES) {
406 			readSet->categories[index] /= 2;
407 			readSet->categories[index] *= 2;
408 		}
409 	}
410 }
411 
exportIDMapping(char * filename,ReadSet * reads)412 void exportIDMapping(char *filename, ReadSet * reads)
413 {
414 	IDnum index;
415 	FILE *outfile = fopen(filename, "w");
416 
417 	if (outfile == NULL) {
418 		printf("Couldn't open %s, sorry\n", filename);
419 		return;
420 	} else
421 		puts("Writing into file...");
422 
423 	if (reads->labels == NULL) {
424 		fclose(outfile);
425 		return;
426 	}
427 
428 	for (index = 0; index < reads->readCount; index++)
429 		if (reads->labels != NULL)
430 			fprintf(outfile, "s/SEQUENCE %ld/%s/\n", (long) (index + 1),
431 				reads->labels[index]);
432 
433 	fclose(outfile);
434 
435 }
436 
437 // Returns the value of a 32-bit little-endian-stored integer.
int32(const unsigned char * ptr)438 static int int32(const unsigned char * ptr)
439 {
440 	int x = ptr[3];
441 	x = (x << 8) | ptr[2];
442 	x = (x << 8) | ptr[1];
443 	x = (x << 8) | ptr[0];
444 	return x;
445 }
446 
447 // Imports sequences from a fastq file
448 // Memory space allocated within this function.
readSolexaFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)449 static void readSolexaFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
450 {
451 	FILE *file = fopen(filename, "r");
452 	IDnum counter = 0;
453 	const int maxline = 500;
454 	char line[500];
455 	char readName[500];
456 	char readSeq[500];
457 	char str[100];
458 	Coordinate start;
459 
460 	if (strcmp(filename, "-"))
461 		file = fopen(filename, "r");
462 	else
463 		file = stdin;
464 
465 	if (file != NULL)
466 		printf("Reading Solexa file %s\n", filename);
467 	else
468 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
469 
470 	while (fgets(line, maxline, file) != NULL)
471 		if (strchr(line, '.') == NULL) {
472 			sscanf(line, "%s\t%*i\t%*i\t%*i\t%*c%[^\n]",
473 			       readName, readSeq);
474 			fprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
475 			velvetifySequence(readSeq);
476 			start = 0;
477 			while (start <= strlen(readSeq)) {
478 				strncpy(str, readSeq + start, 60);
479 				str[60] = '\0';
480 				fprintf(outfile, "%s\n", str);
481 				start += 60;
482 			}
483 
484 			counter++;
485 		}
486 
487 	fclose(file);
488 
489 	printf("%d sequences found\n", counter);
490 	puts("Done");
491 }
492 
readElandFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)493 static void readElandFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
494 {
495 	FILE *file = fopen(filename, "r");
496 	IDnum counter = 0;
497 	const int maxline = 5000;
498 	char line[5000];
499 	char readName[5000];
500 	char readSeq[5000];
501 	char str[100];
502 	Coordinate start;
503 
504 	if (strcmp(filename, "-"))
505 		file = fopen(filename, "r");
506 	else
507 		file = stdin;
508 
509 	if (file != NULL)
510 		printf("Reading Solexa file %s\n", filename);
511 	else
512 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
513 
514 	// Reopen file and memorize line:
515 	while (fgets(line, maxline, file) != NULL) {
516 		sscanf(line, "%[^\t]\t%[^\t\n]",
517 		       readName, readSeq);
518 		fprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
519 		velvetifySequence(readSeq);
520 		start = 0;
521 		while (start <= strlen(readSeq)) {
522 			strncpy(str, readSeq + start, 60);
523 			str[60] = '\0';
524 			fprintf(outfile, "%s\n", str);
525 			start += 60;
526 		}
527 
528 		counter++;
529 	}
530 
531 	fclose(file);
532 
533 	printf("%d sequences found\n", counter);
534 	puts("Done");
535 }
536 
goToEndOfLine(char * line,FILE * file)537 void goToEndOfLine(char *line, FILE * file)
538 {
539 	size_t length = strlen(line);
540 	char c = line[length - 1];
541 
542 	while (c != '\n')
543 		c = fgetc(file);
544 }
545 
546 // Imports sequences from a fastq file
547 // Memory space allocated within this function.
readFastQFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)548 static void readFastQFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
549 {
550 	FILE *file;
551 	const int maxline = 5000;
552 	char line[5000];
553 	char str[100];
554 	IDnum counter = 0;
555 	Coordinate start, i;
556 	char c;
557 
558 	if (strcmp(filename, "-"))
559 		file = fopen(filename, "r");
560 	else
561 		file = stdin;
562 
563 	if (file != NULL)
564 		printf("Reading FastQ file %s\n", filename);
565 	else
566 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
567 
568 	// Checking if FastQ
569 	c = getc(file);
570 	if (c != '@')
571 		exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
572 	ungetc(c, file);
573 
574 	while(fgets(line, maxline, file)) {
575 
576 		for (i = strlen(line) - 1;
577 		     i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
578 			line[i] = '\0';
579 		}
580 
581 		fprintf(outfile,">%s\t%ld\t%d\n", line + 1, (long) ((*sequenceIndex)++), (int) cat);
582 		counter++;
583 
584 		if(!fgets(line, maxline, file))
585 			exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
586 
587 		velvetifySequence(line);
588 
589 		start = 0;
590 		while (start <= strlen(line)) {
591 			strncpy(str, line + start, 60);
592 			str[60] = '\0';
593 			fprintf(outfile, "%s\n", str);
594 			start += 60;
595 		}
596 
597 		if(!fgets(line, maxline, file))
598 			exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
599 		if(!fgets(line, maxline, file))
600 			exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
601 	}
602 
603 	fclose(file);
604 	printf("%d reads found.\n", counter);
605 	puts("Done");
606 }
607 
608 // Imports sequences from a zipped rfastq file
609 // Memory space allocated within this function.
readFastQGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)610 static void readFastQGZFile(FILE * outfile, char *filename, Category cat, IDnum *sequenceIndex)
611 {
612 	gzFile file;
613 	const int maxline = 5000;
614 	char line[5000];
615 	char str[100];
616 	IDnum counter = 0;
617 	Coordinate start, i;
618 	char c;
619 
620 	if (strcmp(filename, "-"))
621 		file = gzopen(filename, "rb");
622 	else {
623 		file = gzdopen(fileno(stdin), "rb");
624 		SET_BINARY_MODE(stdin);
625 	}
626 
627 	if (file != NULL)
628 		printf("Reading FastQ file %s\n", filename);
629 	else
630 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
631 
632 	// Checking if FastQ
633 	c = gzgetc(file);
634 	if (c != '@')
635 		exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
636 	gzungetc(c, file);
637 
638 	while (gzgets(file, line, maxline)) {
639 		for (i = strlen(line) - 1;
640 		     i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
641 			line[i] = '\0';
642 		}
643 
644 		fprintf(outfile,">%s\t%ld\t%d\n", line + 1, (long) ((*sequenceIndex)++), (int) cat);
645 		counter++;
646 
647 		gzgets(file, line, maxline);
648 
649 		velvetifySequence(line);
650 
651 		start = 0;
652 		while (start <= strlen(line)) {
653 			strncpy(str, line + start, 60);
654 			str[60] = '\0';
655 			fprintf(outfile, "%s\n", str);
656 			start += 60;
657 		}
658 
659 		gzgets(file, line, maxline);
660 		gzgets(file, line, maxline);
661 	}
662 
663 	gzclose(file);
664 	printf("%d reads found.\n", counter);
665 	puts("Done");
666 }
667 
668 // Imports sequences from a fasta file
669 // Memory is allocated within the function
readFastAFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)670 static void readFastAFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
671 {
672 	FILE *file;
673 	const int maxline = 5000;
674 	char line[5000];
675 	char str[100];
676 	IDnum counter = 0;
677 	Coordinate i;
678 	char c;
679 	Coordinate start;
680 	int offset = 0;
681 
682 	if (strcmp(filename, "-"))
683 		file = fopen(filename, "r");
684 	else
685 		file = stdin;
686 
687 	if (file != NULL)
688 		printf("Reading FastA file %s;\n", filename);
689 	else
690 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
691 
692 	// Checking if FastA
693 	c = getc(file);
694 	if (c != '>')
695 		exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
696 	ungetc(c, file);
697 
698 	while (fgets(line, maxline, file)) {
699 		if (line[0] == '>') {
700 			if (strchr(line,'\t')) {
701 				printf("FastA headers in %s contain tabs, please remove them.\n", filename);
702 				printf("E.g.: %s", line);
703 				puts("Exiting");
704 				exit(1);
705 			}
706 
707 			if (offset != 0) {
708 				fprintf(outfile, "\n");
709 				offset = 0;
710 			}
711 
712 			for (i = strlen(line) - 1;
713 			     i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
714 				line[i] = '\0';
715 			}
716 
717 			fprintf(outfile,"%s\t%ld\t%d\n", line, (long) ((*sequenceIndex)++), (int) cat);
718 			counter++;
719 		} else {
720 			velvetifySequence(line);
721 			start = 0;
722 			while (start < strlen(line)) {
723 				strncpy(str, line + start, 60 - offset);
724 				str[60 - offset] = '\0';
725 				fprintf(outfile, "%s", str);
726 				offset += strlen(str);
727 				if (offset >= 60) {
728 					fprintf(outfile, "\n");
729 					offset = 0;
730 				}
731 				start += strlen(str);
732 			}
733 		}
734 	}
735 
736 	if (offset != 0)
737 		fprintf(outfile, "\n");
738 	fclose(file);
739 
740 	printf("%d sequences found\n", counter);
741 	puts("Done");
742 }
743 
744 // Imports sequences from a zipped fasta file
745 // Memory is allocated within the function
readFastAGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)746 static void readFastAGZFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
747 {
748 	gzFile file;
749 	const int maxline = 5000;
750 	char line[5000];
751 	char str[100];
752 	IDnum counter = 0;
753 	Coordinate i, start;
754 	char c;
755 	int offset = 0;
756 
757 	if (strcmp(filename, "-"))
758 		file = gzopen(filename, "rb");
759 	else {
760 		file = gzdopen(fileno(stdin), "rb");
761 		SET_BINARY_MODE(stdin);
762 	}
763 
764 	if (file != NULL)
765 		printf("Reading zipped FastA file %s;\n", filename);
766 	else
767 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
768 
769 	// Checking if FastA
770 	c = gzgetc(file);
771 	if (c != '>')
772 		exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
773 	gzungetc(c, file);
774 
775 	while (gzgets(file, line, maxline)) {
776 		if (line[0] == '>') {
777 			if (offset != 0) {
778 				fprintf(outfile, "\n");
779 				offset = 0;
780 			}
781 
782 			for (i = strlen(line) - 1;
783 			     i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
784 				line[i] = '\0';
785 			}
786 
787 			fprintf(outfile, "%s\t%ld\t%d\n", line, (long) ((*sequenceIndex)++), (int) cat);
788 			counter++;
789 		} else {
790 			velvetifySequence(line);
791 
792 			start = 0;
793 			while (start < strlen(line)) {
794 				strncpy(str, line + start, 60 - offset);
795 				str[60 - offset] = '\0';
796 				fprintf(outfile, "%s", str);
797 				offset += strlen(str);
798 				if (offset >= 60) {
799 					fprintf(outfile, "\n");
800 					offset = 0;
801 				}
802 				start += strlen(str);
803 			}
804 		}
805 	}
806 
807 	if (offset != 0)
808 		fprintf(outfile, "\n");
809 	gzclose(file);
810 
811 	printf("%d sequences found\n", counter);
812 	puts("Done");
813 }
814 
815 // Parser for new output
readMAQGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)816 static void readMAQGZFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
817 {
818 	gzFile file;
819 	const int maxline = 1000;
820 	char line[1000];
821 	IDnum counter = 0;
822 	char readName[500];
823 	char readSeq[500];
824 	char str[100];
825 	Coordinate start;
826 
827 	if (strcmp(filename, "-"))
828 		file = gzopen(filename, "rb");
829 	else {
830 		file = gzdopen(fileno(stdin), "rb");
831 		SET_BINARY_MODE(stdin);
832 	}
833 
834 	if (file != NULL)
835 		printf("Reading zipped MAQ file %s\n", filename);
836 	else
837 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
838 
839 	// Reopen file and memorize line:
840 	while (gzgets(file, line, maxline)) {
841 		sscanf(line, "%s\t%*i\t%*i\t%*c\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%[^\t]",
842 		       readName, readSeq);
843 		fprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
844 		velvetifySequence(readSeq);
845 		start = 0;
846 		while (start <= strlen(readSeq)) {
847 			strncpy(str, readSeq + start, 60);
848 			str[60] = '\0';
849 			fprintf(outfile, "%s\n", str);
850 			start += 60;
851 		}
852 
853 		counter++;
854 	}
855 
856 	gzclose(file);
857 
858 	printf("%d sequences found\n", counter);
859 	puts("Done");
860 }
861 
readSAMFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)862 static void readSAMFile(FILE *outfile, char *filename, Category cat, IDnum *sequenceIndex)
863 {
864 	char line[5000];
865 	unsigned long lineno, readCount;
866 	char previous_qname_pairing[10];
867 	char previous_qname[5000];
868 	char previous_seq[5000];
869 	boolean previous_paired = false;
870 
871 	FILE *file = (strcmp(filename, "-") != 0)? fopen(filename, "r") : stdin;
872 	if (file)
873 		printf("Reading SAM file %s\n", filename);
874 	else
875 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
876 
877 	readCount = 0;
878 	for (lineno = 1; fgets(line, sizeof(line), file); lineno++)
879 		if (line[0] != '@') {
880 			char *qname, *flag, *seq;
881 			int i;
882 
883 			qname = strtok(line, "\t");
884 			flag  = strtok(NULL, "\t");
885 			for (i = 3; i < 10; i++)
886 				(void) strtok(NULL, "\t");
887 			seq = strtok(NULL, "\t");
888 
889 			if (seq == NULL) {
890 				fprintf(stderr,
891 					"Line #%lu: ignoring SAM record with too few fields\n",
892 					lineno);
893 			}
894 			else if (strcmp(seq, "*") == 0) {
895 				fprintf(stderr,
896 					"Line #%lu: ignoring SAM record with omitted SEQ field\n",
897 					lineno);
898 			}
899 			else {
900 				// Accept flags represented in either decimal or hex:
901 				int flagbits = strtol(flag, NULL, 0);
902 
903 				const char *qname_pairing = "";
904 				if (flagbits & 0x40)
905 					qname_pairing = "/1";
906 				else if (flagbits & 0x80)
907 					qname_pairing = "/2";
908 
909 				if (flagbits & 0x10)
910 					reverseComplementSequence(seq);
911 
912 				// Determine if paired to previous read
913 				if (readCount > 0) {
914 					if (cat % 2) {
915 						if (previous_paired) {
916 							// Last read paired to penultimate read
917 							fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
918 								(long) ((*sequenceIndex)++), (int) cat);
919 							writeFastaSequence(outfile, previous_seq);
920 							previous_paired = false;
921 						} else if (strcmp(qname, previous_qname) == 0) {
922 							// Last read paired to current reads
923 							fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
924 								(long) ((*sequenceIndex)++), (int) cat);
925 							writeFastaSequence(outfile, previous_seq);
926 							previous_paired = true;
927 						} else {
928 							// Last read unpaired
929 							fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
930 								(long) ((*sequenceIndex)++), (int) cat - 1);
931 							writeFastaSequence(outfile, previous_seq);
932 							previous_paired = false;
933 						}
934 					} else {
935 						// Unpaired dataset
936 						fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
937 							(long) ((*sequenceIndex)++), (int) cat);
938 						writeFastaSequence(outfile, previous_seq);
939 					}
940 				}
941 
942 				strcpy(previous_qname, qname);
943 				strcpy(previous_qname_pairing, qname_pairing);
944 				strcpy(previous_seq, seq);
945 				velvetifySequence(previous_seq);
946 
947 				readCount++;
948 			}
949 		}
950 
951 	if (readCount) {
952 		if (cat % 2) {
953 			if (previous_paired) {
954 				// Last read paired to penultimate read
955 				fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
956 					(long) ((*sequenceIndex)++), (int) cat);
957 				writeFastaSequence(outfile, previous_seq);
958 			} else {
959 				// Last read unpaired
960 				fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
961 					(long) ((*sequenceIndex)++), (int) cat - 1);
962 				writeFastaSequence(outfile, previous_seq);
963 			}
964 		} else {
965 			// Unpaired dataset
966 			fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
967 				(long) ((*sequenceIndex)++), (int) cat);
968 			writeFastaSequence(outfile, previous_seq);
969 		}
970 
971 	}
972 
973 	fclose(file);
974 	printf("%lu reads found.\nDone\n", readCount);
975 }
976 
readBAMint32(gzFile file)977 static int readBAMint32(gzFile file)
978 {
979 	unsigned char buffer[4];
980 	if (gzread(file, buffer, 4) != 4)
981 		exitErrorf(EXIT_FAILURE, false, "BAM file header truncated");
982 
983 	return int32(buffer);
984 }
985 
readBAMFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)986 static void readBAMFile(FILE *outfile, char *filename, Category cat, IDnum *sequenceIndex)
987 {
988 	size_t seqCapacity = 0;
989 	char *seq = NULL;
990 	size_t bufferCapacity = 4;
991 	unsigned char *buffer = mallocOrExit(bufferCapacity, unsigned char);
992 	unsigned long recno, readCount;
993 	int i, refCount;
994 	gzFile file;
995 	char previous_qname_pairing[10];
996 	char previous_qname[5000];
997 	char previous_seq[5000];
998 	boolean previous_paired = false;
999 
1000 	if (strcmp(filename, "-") != 0)
1001 		file = gzopen(filename, "rb");
1002 	else {
1003 		file = gzdopen(fileno(stdin), "rb");
1004 		SET_BINARY_MODE(stdin);
1005 	}
1006 
1007 	if (file != NULL)
1008 		printf("Reading BAM file %s\n", filename);
1009 	else
1010 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1011 
1012 	if (! (gzread(file, buffer, 4) == 4 && memcmp(buffer, "BAM\1", 4) == 0))
1013 		exitErrorf(EXIT_FAILURE, false, "%s is not in BAM format", filename);
1014 
1015 	// Skip header text
1016 	if (gzseek(file, readBAMint32(file), SEEK_CUR) == -1)
1017 		exitErrorf(EXIT_FAILURE, false, "gzseek failed");
1018 
1019 	// Skip header reference list
1020 	refCount = readBAMint32(file);
1021 	for (i = 0; i < refCount; i++) {
1022 		if (gzseek(file, readBAMint32(file) + 4, SEEK_CUR) == -1)
1023 			exitErrorf(EXIT_FAILURE, false, "gzseek failed");
1024 	}
1025 
1026 	readCount = 0;
1027 	for (recno = 1; gzread(file, buffer, 4) == 4; recno++) {
1028 		int blockSize = int32(buffer);
1029 		int readLength;
1030 
1031 		if (bufferCapacity < 4 + blockSize) {
1032 			bufferCapacity = 4 + blockSize + 4096;
1033 			buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1034 		}
1035 
1036 		if (gzread(file, &buffer[4], blockSize) != blockSize)
1037 			exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1038 
1039 		readLength = int32(&buffer[20]);
1040 		if (readLength == 0) {
1041 			fprintf(stderr,
1042 				"Record #%lu: ignoring BAM record with omitted SEQ field\n",
1043 				recno);
1044 		}
1045 		else {
1046 			int readNameLength = buffer[12];
1047 			int flag_nc = int32(&buffer[16]);
1048 			int flagbits = flag_nc >> 16;
1049 			int cigarLength = flag_nc & 0xffff;
1050 			char *qname = (char *)&buffer[36];
1051 			unsigned char *rawseq =
1052 					&buffer[36 + readNameLength + 4 * cigarLength];
1053 
1054 			const char *qname_pairing = "";
1055 			if (flagbits & 0x40)
1056 				qname_pairing = "/1";
1057 			else if (flagbits & 0x80)
1058 				qname_pairing = "/2";
1059 
1060 			if (seqCapacity < readLength + 1) {
1061 				seqCapacity = readLength * 2 + 1;
1062 				seq = reallocOrExit(seq, seqCapacity, char);
1063 			}
1064 
1065 			for (i = 0; i < readLength; i += 2) {
1066 				static const char decode_bases[] = "=ACMGRSVTWYHKDBN";
1067 				unsigned int packed = *rawseq++;
1068 				seq[i] = decode_bases[packed >> 4];
1069 				seq[i+1] = decode_bases[packed & 0xf];
1070 			}
1071 			seq[readLength] = '\0';
1072 
1073 			if (flagbits & 0x10)
1074 				reverseComplementSequence(seq);
1075 
1076 			// Determine if paired to previous read
1077 			if (readCount > 0) {
1078 				if (cat % 2) {
1079 					 if (previous_paired) {
1080 						// Last read paired to penultimate read
1081 						fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1082 							(long) ((*sequenceIndex)++), (int) cat);
1083 						writeFastaSequence(outfile, previous_seq);
1084 						previous_paired = false;
1085 					} else if (strcmp(qname, previous_qname) == 0) {
1086 						// Last read paired to current reads
1087 						fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1088 							(long) ((*sequenceIndex)++), (int) cat);
1089 						writeFastaSequence(outfile, previous_seq);
1090 						previous_paired = true;
1091 					} else {
1092 						// Last read unpaired
1093 						fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1094 							(long) ((*sequenceIndex)++), (int) cat - 1);
1095 						writeFastaSequence(outfile, previous_seq);
1096 						previous_paired = false;
1097 					}
1098 				} else {
1099 					// Unpaired dataset
1100 					fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1101 						(long) ((*sequenceIndex)++), (int) cat);
1102 					writeFastaSequence(outfile, previous_seq);
1103 				}
1104 			}
1105 
1106 			strcpy(previous_qname, qname);
1107 			strcpy(previous_qname_pairing, qname_pairing);
1108 			strcpy(previous_seq, seq);
1109 			velvetifySequence(previous_seq);
1110 
1111 			readCount++;
1112 		}
1113 	}
1114 
1115 	if (readCount) {
1116 		if (cat % 2) {
1117 			if (previous_paired) {
1118 				// Last read paired to penultimate read
1119 				fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1120 					(long) ((*sequenceIndex)++), (int) cat);
1121 				writeFastaSequence(outfile, previous_seq);
1122 			} else {
1123 				// Last read unpaired
1124 				fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1125 					(long) ((*sequenceIndex)++), (int) cat - 1);
1126 				writeFastaSequence(outfile, previous_seq);
1127 			}
1128 		} else {
1129 			// Unpaired dataset
1130 			fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1131 				(long) ((*sequenceIndex)++), (int) cat);
1132 			writeFastaSequence(outfile, previous_seq);
1133 		}
1134 	}
1135 
1136 	free(seq);
1137 	free(buffer);
1138 
1139 	gzclose(file);
1140 	printf("%lu reads found.\nDone\n", readCount);
1141 }
1142 
1143 #define FASTQ 1
1144 #define FASTA 2
1145 #define GERALD 3
1146 #define ELAND 4
1147 #define FASTA_GZ 5
1148 #define FASTQ_GZ 6
1149 #define MAQ_GZ 7
1150 #define SAM 8
1151 #define BAM 9
1152 
1153 // General argument parser for most functions
1154 // Basically a reused portion of toplevel code dumped into here
parseDataAndReadFiles(char * filename,int argc,char ** argv,boolean * double_strand)1155 void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * double_strand)
1156 {
1157 	int argIndex = 1;
1158 	FILE *outfile = fopen(filename, "w");
1159 	int filetype = FASTA;
1160 	Category cat = 0;
1161 	IDnum sequenceIndex = 1;
1162 	short short_var;
1163 
1164 	if (argc < 2) {
1165 		puts("Wrong number of arguments!");
1166 		puts("Correct usage:");
1167 		puts("run -<filetype> <list of files> [...] ");
1168 		puts("Allowed filetypes:");
1169 		puts("\t-fasta");
1170 		puts("\t-fastq");
1171 		puts("\t-solexa");
1172 		puts("\t-eland");
1173 		puts("If reading exclusively fasta file, the -fasta parameter is not necessary");
1174 		exit(1);
1175 	}
1176 
1177 	for (argIndex = 1; argIndex < argc; argIndex++) {
1178 		if (argv[argIndex][0] == '-' && strlen(argv[argIndex]) > 1) {
1179 
1180 			if (strcmp(argv[argIndex], "-fastq") == 0)
1181 				filetype = FASTQ;
1182 			else if (strcmp(argv[argIndex], "-fasta") == 0)
1183 				filetype = FASTA;
1184 			else if (strcmp(argv[argIndex], "-gerald") == 0)
1185 				filetype = GERALD;
1186 			else if (strcmp(argv[argIndex], "-eland") == 0)
1187 				filetype = ELAND;
1188 			else if (strcmp(argv[argIndex], "-fastq.gz") == 0)
1189 				filetype = FASTQ_GZ;
1190 			else if (strcmp(argv[argIndex], "-fasta.gz") == 0)
1191 				filetype = FASTA_GZ;
1192 			else if (strcmp(argv[argIndex], "-sam") == 0)
1193 				filetype = SAM;
1194 			else if (strcmp(argv[argIndex], "-bam") == 0)
1195 				filetype = BAM;
1196 			else if (strcmp(argv[argIndex], "-maq.gz") == 0)
1197 				filetype = MAQ_GZ;
1198 			else if (strcmp(argv[argIndex], "-short") == 0)
1199 				cat = 0;
1200 			else if (strcmp(argv[argIndex], "-shortPaired") ==
1201 				 0)
1202 				cat = 1;
1203 			else if (strncmp
1204 				 (argv[argIndex], "-shortPaired",
1205 				  12) == 0) {
1206 				sscanf(argv[argIndex], "-shortPaired%hd", &short_var);
1207 				cat = (Category) short_var;
1208 				if (cat < 1 || cat > CATEGORIES) {
1209 					printf("Unknown option: %s\n",
1210 					       argv[argIndex]);
1211 					exit(1);
1212 				}
1213 				cat--;
1214 				cat *= 2;
1215 				cat++;
1216 			} else if (strncmp(argv[argIndex], "-short", 6) ==
1217 				   0) {
1218 				sscanf(argv[argIndex], "-short%hd", &short_var);
1219 				cat = (Category) short_var;
1220 				if (cat < 1 || cat > CATEGORIES) {
1221 					printf("Unknown option: %s\n",
1222 					       argv[argIndex]);
1223 					exit(1);
1224 				}
1225 				cat--;
1226 				cat *= 2;
1227 			} else if (strcmp(argv[argIndex], "-long") == 0)
1228 				cat = CATEGORIES * 2;
1229 			else if (strcmp(argv[argIndex], "-longPaired") ==
1230 				 0)
1231 				cat = CATEGORIES * 2 + 1;
1232 			else if (strcmp(argv[argIndex], "-strand_specific")
1233 				 == 0)
1234 				*double_strand = false;
1235 			else {
1236 				printf("Unknown option: %s\n",
1237 				       argv[argIndex]);
1238 				exit(1);
1239 			}
1240 
1241 			continue;
1242 		}
1243 
1244 		if (cat == -1)
1245 			continue;
1246 
1247 		switch (filetype) {
1248 		case FASTA:
1249 			readFastAFile(outfile, argv[argIndex], cat, &sequenceIndex);
1250 			break;
1251 		case FASTQ:
1252 			readFastQFile(outfile, argv[argIndex], cat, &sequenceIndex);
1253 			break;
1254 		case GERALD:
1255 			readSolexaFile(outfile, argv[argIndex], cat, &sequenceIndex);
1256 			break;
1257 		case ELAND:
1258 			readElandFile(outfile, argv[argIndex], cat, &sequenceIndex);
1259 			break;
1260 		case FASTA_GZ:
1261 			readFastAGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1262 			break;
1263 		case FASTQ_GZ:
1264 			readFastQGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1265 			break;
1266 		case SAM:
1267 			readSAMFile(outfile, argv[argIndex], cat, &sequenceIndex);
1268 			break;
1269 		case BAM:
1270 			readBAMFile(outfile, argv[argIndex], cat, &sequenceIndex);
1271 			break;
1272 		case MAQ_GZ:
1273 			readMAQGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1274 			break;
1275 		default:
1276 			puts("Screw up in parser... exiting");
1277 			exit(1);
1278 		}
1279 	}
1280 
1281 	fclose(outfile);
1282 }
1283 
createReadPairingArray(ReadSet * reads)1284 void createReadPairingArray(ReadSet* reads) {
1285 	IDnum index;
1286 	IDnum *mateReads = mallocOrExit(reads->readCount, IDnum);
1287 
1288 	for (index = 0; index < reads->readCount; index++)
1289 		mateReads[index] = -1;
1290 
1291 	reads->mateReads = mateReads;
1292 }
1293 
pairUpReads(ReadSet * reads,Category cat)1294 boolean pairUpReads(ReadSet * reads, Category cat)
1295 {
1296 	int phase = 0;
1297 	IDnum index;
1298 	boolean found = false;
1299 
1300 	for (index = 0; index < reads->readCount; index++) {
1301 		if (reads->categories[index] != cat) {
1302 			if (phase == 1) {
1303 				reads->mateReads[index - 1] = -1;
1304 				reads->categories[index - 1]--;
1305 				phase = 0;
1306 			}
1307 		} else if (phase == 0) {
1308 			found = true;
1309 			reads->mateReads[index] = index + 1;
1310 			phase = 1;
1311 		} else {
1312 			found = true;
1313 			reads->mateReads[index] = index - 1;
1314 			phase = 0;
1315 		}
1316 	}
1317 
1318 	return found;
1319 }
1320 
detachDubiousReads(ReadSet * reads,boolean * dubiousReads)1321 void detachDubiousReads(ReadSet * reads, boolean * dubiousReads)
1322 {
1323 	IDnum index;
1324 	IDnum pairID;
1325 	IDnum sequenceCount = reads->readCount;
1326 	IDnum *mateReads = reads->mateReads;
1327 
1328 	if (dubiousReads == NULL || mateReads == NULL)
1329 		return;
1330 
1331 	for (index = 0; index < sequenceCount; index++) {
1332 		if (!dubiousReads[index])
1333 			continue;
1334 
1335 		pairID = mateReads[index];
1336 
1337 		if (pairID != -1) {
1338 			//printf("Separating %d and %d\n", index, pairID);
1339 			mateReads[index] = -1;
1340 			mateReads[pairID] = -1;
1341 		}
1342 	}
1343 }
1344 
exportRead(FILE * outfile,ReadSet * reads,IDnum index)1345 static void exportRead(FILE * outfile, ReadSet * reads, IDnum index)
1346 {
1347 	Coordinate start, finish;
1348 	char str[100];
1349 	TightString *sequence = reads->tSequences[index];
1350 
1351 	if (sequence == NULL)
1352 		return;
1353 
1354 	fprintf(outfile, ">SEQUENCE_%ld_length_%lld", (long) index,
1355 		(long long) getLength(sequence));
1356 
1357 	if (reads->categories != NULL)
1358 		fprintf(outfile, "\t%i", (int) reads->categories[index]);
1359 
1360 	fprintf(outfile, "\n");
1361 
1362 	start = 0;
1363 	while (start <= getLength(sequence)) {
1364 		finish = start + 60;
1365 		readTightStringFragment(sequence, start, finish, str);
1366 		fprintf(outfile, "%s\n", str);
1367 		start = finish;
1368 	}
1369 
1370 	fflush(outfile);
1371 }
1372 
exportReadSet(char * filename,ReadSet * reads)1373 void exportReadSet(char *filename, ReadSet * reads)
1374 {
1375 	IDnum index;
1376 	FILE *outfile = fopen(filename, "w+");
1377 
1378 	if (outfile == NULL) {
1379 		puts("Couldn't open file, sorry");
1380 		return;
1381 	} else
1382 		printf("Writing into readset file: %s\n", filename);
1383 
1384 	for (index = 0; index < reads->readCount; index++) {
1385 		exportRead(outfile, reads, index);
1386 	}
1387 
1388 	fclose(outfile);
1389 
1390 	puts("Done");
1391 }
1392 
importReadSet(char * filename)1393 ReadSet *importReadSet(char *filename)
1394 {
1395 	FILE *file = fopen(filename, "r");
1396 	char *sequence = NULL;
1397 	Coordinate bpCount = 0;
1398 	const int maxline = 5000;
1399 	char line[5000];
1400 	IDnum sequenceCount, sequenceIndex;
1401 	IDnum index;
1402 	ReadSet *reads;
1403 	short int temp_short;
1404 
1405 	if (file != NULL)
1406 		printf("Reading read set file %s;\n", filename);
1407 	else
1408 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1409 
1410 	reads = newReadSet();
1411 
1412 	// Count number of separate sequences
1413 	sequenceCount = 0;
1414 	while (fgets(line, maxline, file) != NULL)
1415 		if (line[0] == '>')
1416 			sequenceCount++;
1417 	fclose(file);
1418 	printf("%d sequences found\n", sequenceCount);
1419 
1420 	reads->readCount = sequenceCount;
1421 
1422 	if (reads->readCount == 0) {
1423 		reads->sequences = NULL;
1424 		reads->categories = NULL;
1425 		return reads;
1426 	}
1427 
1428 	reads->sequences = callocOrExit(sequenceCount, char *);
1429 	reads->categories = callocOrExit(sequenceCount, Category);
1430 	// Counting base pair length of each sequence:
1431 	file = fopen(filename, "r");
1432 	sequenceIndex = -1;
1433 	while (fgets(line, maxline, file) != NULL) {
1434 		if (line[0] == '>') {
1435 
1436 			// Reading category info
1437 			sscanf(line, "%*[^\t]\t%*[^\t]\t%hd",
1438 			       &temp_short);
1439 			reads->categories[sequenceIndex + 1] = (Category) temp_short;
1440 
1441 			if (sequenceIndex != -1)
1442 				reads->sequences[sequenceIndex] =
1443 				    mallocOrExit(bpCount + 1, char);
1444 			sequenceIndex++;
1445 			bpCount = 0;
1446 		} else {
1447 			bpCount += (Coordinate) strlen(line) - 1;
1448 		}
1449 	}
1450 
1451 	//printf("Sequence %d has length %d\n", sequenceIndex, bpCount);
1452 	reads->sequences[sequenceIndex] =
1453 	    mallocOrExit(bpCount + 1, char);
1454 	fclose(file);
1455 
1456 	// Reopen file and memorize line:
1457 	file = fopen(filename, "r");
1458 	sequenceIndex = -1;
1459 	while (fgets(line, maxline, file)) {
1460 		if (line[0] == '>') {
1461 			if (sequenceIndex != -1) {
1462 				sequence[bpCount] = '\0';
1463 			}
1464 			sequenceIndex++;
1465 			bpCount = 0;
1466 			//printf("Starting to read sequence %d\n",
1467 			//       sequenceIndex);
1468 			sequence = reads->sequences[sequenceIndex];
1469 		} else {
1470 			for (index = 0; index < (Coordinate) strlen(line) - 1;
1471 			     index++)
1472 				sequence[bpCount + index] = line[index];
1473 			bpCount += (Coordinate) (strlen(line) - 1);
1474 		}
1475 	}
1476 
1477 	sequence[bpCount] = '\0';
1478 	fclose(file);
1479 
1480 	puts("Done");
1481 	return reads;
1482 
1483 }
1484 
logInstructions(int argc,char ** argv,char * directory)1485 void logInstructions(int argc, char **argv, char *directory)
1486 {
1487 	int index;
1488 	char *logFilename =
1489 	    mallocOrExit(strlen(directory) + 100, char);
1490 	FILE *logFile;
1491 	time_t date;
1492 	char *string;
1493 
1494 	time(&date);
1495 	string = ctime(&date);
1496 
1497 	strcpy(logFilename, directory);
1498 	strcat(logFilename, "/Log");
1499 	logFile = fopen(logFilename, "a");
1500 
1501 	if (logFile == NULL)
1502 		exitErrorf(EXIT_FAILURE, true, "Could not write to %s", logFilename);
1503 
1504 	fprintf(logFile, "%s", string);
1505 
1506 	for (index = 0; index < argc; index++)
1507 		fprintf(logFile, " %s", argv[index]);
1508 
1509 	fprintf(logFile, "\n");
1510 
1511 	fclose(logFile);
1512 	free(logFilename);
1513 }
1514 
destroyReadSet(ReadSet * reads)1515 void destroyReadSet(ReadSet * reads)
1516 {
1517 	IDnum index;
1518 
1519 	if (reads == NULL)
1520 		return;
1521 
1522 	if (reads->sequences != NULL)
1523 		for (index = 0; index < reads->readCount; index++)
1524 			free(reads->sequences[index]);
1525 
1526 	if (reads->tSequences != NULL)
1527 		for (index = 0; index < reads->readCount; index++)
1528 			destroyTightString(reads->tSequences[index]);
1529 
1530 	if (reads->labels != NULL)
1531 		for (index = 0; index < reads->readCount; index++)
1532 			free(reads->labels[index]);
1533 
1534 	if (reads->confidenceScores != NULL)
1535 		for (index = 0; index < reads->readCount; index++)
1536 			free(reads->confidenceScores[index]);
1537 
1538 	if (reads->kmerProbabilities != NULL)
1539 		for (index = 0; index < reads->readCount; index++)
1540 			free(reads->kmerProbabilities[index]);
1541 
1542 	free(reads->sequences);
1543 	free(reads->tSequences);
1544 	free(reads->labels);
1545 	free(reads->confidenceScores);
1546 	free(reads->kmerProbabilities);
1547 	free(reads->mateReads);
1548 	free(reads->categories);
1549 	free(reads);
1550 }
1551 
getSequenceLengths(ReadSet * reads,int wordLength)1552 Coordinate *getSequenceLengths(ReadSet * reads, int wordLength)
1553 {
1554 	Coordinate *lengths = callocOrExit(reads->readCount, Coordinate);
1555 	IDnum index;
1556 	int lengthOffset = wordLength - 1;
1557 
1558 	for (index = 0; index < reads->readCount; index++)
1559 		lengths[index] =
1560 		    getLength(reads->tSequences[index]) - lengthOffset;
1561 
1562 	return lengths;
1563 }
1564 
getSequenceLengthsFromFile(char * filename,int wordLength)1565 Coordinate *getSequenceLengthsFromFile(char *filename, int wordLength)
1566 {
1567 	Coordinate *lengths;
1568 	FILE *file = fopen(filename, "r");
1569 	Coordinate bpCount = 0;
1570 	const int maxline = 100;
1571 	char line[100];
1572 	IDnum sequenceCount, sequenceIndex;
1573 	int lengthOffset = wordLength - 1;
1574 
1575 	if (file != NULL)
1576 		printf("Reading read set file %s;\n", filename);
1577 	else
1578 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1579 
1580 	// Count number of separate sequences
1581 	sequenceCount = 0;
1582 	while (fgets(line, maxline, file) != NULL)
1583 		if (line[0] == '>')
1584 			sequenceCount++;
1585 	fclose(file);
1586 
1587 	lengths = callocOrExit(sequenceCount, Coordinate);
1588 	// Counting base pair length of each sequence:
1589 	file = fopen(filename, "r");
1590 	sequenceIndex = -1;
1591 	while (fgets(line, maxline, file) != NULL) {
1592 		if (line[0] == '>') {
1593 			if (sequenceIndex != -1)
1594 				lengths[sequenceIndex] =
1595 				    bpCount - lengthOffset;
1596 			sequenceIndex++;
1597 			bpCount = 0;
1598 		} else {
1599 			bpCount += (Coordinate) strlen(line) - 1;
1600 		}
1601 	}
1602 	lengths[sequenceIndex] =  bpCount - lengthOffset;
1603 	fclose(file);
1604 
1605 	return lengths;
1606 }
1607