1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
4 This file is part of Velvet.
5
6 Velvet is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 Velvet is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Velvet; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <string.h>
24 #include <math.h>
25 #include <time.h>
26
27 #include "globals.h"
28 #include "tightString.h"
29 #include "readSet.h"
30 #include "utility.h"
31
32 #if defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
33 #include "../../third-party/zlib-1.2.3/Win32/include/zlib.h"
34 #else
35 #include "../../third-party/zlib-1.2.3/zlib.h"
36 #endif
37
38 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
39 # include <fcntl.h>
40 # include <io.h>
41 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
42 #else
43 # define SET_BINARY_MODE(file)
44 #endif
45
newReadSet()46 ReadSet *newReadSet()
47 {
48 ReadSet *rs = callocOrExit(1, ReadSet);
49 return rs;
50 }
51
velvetifySequence(char * str)52 static void velvetifySequence(char * str) {
53 int i = strlen(str) - 1;
54 char c;
55
56 for (i = strlen(str) - 1; i >= 0; i--) {
57 c = str[i];
58 switch (c) {
59 case '\n':
60 case '\r':
61 case EOF:
62 str[i] = '\0';
63 break;
64 case 'C':
65 case 'c':
66 str[i] = 'C';
67 break;
68 case 'G':
69 case 'g':
70 str[i] = 'G';
71 break;
72 case 'T':
73 case 't':
74 str[i] = 'T';
75 break;
76 default:
77 str[i] = 'A';
78 }
79 }
80 }
81
reverseComplementSequence(char * str)82 static void reverseComplementSequence(char * str)
83 {
84 size_t length = strlen(str);
85 size_t i;
86
87 for (i = 0; i < length-1 - i; i++) {
88 char c = str[i];
89 str[i] = str[length-1 - i];
90 str[length-1 - i] = c;
91 }
92
93 for (i = 0; i < length; i++) {
94 switch (str[i]) {
95 case 'A':
96 case 'a':
97 str[i] = 'T';
98 break;
99 case 'C':
100 case 'c':
101 str[i] = 'G';
102 break;
103 case 'G':
104 case 'g':
105 str[i] = 'C';
106 break;
107 // As in velvetifySequence(), anything unusual ends up as 'A'
108 default:
109 str[i] = 'A';
110 break;
111 }
112 }
113 }
114
writeFastaSequence(FILE * outfile,const char * str)115 static void writeFastaSequence(FILE * outfile, const char * str)
116 {
117 size_t length = strlen(str);
118 size_t start;
119 for (start = 0; start < length; start += 60)
120 fprintf(outfile, "%.60s\n", &str[start]);
121 }
122
newReadSetAroundTightStringArray(TightString ** array,IDnum length)123 ReadSet *newReadSetAroundTightStringArray(TightString ** array,
124 IDnum length)
125 {
126 ReadSet *rs = newReadSet();
127 rs->tSequences = array;
128 rs->readCount = length;
129 return rs;
130 }
131
concatenateReadSets(ReadSet * A,ReadSet * B)132 void concatenateReadSets(ReadSet * A, ReadSet * B)
133 {
134 ReadSet tmp;
135 IDnum index;
136
137 // Read count:
138 tmp.readCount = A->readCount + B->readCount;
139
140 // Sequences
141 if (A->sequences != NULL || B->sequences != NULL) {
142 tmp.sequences = mallocOrExit(tmp.readCount, char *);
143 if (A->sequences != NULL) {
144 for (index = 0; index < A->readCount; index++)
145 tmp.sequences[index] = A->sequences[index];
146 free(A->sequences);
147 } else
148 for (index = 0; index < A->readCount; index++)
149 tmp.sequences[index] = NULL;
150
151 if (B->sequences != NULL) {
152 for (index = 0; index < B->readCount; index++)
153 tmp.sequences[A->readCount + index] =
154 B->sequences[index];
155 free(B->sequences);
156 } else
157 for (index = 0; index < B->readCount; index++)
158 tmp.sequences[A->readCount + index] = NULL;
159 } else
160 tmp.sequences = NULL;
161
162 // tSequences
163 if (A->tSequences != NULL || B->tSequences != NULL) {
164 tmp.tSequences =
165 mallocOrExit(tmp.readCount, TightString *);
166
167 if (A->tSequences != NULL) {
168 for (index = 0; index < A->readCount; index++)
169 tmp.tSequences[index] =
170 A->tSequences[index];
171 free(A->tSequences);
172 } else
173 for (index = 0; index < A->readCount; index++)
174 tmp.tSequences[index] = NULL;
175
176 if (B->tSequences != NULL) {
177 for (index = 0; index < B->readCount; index++)
178 tmp.tSequences[A->readCount + index] =
179 B->tSequences[index];
180 free(B->tSequences);
181 } else
182 for (index = 0; index < B->readCount; index++)
183 tmp.tSequences[A->readCount + index] =
184 NULL;
185 } else
186 tmp.tSequences = NULL;
187
188 // Labels
189 if (A->labels != NULL || B->labels != NULL) {
190 tmp.labels = mallocOrExit(tmp.readCount, char *);
191
192 if (A->labels != NULL) {
193 for (index = 0; index < A->readCount; index++)
194 tmp.labels[index] = A->labels[index];
195 free(A->labels);
196 } else
197 for (index = 0; index < A->readCount; index++)
198 tmp.labels[index] = NULL;
199
200 if (B->labels != NULL) {
201 for (index = 0; index < B->readCount; index++)
202 tmp.labels[A->readCount + index] =
203 B->labels[index];
204 free(B->labels);
205 } else
206 for (index = 0; index < B->readCount; index++)
207 tmp.labels[A->readCount + index] = NULL;
208 } else
209 tmp.labels = NULL;
210
211
212 // Confidence scores
213 if (A->confidenceScores != NULL || B->confidenceScores != NULL) {
214 tmp.confidenceScores =
215 mallocOrExit(tmp.readCount, Quality *);
216
217 if (A->confidenceScores != NULL) {
218 for (index = 0; index < A->readCount; index++)
219 tmp.confidenceScores[index] =
220 A->confidenceScores[index];
221 free(A->confidenceScores);
222 } else
223 for (index = 0; index < A->readCount; index++)
224 tmp.confidenceScores[index] = NULL;
225
226 if (B->confidenceScores != NULL) {
227 for (index = 0; index < B->readCount; index++)
228 tmp.confidenceScores[A->readCount +
229 index] =
230 B->confidenceScores[index];
231 free(B->confidenceScores);
232 } else
233 for (index = 0; index < B->readCount; index++)
234 tmp.confidenceScores[A->readCount +
235 index] = NULL;
236 } else
237 tmp.confidenceScores = NULL;
238
239 // Kmer probabilities
240 if (A->kmerProbabilities != NULL || B->kmerProbabilities != NULL) {
241 tmp.kmerProbabilities =
242 mallocOrExit(tmp.readCount, Quality *);
243
244 if (A->kmerProbabilities != NULL) {
245 for (index = 0; index < A->readCount; index++)
246 tmp.kmerProbabilities[index] =
247 A->kmerProbabilities[index];
248 free(A->kmerProbabilities);
249 } else
250 for (index = 0; index < A->readCount; index++)
251 tmp.kmerProbabilities[index] = NULL;
252
253 if (B->kmerProbabilities != NULL) {
254 for (index = 0; index < B->readCount; index++)
255 tmp.kmerProbabilities[A->readCount +
256 index] =
257 B->kmerProbabilities[index];
258 free(B->kmerProbabilities);
259 } else
260 for (index = 0; index < B->readCount; index++)
261 tmp.kmerProbabilities[A->readCount +
262 index] = NULL;
263 } else
264 tmp.kmerProbabilities = NULL;
265
266 // Mate reads
267 if (A->mateReads != NULL || B->mateReads != NULL) {
268 tmp.mateReads = mallocOrExit(tmp.readCount, IDnum);
269
270 if (A->mateReads != NULL) {
271 for (index = 0; index < A->readCount; index++)
272 tmp.mateReads[index] = A->mateReads[index];
273 free(A->mateReads);
274 } else
275 for (index = 0; index < A->readCount; index++)
276 tmp.mateReads[index] = 0;
277
278 if (B->mateReads != NULL) {
279 for (index = 0; index < B->readCount; index++)
280 tmp.mateReads[A->readCount + index] =
281 B->mateReads[index];
282 free(B->mateReads);
283 } else
284 for (index = 0; index < B->readCount; index++)
285 tmp.mateReads[A->readCount + index] = 0;
286 } else
287 tmp.mateReads = NULL;
288
289 // Categories
290 if (A->categories != NULL || B->categories != NULL) {
291 tmp.categories = mallocOrExit(tmp.readCount, Quality *);
292
293 if (A->categories != NULL) {
294 for (index = 0; index < A->readCount; index++)
295 tmp.categories[index] =
296 A->categories[index];
297 free(A->categories);
298 } else
299 for (index = 0; index < A->readCount; index++)
300 tmp.categories[index] = CATEGORIES;
301
302 if (B->categories != NULL) {
303 for (index = 0; index < B->readCount; index++)
304 tmp.categories[A->readCount + index] =
305 B->categories[index];
306 free(B->categories);
307 } else
308 for (index = 0; index < B->readCount; index++)
309 tmp.categories[A->readCount + index] =
310 CATEGORIES;
311 } else
312 tmp.categories = NULL;
313
314 // Put everything back into A
315 A->readCount = tmp.readCount;
316 A->sequences = tmp.sequences;
317 A->tSequences = tmp.tSequences;
318 A->labels = tmp.labels;
319 A->confidenceScores = tmp.confidenceScores;
320 A->kmerProbabilities = tmp.kmerProbabilities;
321 A->mateReads = tmp.mateReads;
322 A->categories = tmp.categories;
323
324 // Deallocate
325 free(B);
326 }
327
convertSequences(ReadSet * rs)328 void convertSequences(ReadSet * rs)
329 {
330 rs->tSequences =
331 newTightStringArrayFromStringArray(rs->sequences,
332 rs->readCount);
333 rs->sequences = NULL;
334 }
335
convertQualityScore(Quality score)336 static Probability convertQualityScore(Quality score)
337 {
338 return (Probability) 1 - pow(10, -score / ((double) 10));
339 }
340
convertConfidenceScores(ReadSet * rs,int WORDLENGTH)341 void convertConfidenceScores(ReadSet * rs, int WORDLENGTH)
342 {
343 Quality *baseCallerScores;
344 Probability *kmerProbabilities;
345 IDnum index;
346 Coordinate position;
347 Probability proba;
348
349 rs->kmerProbabilities =
350 mallocOrExit(rs->readCount, Probability *);
351
352 for (index = 0; index < rs->readCount; index++) {
353 rs->kmerProbabilities[index] =
354 mallocOrExit(getLength(rs->tSequences[index]) - WORDLENGTH +
355 1, Probability);
356 kmerProbabilities = rs->kmerProbabilities[index];
357 baseCallerScores = rs->confidenceScores[index];
358
359 proba = 1;
360 for (position = 0;
361 position < getLength(rs->tSequences[index]);
362 position++) {
363 proba *=
364 convertQualityScore(baseCallerScores
365 [position]);
366 if (position < WORDLENGTH)
367 continue;
368
369 proba /=
370 convertQualityScore(baseCallerScores
371 [position - WORDLENGTH]);
372 kmerProbabilities[position - WORDLENGTH + 1] =
373 proba;
374 }
375
376 rs->confidenceScores[index] = NULL;
377 free(baseCallerScores);
378 }
379
380 free(rs->confidenceScores);
381 rs->confidenceScores = NULL;
382 }
383
categorizeReads(ReadSet * readSet,Category category)384 void categorizeReads(ReadSet * readSet, Category category)
385 {
386 IDnum index;
387
388 if (readSet->categories == NULL)
389 readSet->categories =
390 mallocOrExit(readSet->readCount, Category);
391
392 for (index = 0; index < readSet->readCount; index++)
393 readSet->categories[index] = category;
394 }
395
simplifyReads(ReadSet * readSet)396 void simplifyReads(ReadSet * readSet)
397 {
398 IDnum index;
399
400 if (readSet->categories == NULL)
401 readSet->categories =
402 mallocOrExit(readSet->readCount, Category);
403
404 for (index = 0; index < readSet->readCount; index++) {
405 if (readSet->categories[index] < CATEGORIES) {
406 readSet->categories[index] /= 2;
407 readSet->categories[index] *= 2;
408 }
409 }
410 }
411
exportIDMapping(char * filename,ReadSet * reads)412 void exportIDMapping(char *filename, ReadSet * reads)
413 {
414 IDnum index;
415 FILE *outfile = fopen(filename, "w");
416
417 if (outfile == NULL) {
418 printf("Couldn't open %s, sorry\n", filename);
419 return;
420 } else
421 puts("Writing into file...");
422
423 if (reads->labels == NULL) {
424 fclose(outfile);
425 return;
426 }
427
428 for (index = 0; index < reads->readCount; index++)
429 if (reads->labels != NULL)
430 fprintf(outfile, "s/SEQUENCE %ld/%s/\n", (long) (index + 1),
431 reads->labels[index]);
432
433 fclose(outfile);
434
435 }
436
437 // Returns the value of a 32-bit little-endian-stored integer.
int32(const unsigned char * ptr)438 static int int32(const unsigned char * ptr)
439 {
440 int x = ptr[3];
441 x = (x << 8) | ptr[2];
442 x = (x << 8) | ptr[1];
443 x = (x << 8) | ptr[0];
444 return x;
445 }
446
447 // Imports sequences from a fastq file
448 // Memory space allocated within this function.
readSolexaFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)449 static void readSolexaFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
450 {
451 FILE *file = fopen(filename, "r");
452 IDnum counter = 0;
453 const int maxline = 500;
454 char line[500];
455 char readName[500];
456 char readSeq[500];
457 char str[100];
458 Coordinate start;
459
460 if (strcmp(filename, "-"))
461 file = fopen(filename, "r");
462 else
463 file = stdin;
464
465 if (file != NULL)
466 printf("Reading Solexa file %s\n", filename);
467 else
468 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
469
470 while (fgets(line, maxline, file) != NULL)
471 if (strchr(line, '.') == NULL) {
472 sscanf(line, "%s\t%*i\t%*i\t%*i\t%*c%[^\n]",
473 readName, readSeq);
474 fprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
475 velvetifySequence(readSeq);
476 start = 0;
477 while (start <= strlen(readSeq)) {
478 strncpy(str, readSeq + start, 60);
479 str[60] = '\0';
480 fprintf(outfile, "%s\n", str);
481 start += 60;
482 }
483
484 counter++;
485 }
486
487 fclose(file);
488
489 printf("%d sequences found\n", counter);
490 puts("Done");
491 }
492
readElandFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)493 static void readElandFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
494 {
495 FILE *file = fopen(filename, "r");
496 IDnum counter = 0;
497 const int maxline = 5000;
498 char line[5000];
499 char readName[5000];
500 char readSeq[5000];
501 char str[100];
502 Coordinate start;
503
504 if (strcmp(filename, "-"))
505 file = fopen(filename, "r");
506 else
507 file = stdin;
508
509 if (file != NULL)
510 printf("Reading Solexa file %s\n", filename);
511 else
512 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
513
514 // Reopen file and memorize line:
515 while (fgets(line, maxline, file) != NULL) {
516 sscanf(line, "%[^\t]\t%[^\t\n]",
517 readName, readSeq);
518 fprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
519 velvetifySequence(readSeq);
520 start = 0;
521 while (start <= strlen(readSeq)) {
522 strncpy(str, readSeq + start, 60);
523 str[60] = '\0';
524 fprintf(outfile, "%s\n", str);
525 start += 60;
526 }
527
528 counter++;
529 }
530
531 fclose(file);
532
533 printf("%d sequences found\n", counter);
534 puts("Done");
535 }
536
goToEndOfLine(char * line,FILE * file)537 void goToEndOfLine(char *line, FILE * file)
538 {
539 size_t length = strlen(line);
540 char c = line[length - 1];
541
542 while (c != '\n')
543 c = fgetc(file);
544 }
545
546 // Imports sequences from a fastq file
547 // Memory space allocated within this function.
readFastQFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)548 static void readFastQFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
549 {
550 FILE *file;
551 const int maxline = 5000;
552 char line[5000];
553 char str[100];
554 IDnum counter = 0;
555 Coordinate start, i;
556 char c;
557
558 if (strcmp(filename, "-"))
559 file = fopen(filename, "r");
560 else
561 file = stdin;
562
563 if (file != NULL)
564 printf("Reading FastQ file %s\n", filename);
565 else
566 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
567
568 // Checking if FastQ
569 c = getc(file);
570 if (c != '@')
571 exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
572 ungetc(c, file);
573
574 while(fgets(line, maxline, file)) {
575
576 for (i = strlen(line) - 1;
577 i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
578 line[i] = '\0';
579 }
580
581 fprintf(outfile,">%s\t%ld\t%d\n", line + 1, (long) ((*sequenceIndex)++), (int) cat);
582 counter++;
583
584 if(!fgets(line, maxline, file))
585 exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
586
587 velvetifySequence(line);
588
589 start = 0;
590 while (start <= strlen(line)) {
591 strncpy(str, line + start, 60);
592 str[60] = '\0';
593 fprintf(outfile, "%s\n", str);
594 start += 60;
595 }
596
597 if(!fgets(line, maxline, file))
598 exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
599 if(!fgets(line, maxline, file))
600 exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
601 }
602
603 fclose(file);
604 printf("%d reads found.\n", counter);
605 puts("Done");
606 }
607
608 // Imports sequences from a zipped rfastq file
609 // Memory space allocated within this function.
readFastQGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)610 static void readFastQGZFile(FILE * outfile, char *filename, Category cat, IDnum *sequenceIndex)
611 {
612 gzFile file;
613 const int maxline = 5000;
614 char line[5000];
615 char str[100];
616 IDnum counter = 0;
617 Coordinate start, i;
618 char c;
619
620 if (strcmp(filename, "-"))
621 file = gzopen(filename, "rb");
622 else {
623 file = gzdopen(fileno(stdin), "rb");
624 SET_BINARY_MODE(stdin);
625 }
626
627 if (file != NULL)
628 printf("Reading FastQ file %s\n", filename);
629 else
630 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
631
632 // Checking if FastQ
633 c = gzgetc(file);
634 if (c != '@')
635 exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
636 gzungetc(c, file);
637
638 while (gzgets(file, line, maxline)) {
639 for (i = strlen(line) - 1;
640 i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
641 line[i] = '\0';
642 }
643
644 fprintf(outfile,">%s\t%ld\t%d\n", line + 1, (long) ((*sequenceIndex)++), (int) cat);
645 counter++;
646
647 gzgets(file, line, maxline);
648
649 velvetifySequence(line);
650
651 start = 0;
652 while (start <= strlen(line)) {
653 strncpy(str, line + start, 60);
654 str[60] = '\0';
655 fprintf(outfile, "%s\n", str);
656 start += 60;
657 }
658
659 gzgets(file, line, maxline);
660 gzgets(file, line, maxline);
661 }
662
663 gzclose(file);
664 printf("%d reads found.\n", counter);
665 puts("Done");
666 }
667
668 // Imports sequences from a fasta file
669 // Memory is allocated within the function
readFastAFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)670 static void readFastAFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
671 {
672 FILE *file;
673 const int maxline = 5000;
674 char line[5000];
675 char str[100];
676 IDnum counter = 0;
677 Coordinate i;
678 char c;
679 Coordinate start;
680 int offset = 0;
681
682 if (strcmp(filename, "-"))
683 file = fopen(filename, "r");
684 else
685 file = stdin;
686
687 if (file != NULL)
688 printf("Reading FastA file %s;\n", filename);
689 else
690 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
691
692 // Checking if FastA
693 c = getc(file);
694 if (c != '>')
695 exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
696 ungetc(c, file);
697
698 while (fgets(line, maxline, file)) {
699 if (line[0] == '>') {
700 if (strchr(line,'\t')) {
701 printf("FastA headers in %s contain tabs, please remove them.\n", filename);
702 printf("E.g.: %s", line);
703 puts("Exiting");
704 exit(1);
705 }
706
707 if (offset != 0) {
708 fprintf(outfile, "\n");
709 offset = 0;
710 }
711
712 for (i = strlen(line) - 1;
713 i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
714 line[i] = '\0';
715 }
716
717 fprintf(outfile,"%s\t%ld\t%d\n", line, (long) ((*sequenceIndex)++), (int) cat);
718 counter++;
719 } else {
720 velvetifySequence(line);
721 start = 0;
722 while (start < strlen(line)) {
723 strncpy(str, line + start, 60 - offset);
724 str[60 - offset] = '\0';
725 fprintf(outfile, "%s", str);
726 offset += strlen(str);
727 if (offset >= 60) {
728 fprintf(outfile, "\n");
729 offset = 0;
730 }
731 start += strlen(str);
732 }
733 }
734 }
735
736 if (offset != 0)
737 fprintf(outfile, "\n");
738 fclose(file);
739
740 printf("%d sequences found\n", counter);
741 puts("Done");
742 }
743
744 // Imports sequences from a zipped fasta file
745 // Memory is allocated within the function
readFastAGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)746 static void readFastAGZFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
747 {
748 gzFile file;
749 const int maxline = 5000;
750 char line[5000];
751 char str[100];
752 IDnum counter = 0;
753 Coordinate i, start;
754 char c;
755 int offset = 0;
756
757 if (strcmp(filename, "-"))
758 file = gzopen(filename, "rb");
759 else {
760 file = gzdopen(fileno(stdin), "rb");
761 SET_BINARY_MODE(stdin);
762 }
763
764 if (file != NULL)
765 printf("Reading zipped FastA file %s;\n", filename);
766 else
767 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
768
769 // Checking if FastA
770 c = gzgetc(file);
771 if (c != '>')
772 exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
773 gzungetc(c, file);
774
775 while (gzgets(file, line, maxline)) {
776 if (line[0] == '>') {
777 if (offset != 0) {
778 fprintf(outfile, "\n");
779 offset = 0;
780 }
781
782 for (i = strlen(line) - 1;
783 i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
784 line[i] = '\0';
785 }
786
787 fprintf(outfile, "%s\t%ld\t%d\n", line, (long) ((*sequenceIndex)++), (int) cat);
788 counter++;
789 } else {
790 velvetifySequence(line);
791
792 start = 0;
793 while (start < strlen(line)) {
794 strncpy(str, line + start, 60 - offset);
795 str[60 - offset] = '\0';
796 fprintf(outfile, "%s", str);
797 offset += strlen(str);
798 if (offset >= 60) {
799 fprintf(outfile, "\n");
800 offset = 0;
801 }
802 start += strlen(str);
803 }
804 }
805 }
806
807 if (offset != 0)
808 fprintf(outfile, "\n");
809 gzclose(file);
810
811 printf("%d sequences found\n", counter);
812 puts("Done");
813 }
814
815 // Parser for new output
readMAQGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)816 static void readMAQGZFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
817 {
818 gzFile file;
819 const int maxline = 1000;
820 char line[1000];
821 IDnum counter = 0;
822 char readName[500];
823 char readSeq[500];
824 char str[100];
825 Coordinate start;
826
827 if (strcmp(filename, "-"))
828 file = gzopen(filename, "rb");
829 else {
830 file = gzdopen(fileno(stdin), "rb");
831 SET_BINARY_MODE(stdin);
832 }
833
834 if (file != NULL)
835 printf("Reading zipped MAQ file %s\n", filename);
836 else
837 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
838
839 // Reopen file and memorize line:
840 while (gzgets(file, line, maxline)) {
841 sscanf(line, "%s\t%*i\t%*i\t%*c\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%[^\t]",
842 readName, readSeq);
843 fprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
844 velvetifySequence(readSeq);
845 start = 0;
846 while (start <= strlen(readSeq)) {
847 strncpy(str, readSeq + start, 60);
848 str[60] = '\0';
849 fprintf(outfile, "%s\n", str);
850 start += 60;
851 }
852
853 counter++;
854 }
855
856 gzclose(file);
857
858 printf("%d sequences found\n", counter);
859 puts("Done");
860 }
861
readSAMFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)862 static void readSAMFile(FILE *outfile, char *filename, Category cat, IDnum *sequenceIndex)
863 {
864 char line[5000];
865 unsigned long lineno, readCount;
866 char previous_qname_pairing[10];
867 char previous_qname[5000];
868 char previous_seq[5000];
869 boolean previous_paired = false;
870
871 FILE *file = (strcmp(filename, "-") != 0)? fopen(filename, "r") : stdin;
872 if (file)
873 printf("Reading SAM file %s\n", filename);
874 else
875 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
876
877 readCount = 0;
878 for (lineno = 1; fgets(line, sizeof(line), file); lineno++)
879 if (line[0] != '@') {
880 char *qname, *flag, *seq;
881 int i;
882
883 qname = strtok(line, "\t");
884 flag = strtok(NULL, "\t");
885 for (i = 3; i < 10; i++)
886 (void) strtok(NULL, "\t");
887 seq = strtok(NULL, "\t");
888
889 if (seq == NULL) {
890 fprintf(stderr,
891 "Line #%lu: ignoring SAM record with too few fields\n",
892 lineno);
893 }
894 else if (strcmp(seq, "*") == 0) {
895 fprintf(stderr,
896 "Line #%lu: ignoring SAM record with omitted SEQ field\n",
897 lineno);
898 }
899 else {
900 // Accept flags represented in either decimal or hex:
901 int flagbits = strtol(flag, NULL, 0);
902
903 const char *qname_pairing = "";
904 if (flagbits & 0x40)
905 qname_pairing = "/1";
906 else if (flagbits & 0x80)
907 qname_pairing = "/2";
908
909 if (flagbits & 0x10)
910 reverseComplementSequence(seq);
911
912 // Determine if paired to previous read
913 if (readCount > 0) {
914 if (cat % 2) {
915 if (previous_paired) {
916 // Last read paired to penultimate read
917 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
918 (long) ((*sequenceIndex)++), (int) cat);
919 writeFastaSequence(outfile, previous_seq);
920 previous_paired = false;
921 } else if (strcmp(qname, previous_qname) == 0) {
922 // Last read paired to current reads
923 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
924 (long) ((*sequenceIndex)++), (int) cat);
925 writeFastaSequence(outfile, previous_seq);
926 previous_paired = true;
927 } else {
928 // Last read unpaired
929 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
930 (long) ((*sequenceIndex)++), (int) cat - 1);
931 writeFastaSequence(outfile, previous_seq);
932 previous_paired = false;
933 }
934 } else {
935 // Unpaired dataset
936 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
937 (long) ((*sequenceIndex)++), (int) cat);
938 writeFastaSequence(outfile, previous_seq);
939 }
940 }
941
942 strcpy(previous_qname, qname);
943 strcpy(previous_qname_pairing, qname_pairing);
944 strcpy(previous_seq, seq);
945 velvetifySequence(previous_seq);
946
947 readCount++;
948 }
949 }
950
951 if (readCount) {
952 if (cat % 2) {
953 if (previous_paired) {
954 // Last read paired to penultimate read
955 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
956 (long) ((*sequenceIndex)++), (int) cat);
957 writeFastaSequence(outfile, previous_seq);
958 } else {
959 // Last read unpaired
960 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
961 (long) ((*sequenceIndex)++), (int) cat - 1);
962 writeFastaSequence(outfile, previous_seq);
963 }
964 } else {
965 // Unpaired dataset
966 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
967 (long) ((*sequenceIndex)++), (int) cat);
968 writeFastaSequence(outfile, previous_seq);
969 }
970
971 }
972
973 fclose(file);
974 printf("%lu reads found.\nDone\n", readCount);
975 }
976
readBAMint32(gzFile file)977 static int readBAMint32(gzFile file)
978 {
979 unsigned char buffer[4];
980 if (gzread(file, buffer, 4) != 4)
981 exitErrorf(EXIT_FAILURE, false, "BAM file header truncated");
982
983 return int32(buffer);
984 }
985
readBAMFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)986 static void readBAMFile(FILE *outfile, char *filename, Category cat, IDnum *sequenceIndex)
987 {
988 size_t seqCapacity = 0;
989 char *seq = NULL;
990 size_t bufferCapacity = 4;
991 unsigned char *buffer = mallocOrExit(bufferCapacity, unsigned char);
992 unsigned long recno, readCount;
993 int i, refCount;
994 gzFile file;
995 char previous_qname_pairing[10];
996 char previous_qname[5000];
997 char previous_seq[5000];
998 boolean previous_paired = false;
999
1000 if (strcmp(filename, "-") != 0)
1001 file = gzopen(filename, "rb");
1002 else {
1003 file = gzdopen(fileno(stdin), "rb");
1004 SET_BINARY_MODE(stdin);
1005 }
1006
1007 if (file != NULL)
1008 printf("Reading BAM file %s\n", filename);
1009 else
1010 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1011
1012 if (! (gzread(file, buffer, 4) == 4 && memcmp(buffer, "BAM\1", 4) == 0))
1013 exitErrorf(EXIT_FAILURE, false, "%s is not in BAM format", filename);
1014
1015 // Skip header text
1016 if (gzseek(file, readBAMint32(file), SEEK_CUR) == -1)
1017 exitErrorf(EXIT_FAILURE, false, "gzseek failed");
1018
1019 // Skip header reference list
1020 refCount = readBAMint32(file);
1021 for (i = 0; i < refCount; i++) {
1022 if (gzseek(file, readBAMint32(file) + 4, SEEK_CUR) == -1)
1023 exitErrorf(EXIT_FAILURE, false, "gzseek failed");
1024 }
1025
1026 readCount = 0;
1027 for (recno = 1; gzread(file, buffer, 4) == 4; recno++) {
1028 int blockSize = int32(buffer);
1029 int readLength;
1030
1031 if (bufferCapacity < 4 + blockSize) {
1032 bufferCapacity = 4 + blockSize + 4096;
1033 buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1034 }
1035
1036 if (gzread(file, &buffer[4], blockSize) != blockSize)
1037 exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1038
1039 readLength = int32(&buffer[20]);
1040 if (readLength == 0) {
1041 fprintf(stderr,
1042 "Record #%lu: ignoring BAM record with omitted SEQ field\n",
1043 recno);
1044 }
1045 else {
1046 int readNameLength = buffer[12];
1047 int flag_nc = int32(&buffer[16]);
1048 int flagbits = flag_nc >> 16;
1049 int cigarLength = flag_nc & 0xffff;
1050 char *qname = (char *)&buffer[36];
1051 unsigned char *rawseq =
1052 &buffer[36 + readNameLength + 4 * cigarLength];
1053
1054 const char *qname_pairing = "";
1055 if (flagbits & 0x40)
1056 qname_pairing = "/1";
1057 else if (flagbits & 0x80)
1058 qname_pairing = "/2";
1059
1060 if (seqCapacity < readLength + 1) {
1061 seqCapacity = readLength * 2 + 1;
1062 seq = reallocOrExit(seq, seqCapacity, char);
1063 }
1064
1065 for (i = 0; i < readLength; i += 2) {
1066 static const char decode_bases[] = "=ACMGRSVTWYHKDBN";
1067 unsigned int packed = *rawseq++;
1068 seq[i] = decode_bases[packed >> 4];
1069 seq[i+1] = decode_bases[packed & 0xf];
1070 }
1071 seq[readLength] = '\0';
1072
1073 if (flagbits & 0x10)
1074 reverseComplementSequence(seq);
1075
1076 // Determine if paired to previous read
1077 if (readCount > 0) {
1078 if (cat % 2) {
1079 if (previous_paired) {
1080 // Last read paired to penultimate read
1081 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1082 (long) ((*sequenceIndex)++), (int) cat);
1083 writeFastaSequence(outfile, previous_seq);
1084 previous_paired = false;
1085 } else if (strcmp(qname, previous_qname) == 0) {
1086 // Last read paired to current reads
1087 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1088 (long) ((*sequenceIndex)++), (int) cat);
1089 writeFastaSequence(outfile, previous_seq);
1090 previous_paired = true;
1091 } else {
1092 // Last read unpaired
1093 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1094 (long) ((*sequenceIndex)++), (int) cat - 1);
1095 writeFastaSequence(outfile, previous_seq);
1096 previous_paired = false;
1097 }
1098 } else {
1099 // Unpaired dataset
1100 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1101 (long) ((*sequenceIndex)++), (int) cat);
1102 writeFastaSequence(outfile, previous_seq);
1103 }
1104 }
1105
1106 strcpy(previous_qname, qname);
1107 strcpy(previous_qname_pairing, qname_pairing);
1108 strcpy(previous_seq, seq);
1109 velvetifySequence(previous_seq);
1110
1111 readCount++;
1112 }
1113 }
1114
1115 if (readCount) {
1116 if (cat % 2) {
1117 if (previous_paired) {
1118 // Last read paired to penultimate read
1119 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1120 (long) ((*sequenceIndex)++), (int) cat);
1121 writeFastaSequence(outfile, previous_seq);
1122 } else {
1123 // Last read unpaired
1124 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1125 (long) ((*sequenceIndex)++), (int) cat - 1);
1126 writeFastaSequence(outfile, previous_seq);
1127 }
1128 } else {
1129 // Unpaired dataset
1130 fprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1131 (long) ((*sequenceIndex)++), (int) cat);
1132 writeFastaSequence(outfile, previous_seq);
1133 }
1134 }
1135
1136 free(seq);
1137 free(buffer);
1138
1139 gzclose(file);
1140 printf("%lu reads found.\nDone\n", readCount);
1141 }
1142
1143 #define FASTQ 1
1144 #define FASTA 2
1145 #define GERALD 3
1146 #define ELAND 4
1147 #define FASTA_GZ 5
1148 #define FASTQ_GZ 6
1149 #define MAQ_GZ 7
1150 #define SAM 8
1151 #define BAM 9
1152
1153 // General argument parser for most functions
1154 // Basically a reused portion of toplevel code dumped into here
parseDataAndReadFiles(char * filename,int argc,char ** argv,boolean * double_strand)1155 void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * double_strand)
1156 {
1157 int argIndex = 1;
1158 FILE *outfile = fopen(filename, "w");
1159 int filetype = FASTA;
1160 Category cat = 0;
1161 IDnum sequenceIndex = 1;
1162 short short_var;
1163
1164 if (argc < 2) {
1165 puts("Wrong number of arguments!");
1166 puts("Correct usage:");
1167 puts("run -<filetype> <list of files> [...] ");
1168 puts("Allowed filetypes:");
1169 puts("\t-fasta");
1170 puts("\t-fastq");
1171 puts("\t-solexa");
1172 puts("\t-eland");
1173 puts("If reading exclusively fasta file, the -fasta parameter is not necessary");
1174 exit(1);
1175 }
1176
1177 for (argIndex = 1; argIndex < argc; argIndex++) {
1178 if (argv[argIndex][0] == '-' && strlen(argv[argIndex]) > 1) {
1179
1180 if (strcmp(argv[argIndex], "-fastq") == 0)
1181 filetype = FASTQ;
1182 else if (strcmp(argv[argIndex], "-fasta") == 0)
1183 filetype = FASTA;
1184 else if (strcmp(argv[argIndex], "-gerald") == 0)
1185 filetype = GERALD;
1186 else if (strcmp(argv[argIndex], "-eland") == 0)
1187 filetype = ELAND;
1188 else if (strcmp(argv[argIndex], "-fastq.gz") == 0)
1189 filetype = FASTQ_GZ;
1190 else if (strcmp(argv[argIndex], "-fasta.gz") == 0)
1191 filetype = FASTA_GZ;
1192 else if (strcmp(argv[argIndex], "-sam") == 0)
1193 filetype = SAM;
1194 else if (strcmp(argv[argIndex], "-bam") == 0)
1195 filetype = BAM;
1196 else if (strcmp(argv[argIndex], "-maq.gz") == 0)
1197 filetype = MAQ_GZ;
1198 else if (strcmp(argv[argIndex], "-short") == 0)
1199 cat = 0;
1200 else if (strcmp(argv[argIndex], "-shortPaired") ==
1201 0)
1202 cat = 1;
1203 else if (strncmp
1204 (argv[argIndex], "-shortPaired",
1205 12) == 0) {
1206 sscanf(argv[argIndex], "-shortPaired%hd", &short_var);
1207 cat = (Category) short_var;
1208 if (cat < 1 || cat > CATEGORIES) {
1209 printf("Unknown option: %s\n",
1210 argv[argIndex]);
1211 exit(1);
1212 }
1213 cat--;
1214 cat *= 2;
1215 cat++;
1216 } else if (strncmp(argv[argIndex], "-short", 6) ==
1217 0) {
1218 sscanf(argv[argIndex], "-short%hd", &short_var);
1219 cat = (Category) short_var;
1220 if (cat < 1 || cat > CATEGORIES) {
1221 printf("Unknown option: %s\n",
1222 argv[argIndex]);
1223 exit(1);
1224 }
1225 cat--;
1226 cat *= 2;
1227 } else if (strcmp(argv[argIndex], "-long") == 0)
1228 cat = CATEGORIES * 2;
1229 else if (strcmp(argv[argIndex], "-longPaired") ==
1230 0)
1231 cat = CATEGORIES * 2 + 1;
1232 else if (strcmp(argv[argIndex], "-strand_specific")
1233 == 0)
1234 *double_strand = false;
1235 else {
1236 printf("Unknown option: %s\n",
1237 argv[argIndex]);
1238 exit(1);
1239 }
1240
1241 continue;
1242 }
1243
1244 if (cat == -1)
1245 continue;
1246
1247 switch (filetype) {
1248 case FASTA:
1249 readFastAFile(outfile, argv[argIndex], cat, &sequenceIndex);
1250 break;
1251 case FASTQ:
1252 readFastQFile(outfile, argv[argIndex], cat, &sequenceIndex);
1253 break;
1254 case GERALD:
1255 readSolexaFile(outfile, argv[argIndex], cat, &sequenceIndex);
1256 break;
1257 case ELAND:
1258 readElandFile(outfile, argv[argIndex], cat, &sequenceIndex);
1259 break;
1260 case FASTA_GZ:
1261 readFastAGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1262 break;
1263 case FASTQ_GZ:
1264 readFastQGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1265 break;
1266 case SAM:
1267 readSAMFile(outfile, argv[argIndex], cat, &sequenceIndex);
1268 break;
1269 case BAM:
1270 readBAMFile(outfile, argv[argIndex], cat, &sequenceIndex);
1271 break;
1272 case MAQ_GZ:
1273 readMAQGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1274 break;
1275 default:
1276 puts("Screw up in parser... exiting");
1277 exit(1);
1278 }
1279 }
1280
1281 fclose(outfile);
1282 }
1283
createReadPairingArray(ReadSet * reads)1284 void createReadPairingArray(ReadSet* reads) {
1285 IDnum index;
1286 IDnum *mateReads = mallocOrExit(reads->readCount, IDnum);
1287
1288 for (index = 0; index < reads->readCount; index++)
1289 mateReads[index] = -1;
1290
1291 reads->mateReads = mateReads;
1292 }
1293
pairUpReads(ReadSet * reads,Category cat)1294 boolean pairUpReads(ReadSet * reads, Category cat)
1295 {
1296 int phase = 0;
1297 IDnum index;
1298 boolean found = false;
1299
1300 for (index = 0; index < reads->readCount; index++) {
1301 if (reads->categories[index] != cat) {
1302 if (phase == 1) {
1303 reads->mateReads[index - 1] = -1;
1304 reads->categories[index - 1]--;
1305 phase = 0;
1306 }
1307 } else if (phase == 0) {
1308 found = true;
1309 reads->mateReads[index] = index + 1;
1310 phase = 1;
1311 } else {
1312 found = true;
1313 reads->mateReads[index] = index - 1;
1314 phase = 0;
1315 }
1316 }
1317
1318 return found;
1319 }
1320
detachDubiousReads(ReadSet * reads,boolean * dubiousReads)1321 void detachDubiousReads(ReadSet * reads, boolean * dubiousReads)
1322 {
1323 IDnum index;
1324 IDnum pairID;
1325 IDnum sequenceCount = reads->readCount;
1326 IDnum *mateReads = reads->mateReads;
1327
1328 if (dubiousReads == NULL || mateReads == NULL)
1329 return;
1330
1331 for (index = 0; index < sequenceCount; index++) {
1332 if (!dubiousReads[index])
1333 continue;
1334
1335 pairID = mateReads[index];
1336
1337 if (pairID != -1) {
1338 //printf("Separating %d and %d\n", index, pairID);
1339 mateReads[index] = -1;
1340 mateReads[pairID] = -1;
1341 }
1342 }
1343 }
1344
exportRead(FILE * outfile,ReadSet * reads,IDnum index)1345 static void exportRead(FILE * outfile, ReadSet * reads, IDnum index)
1346 {
1347 Coordinate start, finish;
1348 char str[100];
1349 TightString *sequence = reads->tSequences[index];
1350
1351 if (sequence == NULL)
1352 return;
1353
1354 fprintf(outfile, ">SEQUENCE_%ld_length_%lld", (long) index,
1355 (long long) getLength(sequence));
1356
1357 if (reads->categories != NULL)
1358 fprintf(outfile, "\t%i", (int) reads->categories[index]);
1359
1360 fprintf(outfile, "\n");
1361
1362 start = 0;
1363 while (start <= getLength(sequence)) {
1364 finish = start + 60;
1365 readTightStringFragment(sequence, start, finish, str);
1366 fprintf(outfile, "%s\n", str);
1367 start = finish;
1368 }
1369
1370 fflush(outfile);
1371 }
1372
exportReadSet(char * filename,ReadSet * reads)1373 void exportReadSet(char *filename, ReadSet * reads)
1374 {
1375 IDnum index;
1376 FILE *outfile = fopen(filename, "w+");
1377
1378 if (outfile == NULL) {
1379 puts("Couldn't open file, sorry");
1380 return;
1381 } else
1382 printf("Writing into readset file: %s\n", filename);
1383
1384 for (index = 0; index < reads->readCount; index++) {
1385 exportRead(outfile, reads, index);
1386 }
1387
1388 fclose(outfile);
1389
1390 puts("Done");
1391 }
1392
importReadSet(char * filename)1393 ReadSet *importReadSet(char *filename)
1394 {
1395 FILE *file = fopen(filename, "r");
1396 char *sequence = NULL;
1397 Coordinate bpCount = 0;
1398 const int maxline = 5000;
1399 char line[5000];
1400 IDnum sequenceCount, sequenceIndex;
1401 IDnum index;
1402 ReadSet *reads;
1403 short int temp_short;
1404
1405 if (file != NULL)
1406 printf("Reading read set file %s;\n", filename);
1407 else
1408 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1409
1410 reads = newReadSet();
1411
1412 // Count number of separate sequences
1413 sequenceCount = 0;
1414 while (fgets(line, maxline, file) != NULL)
1415 if (line[0] == '>')
1416 sequenceCount++;
1417 fclose(file);
1418 printf("%d sequences found\n", sequenceCount);
1419
1420 reads->readCount = sequenceCount;
1421
1422 if (reads->readCount == 0) {
1423 reads->sequences = NULL;
1424 reads->categories = NULL;
1425 return reads;
1426 }
1427
1428 reads->sequences = callocOrExit(sequenceCount, char *);
1429 reads->categories = callocOrExit(sequenceCount, Category);
1430 // Counting base pair length of each sequence:
1431 file = fopen(filename, "r");
1432 sequenceIndex = -1;
1433 while (fgets(line, maxline, file) != NULL) {
1434 if (line[0] == '>') {
1435
1436 // Reading category info
1437 sscanf(line, "%*[^\t]\t%*[^\t]\t%hd",
1438 &temp_short);
1439 reads->categories[sequenceIndex + 1] = (Category) temp_short;
1440
1441 if (sequenceIndex != -1)
1442 reads->sequences[sequenceIndex] =
1443 mallocOrExit(bpCount + 1, char);
1444 sequenceIndex++;
1445 bpCount = 0;
1446 } else {
1447 bpCount += (Coordinate) strlen(line) - 1;
1448 }
1449 }
1450
1451 //printf("Sequence %d has length %d\n", sequenceIndex, bpCount);
1452 reads->sequences[sequenceIndex] =
1453 mallocOrExit(bpCount + 1, char);
1454 fclose(file);
1455
1456 // Reopen file and memorize line:
1457 file = fopen(filename, "r");
1458 sequenceIndex = -1;
1459 while (fgets(line, maxline, file)) {
1460 if (line[0] == '>') {
1461 if (sequenceIndex != -1) {
1462 sequence[bpCount] = '\0';
1463 }
1464 sequenceIndex++;
1465 bpCount = 0;
1466 //printf("Starting to read sequence %d\n",
1467 // sequenceIndex);
1468 sequence = reads->sequences[sequenceIndex];
1469 } else {
1470 for (index = 0; index < (Coordinate) strlen(line) - 1;
1471 index++)
1472 sequence[bpCount + index] = line[index];
1473 bpCount += (Coordinate) (strlen(line) - 1);
1474 }
1475 }
1476
1477 sequence[bpCount] = '\0';
1478 fclose(file);
1479
1480 puts("Done");
1481 return reads;
1482
1483 }
1484
logInstructions(int argc,char ** argv,char * directory)1485 void logInstructions(int argc, char **argv, char *directory)
1486 {
1487 int index;
1488 char *logFilename =
1489 mallocOrExit(strlen(directory) + 100, char);
1490 FILE *logFile;
1491 time_t date;
1492 char *string;
1493
1494 time(&date);
1495 string = ctime(&date);
1496
1497 strcpy(logFilename, directory);
1498 strcat(logFilename, "/Log");
1499 logFile = fopen(logFilename, "a");
1500
1501 if (logFile == NULL)
1502 exitErrorf(EXIT_FAILURE, true, "Could not write to %s", logFilename);
1503
1504 fprintf(logFile, "%s", string);
1505
1506 for (index = 0; index < argc; index++)
1507 fprintf(logFile, " %s", argv[index]);
1508
1509 fprintf(logFile, "\n");
1510
1511 fclose(logFile);
1512 free(logFilename);
1513 }
1514
destroyReadSet(ReadSet * reads)1515 void destroyReadSet(ReadSet * reads)
1516 {
1517 IDnum index;
1518
1519 if (reads == NULL)
1520 return;
1521
1522 if (reads->sequences != NULL)
1523 for (index = 0; index < reads->readCount; index++)
1524 free(reads->sequences[index]);
1525
1526 if (reads->tSequences != NULL)
1527 for (index = 0; index < reads->readCount; index++)
1528 destroyTightString(reads->tSequences[index]);
1529
1530 if (reads->labels != NULL)
1531 for (index = 0; index < reads->readCount; index++)
1532 free(reads->labels[index]);
1533
1534 if (reads->confidenceScores != NULL)
1535 for (index = 0; index < reads->readCount; index++)
1536 free(reads->confidenceScores[index]);
1537
1538 if (reads->kmerProbabilities != NULL)
1539 for (index = 0; index < reads->readCount; index++)
1540 free(reads->kmerProbabilities[index]);
1541
1542 free(reads->sequences);
1543 free(reads->tSequences);
1544 free(reads->labels);
1545 free(reads->confidenceScores);
1546 free(reads->kmerProbabilities);
1547 free(reads->mateReads);
1548 free(reads->categories);
1549 free(reads);
1550 }
1551
getSequenceLengths(ReadSet * reads,int wordLength)1552 Coordinate *getSequenceLengths(ReadSet * reads, int wordLength)
1553 {
1554 Coordinate *lengths = callocOrExit(reads->readCount, Coordinate);
1555 IDnum index;
1556 int lengthOffset = wordLength - 1;
1557
1558 for (index = 0; index < reads->readCount; index++)
1559 lengths[index] =
1560 getLength(reads->tSequences[index]) - lengthOffset;
1561
1562 return lengths;
1563 }
1564
getSequenceLengthsFromFile(char * filename,int wordLength)1565 Coordinate *getSequenceLengthsFromFile(char *filename, int wordLength)
1566 {
1567 Coordinate *lengths;
1568 FILE *file = fopen(filename, "r");
1569 Coordinate bpCount = 0;
1570 const int maxline = 100;
1571 char line[100];
1572 IDnum sequenceCount, sequenceIndex;
1573 int lengthOffset = wordLength - 1;
1574
1575 if (file != NULL)
1576 printf("Reading read set file %s;\n", filename);
1577 else
1578 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1579
1580 // Count number of separate sequences
1581 sequenceCount = 0;
1582 while (fgets(line, maxline, file) != NULL)
1583 if (line[0] == '>')
1584 sequenceCount++;
1585 fclose(file);
1586
1587 lengths = callocOrExit(sequenceCount, Coordinate);
1588 // Counting base pair length of each sequence:
1589 file = fopen(filename, "r");
1590 sequenceIndex = -1;
1591 while (fgets(line, maxline, file) != NULL) {
1592 if (line[0] == '>') {
1593 if (sequenceIndex != -1)
1594 lengths[sequenceIndex] =
1595 bpCount - lengthOffset;
1596 sequenceIndex++;
1597 bpCount = 0;
1598 } else {
1599 bpCount += (Coordinate) strlen(line) - 1;
1600 }
1601 }
1602 lengths[sequenceIndex] = bpCount - lengthOffset;
1603 fclose(file);
1604
1605 return lengths;
1606 }
1607