1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
4 This file is part of Velvet.
5
6 Velvet is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 Velvet is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Velvet; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <string.h>
24 #include <math.h>
25 #include <time.h>
26 #include <limits.h>
27 #include <ctype.h>
28
29 #include "globals.h"
30 #include "tightString.h"
31 #include "readSet.h"
32 #include "utility.h"
33
34 #if !defined(BUNDLEDZLIB)
35 #include <zlib.h>
36 #elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
37 #include "../third-party/zlib-1.2.3/Win32/include/zlib.h"
38 #else
39 #include "../third-party/zlib-1.2.3/zlib.h"
40 #endif
41
42 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
43 # include <fcntl.h>
44 # include <io.h>
45 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
46 #else
47 # define SET_BINARY_MODE(file)
48 #endif
49
newReadSet()50 ReadSet *newReadSet()
51 {
52 ReadSet *rs = callocOrExit(1, ReadSet);
53 return rs;
54 }
55
56 //////////////////////////////////////////////////////////////////////////
57 // Reference identifiers
58 //////////////////////////////////////////////////////////////////////////
59
60 typedef struct referenceCoordinate_st ReferenceCoordinate;
61 static Coordinate reference_coordinate_double_strand = true;
62
63 struct referenceCoordinate_st {
64 char * name;
65 Coordinate start;
66 Coordinate finish;
67 IDnum referenceID;
68 boolean positive_strand;
69 } ATTRIBUTE_PACKED;
70
compareRefCoords(const void * ptrA,const void * ptrB)71 static int compareRefCoords(const void * ptrA, const void * ptrB) {
72 ReferenceCoordinate * A = (ReferenceCoordinate *) ptrA;
73 ReferenceCoordinate * B = (ReferenceCoordinate *) ptrB;
74 int comp = strcmp(A->name, B->name);
75
76 if (comp != 0)
77 return comp;
78 else if (!reference_coordinate_double_strand && A->positive_strand != B->positive_strand)
79 return A->positive_strand > B->positive_strand;
80 else {
81 if (A->finish > -1 && A->finish < B->start)
82 return -1;
83 else if (B->finish > -1 && A->start > B->finish)
84 return 1;
85 else return 0;
86 }
87 }
88
89 typedef struct referenceCoordinateTable_st ReferenceCoordinateTable;
90
91 struct referenceCoordinateTable_st {
92 ReferenceCoordinate * array;
93 IDnum arrayLength;
94 } ATTRIBUTE_PACKED;
95
newReferenceCoordinateTable()96 static ReferenceCoordinateTable * newReferenceCoordinateTable() {
97 ReferenceCoordinateTable * table = callocOrExit(1, ReferenceCoordinateTable);
98 table->array = NULL;
99 table->arrayLength = 0;
100 return table;
101 }
102
destroyReferenceCoordinateTable(ReferenceCoordinateTable * table)103 static void destroyReferenceCoordinateTable(ReferenceCoordinateTable * table) {
104 IDnum index;
105
106 if (table->array) {
107 for (index = 0; index < table->arrayLength; index++)
108 free(table->array[index].name);
109 free(table->array);
110 }
111 free(table);
112 }
113
resizeReferenceCoordinateTable(ReferenceCoordinateTable * table,IDnum extraLength)114 static void resizeReferenceCoordinateTable(ReferenceCoordinateTable * table, IDnum extraLength) {
115 if (table->array == NULL)
116 table->array = callocOrExit(extraLength, ReferenceCoordinate);
117 else
118 table->array = reallocOrExit(table->array, table->arrayLength + extraLength, ReferenceCoordinate);
119 }
120
findReferenceCoordinate(ReferenceCoordinateTable * table,char * name,Coordinate start,Coordinate finish,boolean positive_strand)121 static ReferenceCoordinate * findReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
122 ReferenceCoordinate * array = table->array;
123 ReferenceCoordinate refCoord;
124 Coordinate leftIndex = 0;
125 Coordinate rightIndex = table->arrayLength - 1;
126 Coordinate middleIndex;
127
128 refCoord.name = name;
129 refCoord.start = start;
130 refCoord.finish = finish;
131 refCoord.referenceID = 0;
132 refCoord.positive_strand = positive_strand;
133
134 while (true) {
135 middleIndex = (rightIndex + leftIndex) / 2;
136
137 if (leftIndex > rightIndex)
138 return NULL;
139 else if (compareRefCoords(&(array[middleIndex]), &refCoord) == 0)
140 return &(array[middleIndex]);
141 else if (leftIndex == middleIndex)
142 return NULL;
143 else if (compareRefCoords(&(array[middleIndex]), &refCoord) > 0)
144 rightIndex = middleIndex;
145 else
146 leftIndex = middleIndex;
147 }
148 }
149
addReferenceCoordinate(ReferenceCoordinateTable * table,char * name,Coordinate start,Coordinate finish,boolean positive_strand)150 static void addReferenceCoordinate(ReferenceCoordinateTable * table, char * name, Coordinate start, Coordinate finish, boolean positive_strand) {
151 ReferenceCoordinate * refCoord;
152
153 if ((refCoord = findReferenceCoordinate(table, name, start, finish, positive_strand))) {
154 velvetLog("Overlapping reference coordinates:\n");
155 velvetLog("%s:%lli-%lli\n", name, (long long) start, (long long) finish);
156 velvetLog("%s:%lli-%lli\n", refCoord->name, (long long) refCoord->start, (long long) refCoord->finish);
157 velvetLog("Exiting...\n");
158 #ifdef DEBUG
159 abort();
160 #endif
161 exit(1);
162 }
163
164 refCoord = &(table->array[table->arrayLength++]);
165
166 refCoord->name = name;
167 refCoord->start = start;
168 refCoord->finish = finish;
169 refCoord->referenceID = table->arrayLength;
170 refCoord->positive_strand = positive_strand;
171 }
172
sortReferenceCoordinateTable(ReferenceCoordinateTable * table)173 static void sortReferenceCoordinateTable(ReferenceCoordinateTable * table) {
174 qsort(table->array, table->arrayLength, sizeof(ReferenceCoordinate), compareRefCoords);
175 }
176
177 //////////////////////////////////////////////////////////////////////////
178 // File reading
179 //////////////////////////////////////////////////////////////////////////
180
velvetifySequence(char * str)181 static void velvetifySequence(char * str) {
182 int i = strlen(str) - 1;
183 char c;
184
185 for (i = strlen(str) - 1; i >= 0; i--) {
186 c = str[i];
187 switch (c) {
188 case '\n':
189 case '\r':
190 case EOF:
191 str[i] = '\0';
192 break;
193 case 'A':
194 case 'a':
195 str[i] = 'A';
196 break;
197 case 'C':
198 case 'c':
199 str[i] = 'C';
200 break;
201 case 'G':
202 case 'g':
203 str[i] = 'G';
204 break;
205 case 'T':
206 case 't':
207 str[i] = 'T';
208 break;
209 default:
210 str[i] = 'N';
211 }
212 }
213 }
214
reverseComplementSequence(char * str)215 static void reverseComplementSequence(char * str)
216 {
217 size_t length = strlen(str);
218 size_t i;
219
220 for (i = 0; i < length-1 - i; i++) {
221 char c = str[i];
222 str[i] = str[length-1 - i];
223 str[length-1 - i] = c;
224 }
225
226 #ifndef COLOR
227 for (i = 0; i < length; i++) {
228 switch (str[i]) {
229 case 'A':
230 case 'a':
231 str[i] = 'T';
232 break;
233 case 'C':
234 case 'c':
235 str[i] = 'G';
236 break;
237 case 'G':
238 case 'g':
239 str[i] = 'C';
240 break;
241 // As in velvetifySequence(), anything unusual ends up as 'A'
242 default:
243 str[i] = 'A';
244 break;
245 }
246 }
247 #endif
248 }
249
writeFastaSequence(FILE * outfile,const char * str)250 static void writeFastaSequence(FILE * outfile, const char * str)
251 {
252 size_t length = strlen(str);
253 size_t start;
254 for (start = 0; start < length; start += 60)
255 velvetFprintf(outfile, "%.60s\n", &str[start]);
256 }
257
convertSequences(ReadSet * rs)258 void convertSequences(ReadSet * rs)
259 {
260 rs->tSequences = newTightStringArrayFromStringArray(rs->sequences,
261 rs->readCount,
262 &rs->tSeqMem);
263 rs->sequences = NULL;
264 }
265
266 // Returns the value of a 32-bit little-endian-stored integer.
int32(const unsigned char * ptr)267 static int int32(const unsigned char * ptr)
268 {
269 int x = ptr[3];
270 x = (x << 8) | ptr[2];
271 x = (x << 8) | ptr[1];
272 x = (x << 8) | ptr[0];
273 return x;
274 }
275
276 // Imports sequences from a fastq file
277 // Memory space allocated within this function.
readSolexaFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)278 static void readSolexaFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
279 {
280 FILE *file = fopen(filename, "r");
281 IDnum counter = 0;
282 const int maxline = 500;
283 char line[500];
284 char readName[500];
285 char readSeq[500];
286 char str[100];
287 Coordinate start;
288
289 if (strcmp(filename, "-"))
290 file = fopen(filename, "r");
291 else
292 file = stdin;
293
294 if (file != NULL)
295 velvetLog("Reading Solexa file %s\n", filename);
296 else
297 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
298
299 while (fgets(line, maxline, file) != NULL)
300 if (strchr(line, '.') == NULL) {
301 sscanf(line, "%s\t%*i\t%*i\t%*i\t%*c%[^\n]",
302 readName, readSeq);
303 velvetFprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
304 velvetifySequence(readSeq);
305 start = 0;
306 while (start <= (Coordinate)strlen(readSeq)) {
307 strncpy(str, readSeq + start, 60);
308 str[60] = '\0';
309 velvetFprintf(outfile, "%s\n", str);
310 start += 60;
311 }
312
313 counter++;
314 }
315
316 fclose(file);
317
318 velvetLog("%li sequences found\n", (long) counter);
319 velvetLog("Done\n");
320 }
321
readElandFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)322 static void readElandFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
323 {
324 FILE *file = fopen(filename, "r");
325 IDnum counter = 0;
326 const int maxline = 5000;
327 char line[5000];
328 char readName[5000];
329 char readSeq[5000];
330 char str[100];
331 Coordinate start;
332
333 if (strcmp(filename, "-"))
334 file = fopen(filename, "r");
335 else
336 file = stdin;
337
338 if (file != NULL)
339 velvetLog("Reading Solexa file %s\n", filename);
340 else
341 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
342
343 // Reopen file and memorize line:
344 while (fgets(line, maxline, file) != NULL) {
345 sscanf(line, "%[^\t]\t%[^\t\n]",
346 readName, readSeq);
347 velvetFprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
348 velvetifySequence(readSeq);
349 start = 0;
350 while (start <= (Coordinate)strlen(readSeq)) {
351 strncpy(str, readSeq + start, 60);
352 str[60] = '\0';
353 velvetFprintf(outfile, "%s\n", str);
354 start += 60;
355 }
356
357 counter++;
358 }
359
360 fclose(file);
361
362 velvetLog("%li sequences found\n", (long) counter);
363 velvetLog("Done\n");
364 }
365
goToEndOfLine(char * line,FILE * file)366 void goToEndOfLine(char *line, FILE * file)
367 {
368 size_t length = strlen(line);
369 char c = line[length - 1];
370
371 while (c != '\n')
372 c = fgetc(file);
373 }
374
375 // Imports sequences from a fastq file
376 // Memory space allocated within this function.
readFastQFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)377 static void readFastQFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
378 {
379 FILE *file;
380 const int maxline = 5000;
381 char line[5000];
382 char str[100];
383 IDnum counter = 0;
384 Coordinate start, i;
385 char c;
386
387 if (strcmp(filename, "-"))
388 file = fopen(filename, "r");
389 else
390 file = stdin;
391
392 if (file != NULL)
393 velvetLog("Reading FastQ file %s\n", filename);
394 else
395 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
396
397 // Checking if FastQ
398 c = getc(file);
399 if (c != EOF && c != '@')
400 exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
401 ungetc(c, file);
402
403 while(fgets(line, maxline, file)) {
404
405 for (i = strlen(line) - 1;
406 i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
407 line[i] = '\0';
408 }
409
410 velvetFprintf(outfile,">%s\t%ld\t%d\n", line + 1, (long) ((*sequenceIndex)++), (int) cat);
411 counter++;
412
413 if(!fgets(line, maxline, file))
414 exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
415
416 velvetifySequence(line);
417
418 start = 0;
419 while (start <= (Coordinate)strlen(line)) {
420 strncpy(str, line + start, 60);
421 str[60] = '\0';
422 velvetFprintf(outfile, "%s\n", str);
423 start += 60;
424 }
425
426 if(!fgets(line, maxline, file))
427 exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
428 if(!fgets(line, maxline, file))
429 exitErrorf(EXIT_FAILURE, true, "%s incomplete.", filename);
430 }
431
432 fclose(file);
433 velvetLog("%li reads found.\n", (long) counter);
434 velvetLog("Done\n");
435 }
436
437 // Imports sequences from a raw sequence file
438 // Memory space allocated within this function.
readRawFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)439 static void readRawFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
440 {
441 FILE *file;
442 const int maxline = 5000;
443 char line[5000];
444 char str[100];
445 IDnum counter = 0;
446 Coordinate start;
447
448 if (strcmp(filename, "-"))
449 file = fopen(filename, "r");
450 else
451 file = stdin;
452
453 if (file != NULL)
454 velvetLog("Reading raw file %s\n", filename);
455 else
456 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
457
458 while(fgets(line, maxline, file)) {
459 velvetFprintf(outfile,">RAW\t%ld\t%d\n", (long) ((*sequenceIndex)++), (int) cat);
460 counter++;
461
462 if (strlen(line) >= maxline - 1) {
463 velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
464 #ifdef DEBUG
465 abort();
466 #endif
467 exit(1);
468 }
469 velvetifySequence(line);
470 start = 0;
471 while (start <= (Coordinate)strlen(line)) {
472 strncpy(str, line + start, 60);
473 str[60] = '\0';
474 velvetFprintf(outfile, "%s\n", str);
475 start += 60;
476 }
477 }
478
479 fclose(file);
480 velvetLog("%li reads found.\n", (long) counter);
481 velvetLog("Done\n");
482 }
483
484 // Imports sequences from a zipped rfastq file
485 // Memory space allocated within this function.
readFastQGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)486 static void readFastQGZFile(FILE * outfile, char *filename, Category cat, IDnum *sequenceIndex)
487 {
488 gzFile file;
489 const int maxline = 5000;
490 char line[5000];
491 char str[100];
492 IDnum counter = 0;
493 Coordinate start, i;
494 char c;
495
496 if (strcmp(filename, "-"))
497 file = gzopen(filename, "rb");
498 else {
499 file = gzdopen(fileno(stdin), "rb");
500 SET_BINARY_MODE(stdin);
501 }
502
503 if (file != NULL)
504 velvetLog("Reading FastQ file %s\n", filename);
505 else
506 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
507
508 // Checking if FastQ
509 c = gzgetc(file);
510 if (c != EOF && c != '@')
511 exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastQ format", filename);
512 gzungetc(c, file);
513
514 while (gzgets(file, line, maxline)) {
515 for (i = strlen(line) - 1;
516 i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
517 line[i] = '\0';
518 }
519
520 velvetFprintf(outfile,">%s\t%ld\t%d\n", line + 1, (long) ((*sequenceIndex)++), (int) cat);
521 counter++;
522
523 gzgets(file, line, maxline);
524
525 velvetifySequence(line);
526
527 start = 0;
528 while (start <= (Coordinate)strlen(line)) {
529 strncpy(str, line + start, 60);
530 str[60] = '\0';
531 velvetFprintf(outfile, "%s\n", str);
532 start += 60;
533 }
534
535 gzgets(file, line, maxline);
536 gzgets(file, line, maxline);
537 }
538
539 gzclose(file);
540 velvetLog("%li reads found.\n", (long) counter);
541 velvetLog("Done\n");
542 }
543
544 // Imports sequences from a zipped raw file
545 // Memory space allocated within this function.
readRawGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)546 static void readRawGZFile(FILE * outfile, char *filename, Category cat, IDnum *sequenceIndex)
547 {
548 gzFile file;
549 const int maxline = 5000;
550 char line[5000];
551 char str[100];
552 IDnum counter = 0;
553 Coordinate start;
554
555 if (strcmp(filename, "-"))
556 file = gzopen(filename, "rb");
557 else {
558 file = gzdopen(fileno(stdin), "rb");
559 SET_BINARY_MODE(stdin);
560 }
561
562 if (file != NULL)
563 velvetLog("Reading zipped raw sequence file %s\n", filename);
564 else
565 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
566
567 while (gzgets(file, line, maxline)) {
568 velvetFprintf(outfile,">RAW\t%ld\t%d\n", (long) ((*sequenceIndex)++), (int) cat);
569 counter++;
570
571 if (strlen(line) >= maxline - 1) {
572 velvetLog("Raw sequence files cannot contain reads longer than %i bp\n", maxline - 1);
573 #ifdef DEBUG
574 abort();
575 #endif
576 exit(1);
577 }
578
579 velvetifySequence(line);
580
581 start = 0;
582 while (start <= (Coordinate)strlen(line)) {
583 strncpy(str, line + start, 60);
584 str[60] = '\0';
585 velvetFprintf(outfile, "%s\n", str);
586 start += 60;
587 }
588 }
589
590 gzclose(file);
591 velvetLog("%li reads found.\n", (long) counter);
592 velvetLog("Done\n");
593 }
594
fillReferenceCoordinateTable(char * filename,ReferenceCoordinateTable * refCoords,IDnum counter)595 static void fillReferenceCoordinateTable(char *filename, ReferenceCoordinateTable * refCoords, IDnum counter)
596 {
597 FILE *file;
598 const int maxline = 5000;
599 char line[5000];
600 char * name;
601 long long start, finish;
602 Coordinate i;
603 IDnum index = 0;
604
605 if (strcmp(filename, "-"))
606 file = fopen(filename, "r");
607 else
608 file = stdin;
609
610 if (counter == 0)
611 return;
612
613 resizeReferenceCoordinateTable(refCoords,counter);
614
615 while (fgets(line, maxline, file) && index < counter) {
616 if (line[0] == '>') {
617 name = callocOrExit(strlen(line), char);
618
619 if (strchr(line, ':')) {
620 sscanf(strtok(line, ":-\r\n"), ">%s", name);
621 sscanf(strtok(NULL, ":-\r\n"), "%lli", &start);
622 sscanf(strtok(NULL, ":-\r\n"), "%lli", &finish);
623 if (start <= finish)
624 addReferenceCoordinate(refCoords, name, start, finish, true);
625 else
626 addReferenceCoordinate(refCoords, name, finish, start, false);
627 } else {
628 for (i = strlen(line) - 1;
629 i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
630 line[i] = '\0';
631 }
632
633 strcpy(name, line + 1);
634 addReferenceCoordinate(refCoords, name, 1, -1, true);
635 }
636
637 index++;
638 }
639 }
640
641 sortReferenceCoordinateTable(refCoords);
642 }
643
644
645 // Imports sequences from a fasta file
646 // Memory is allocated within the function
readFastAFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex,ReferenceCoordinateTable * refCoords)647 static void readFastAFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex, ReferenceCoordinateTable * refCoords)
648 {
649 FILE *file;
650 const int maxline = 5000;
651 char line[5000];
652 char str[100];
653 IDnum counter = 0;
654 Coordinate i;
655 char c;
656 Coordinate start;
657 int offset = 0;
658
659 if (strcmp(filename, "-"))
660 file = fopen(filename, "r");
661 else
662 file = stdin;
663
664 if (file != NULL)
665 velvetLog("Reading FastA file %s;\n", filename);
666 else
667 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
668
669 // Checking if FastA
670 c = getc(file);
671 if (c != EOF && c != '>')
672 exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
673 ungetc(c, file);
674
675 while (fgets(line, maxline, file)) {
676 if (line[0] == '>') {
677 if (offset != 0) {
678 velvetFprintf(outfile, "\n");
679 offset = 0;
680 }
681
682 for (i = strlen(line) - 1;
683 i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
684 line[i] = '\0';
685 }
686
687 velvetFprintf(outfile,"%s\t%ld\t%d\n", line, (long) ((*sequenceIndex)++), (int) cat);
688 counter++;
689 } else {
690 velvetifySequence(line);
691 start = 0;
692 while (start < (Coordinate)strlen(line)) {
693 strncpy(str, line + start, 60 - offset);
694 str[60 - offset] = '\0';
695 velvetFprintf(outfile, "%s", str);
696 offset += strlen(str);
697 if (offset >= 60) {
698 velvetFprintf(outfile, "\n");
699 offset = 0;
700 }
701 start += strlen(str);
702 }
703 }
704 }
705
706 if (offset != 0)
707 velvetFprintf(outfile, "\n");
708 fclose(file);
709
710 if (cat == REFERENCE)
711 fillReferenceCoordinateTable(filename, refCoords, counter);
712
713 velvetLog("%li sequences found\n", (long) counter);
714 velvetLog("Done\n");
715 }
716
717 // Imports sequences from a zipped fasta file
718 // Memory is allocated within the function
readFastAGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)719 static void readFastAGZFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
720 {
721 gzFile file;
722 const int maxline = 5000;
723 char line[5000];
724 char str[100];
725 IDnum counter = 0;
726 Coordinate i, start;
727 char c;
728 int offset = 0;
729
730 if (strcmp(filename, "-"))
731 file = gzopen(filename, "rb");
732 else {
733 file = gzdopen(fileno(stdin), "rb");
734 SET_BINARY_MODE(stdin);
735 }
736
737 if (file != NULL)
738 velvetLog("Reading zipped FastA file %s;\n", filename);
739 else
740 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
741
742 // Checking if FastA
743 c = gzgetc(file);
744 if (c != EOF && c != '>')
745 exitErrorf(EXIT_FAILURE, false, "%s does not seem to be in FastA format", filename);
746 gzungetc(c, file);
747
748 while (gzgets(file, line, maxline)) {
749 if (line[0] == '>') {
750 if (offset != 0) {
751 velvetFprintf(outfile, "\n");
752 offset = 0;
753 }
754
755 for (i = strlen(line) - 1;
756 i >= 0 && (line[i] == '\n' || line[i] == '\r'); i--) {
757 line[i] = '\0';
758 }
759
760 velvetFprintf(outfile, "%s\t%ld\t%d\n", line, (long) ((*sequenceIndex)++), (int) cat);
761 counter++;
762 } else {
763 velvetifySequence(line);
764
765 start = 0;
766 while (start < (Coordinate)strlen(line)) {
767 strncpy(str, line + start, 60 - offset);
768 str[60 - offset] = '\0';
769 velvetFprintf(outfile, "%s", str);
770 offset += strlen(str);
771 if (offset >= 60) {
772 velvetFprintf(outfile, "\n");
773 offset = 0;
774 }
775 start += strlen(str);
776 }
777 }
778 }
779
780 if (offset != 0)
781 velvetFprintf(outfile, "\n");
782 gzclose(file);
783
784 velvetLog("%li sequences found\n", (long) counter);
785 velvetLog("Done\n");
786 }
787
788 // Parser for new output
readMAQGZFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex)789 static void readMAQGZFile(FILE* outfile, char *filename, Category cat, IDnum * sequenceIndex)
790 {
791 gzFile file;
792 const int maxline = 1000;
793 char line[1000];
794 IDnum counter = 0;
795 char readName[500];
796 char readSeq[500];
797 char str[100];
798 Coordinate start;
799
800 if (strcmp(filename, "-"))
801 file = gzopen(filename, "rb");
802 else {
803 file = gzdopen(fileno(stdin), "rb");
804 SET_BINARY_MODE(stdin);
805 }
806
807 if (file != NULL)
808 velvetLog("Reading zipped MAQ file %s\n", filename);
809 else
810 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
811
812 // Reopen file and memorize line:
813 while (gzgets(file, line, maxline)) {
814 sscanf(line, "%s\t%*i\t%*i\t%*c\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%*i\t%[^\t]",
815 readName, readSeq);
816 velvetFprintf(outfile, ">%s\t%ld\t%d\n", readName, (long) ((*sequenceIndex)++), (int) cat);
817 velvetifySequence(readSeq);
818 start = 0;
819 while (start <= (Coordinate)strlen(readSeq)) {
820 strncpy(str, readSeq + start, 60);
821 str[60] = '\0';
822 velvetFprintf(outfile, "%s\n", str);
823 start += 60;
824 }
825
826 counter++;
827 }
828
829 gzclose(file);
830
831 velvetLog("%li sequences found\n", (long) counter);
832 velvetLog("Done\n");
833 }
834
readSAMFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex,ReferenceCoordinateTable * refCoords)835 static void readSAMFile(FILE *outfile, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
836 {
837 char line[5000];
838 unsigned long lineno, readCount;
839 char previous_qname_pairing[10];
840 char previous_qname[5000];
841 char previous_seq[5000];
842 char previous_rname[5000];
843 char previous_cigar[5000];
844 long long previous_pos = -1;
845 int previous_orientation = 0;
846 boolean previous_paired = false;
847 ReferenceCoordinate * refCoord;
848 char c;
849 int cigar_index;
850 long long cigar_num;
851
852 if (cat == REFERENCE) {
853 velvetLog("SAM file %s cannot contain reference sequences.\n", filename);
854 velvetLog("Please check the command line.\n");
855 #ifdef DEBUG
856 abort();
857 #endif
858 exit(1);
859 }
860
861 FILE *file = (strcmp(filename, "-") != 0)? fopen(filename, "r") : stdin;
862 if (file)
863 velvetLog("Reading SAM file %s\n", filename);
864 else
865 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
866
867 readCount = 0;
868 for (lineno = 1; fgets(line, sizeof(line), file); lineno++)
869 if (line[0] != '@') {
870 char *qname, *flag, *seq, *rname, *cigar;
871 long long pos;
872 int orientation;
873 int i;
874
875 qname = strtok(line, "\t");
876 flag = strtok(NULL, "\t");
877 rname = strtok(NULL, "\t");
878 sscanf(strtok(NULL, "\t"), "%lli", &pos);
879 orientation = 1;
880
881 // Mapping scor
882 (void) strtok(NULL, "\t");
883 cigar = strtok(NULL, "\t");
884
885 // Columns 7,8,9 are paired name, position and score
886 for (i = 7; i < 10; i++)
887 (void) strtok(NULL, "\t");
888 seq = strtok(NULL, "\t");
889
890 if (seq == NULL) {
891 velvetFprintf(stderr,
892 "Line #%lu: ignoring SAM record with too few fields\n",
893 lineno);
894 }
895 else if (strcmp(seq, "*") == 0) {
896 velvetFprintf(stderr,
897 "Line #%lu: ignoring SAM record with omitted SEQ field\n",
898 lineno);
899 }
900 else {
901 // Accept flags represented in either decimal or hex:
902 int flagbits = strtol(flag, NULL, 0);
903
904 if (flagbits & 0x4)
905 strcpy(rname, "");
906
907 const char *qname_pairing = "";
908 if (flagbits & 0x40)
909 qname_pairing = "/1";
910 else if (flagbits & 0x80)
911 qname_pairing = "/2";
912
913 if (flagbits & 0x10) {
914 orientation = -1;
915 reverseComplementSequence(seq);
916 }
917
918 // Determine if paired to previous read
919 if (readCount > 0) {
920 if (cat % 2) {
921 if (previous_paired) {
922 // Last read paired to penultimate read
923 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
924 (long) ((*sequenceIndex)++), (int) cat);
925 writeFastaSequence(outfile, previous_seq);
926 previous_paired = false;
927 } else if (strcmp(qname, previous_qname) == 0 && strcmp(qname_pairing, previous_qname_pairing) == 0) {
928 // New multi-mapping issue
929 previous_paired = false;
930 } else if (strcmp(qname, previous_qname) == 0) {
931 // Last read paired to current reads
932 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
933 (long) ((*sequenceIndex)++), (int) cat);
934 writeFastaSequence(outfile, previous_seq);
935 previous_paired = true;
936 } else {
937 // Last read unpaired
938 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
939 (long) ((*sequenceIndex)++), (int) cat - 1);
940 writeFastaSequence(outfile, previous_seq);
941 previous_paired = false;
942 }
943 } else {
944 // Unpaired dataset
945 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
946 (long) ((*sequenceIndex)++), (int) cat);
947 writeFastaSequence(outfile, previous_seq);
948 }
949
950 if ((refCoord = findReferenceCoordinate(refCoords, previous_rname, (Coordinate) previous_pos, (Coordinate) previous_pos + strlen(previous_seq) - 1, previous_orientation))) {
951 if (strlen(previous_cigar) == 1 && previous_cigar[0] == '*')
952 ;
953 else {
954 cigar_num = 0;
955 for (cigar_index = 0; cigar_index < (int)strlen(previous_cigar); cigar_index++) {
956 c = previous_cigar[cigar_index];
957 if (c == 'M' || c == '=' || c == 'X') {
958 if (refCoord->finish < 0 || previous_pos < refCoord->finish) {
959 if (refCoord->positive_strand) {
960 velvetFprintf(outfile, "M\t%li\t%lli\n", (long) previous_orientation * refCoord->referenceID, (long long) (previous_pos - refCoord->start));
961 } else
962 velvetFprintf(outfile, "M\t%li\t%lli\n", (long) - previous_orientation * refCoord->referenceID, (long long) (refCoord->finish - previous_pos - strlen(previous_seq)));
963 }
964 cigar_num = 0;
965 } else if (c == 'S' || c == 'I') {
966 previous_pos -= cigar_num;
967 cigar_num = 0;
968 } else if (c == 'D' || c == 'N') {
969 previous_pos += cigar_num;
970 cigar_num = 0;
971 } else if (c == 'H' || c == 'P') {
972 cigar_num = 0;
973 } else if (isdigit(c)) {
974 cigar_num = 10 * cigar_num + (c - 48);
975 } else {
976 abort();
977 }
978 }
979 }
980 }
981 }
982
983 strcpy(previous_qname, qname);
984 strcpy(previous_qname_pairing, qname_pairing);
985 strcpy(previous_seq, seq);
986 strcpy(previous_rname, rname);
987 strcpy(previous_cigar, cigar);
988 previous_pos = pos;
989 previous_orientation = orientation;
990 velvetifySequence(previous_seq);
991
992 readCount++;
993 }
994 }
995
996 if (readCount) {
997 if (cat % 2) {
998 if (previous_paired) {
999 // Last read paired to penultimate read
1000 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1001 (long) ((*sequenceIndex)++), (int) cat);
1002 writeFastaSequence(outfile, previous_seq);
1003 } else {
1004 // Last read unpaired
1005 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1006 (long) ((*sequenceIndex)++), (int) cat - 1);
1007 writeFastaSequence(outfile, previous_seq);
1008 }
1009 } else {
1010 // Unpaired dataset
1011 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1012 (long) ((*sequenceIndex)++), (int) cat);
1013 writeFastaSequence(outfile, previous_seq);
1014 }
1015
1016 if ((refCoord = findReferenceCoordinate(refCoords, previous_rname, (Coordinate) previous_pos, (Coordinate) previous_pos + strlen(previous_seq) - 1, previous_orientation))) {
1017 if (strlen(previous_cigar) == 1 && previous_cigar[0] == '*')
1018 ;
1019 else {
1020 cigar_num = 0;
1021 for (cigar_index = 0; cigar_index < (int)strlen(previous_cigar); cigar_index++) {
1022 c = previous_cigar[cigar_index];
1023 if (c == 'M' || c == '=' || c == 'X') {
1024 if (refCoord->finish < 0 || previous_pos < refCoord->finish) {
1025 if (refCoord->positive_strand) {
1026 velvetFprintf(outfile, "M\t%li\t%lli\n", (long) previous_orientation * refCoord->referenceID, (long long) (previous_pos - refCoord->start));
1027 } else
1028 velvetFprintf(outfile, "M\t%li\t%lli\n", (long) - previous_orientation * refCoord->referenceID, (long long) (refCoord->finish - previous_pos - strlen(previous_seq)));
1029 }
1030 cigar_num = 0;
1031 } else if (c == 'S' || c == 'I') {
1032 previous_pos -= cigar_num;
1033 cigar_num = 0;
1034 } else if (c == 'D' || c == 'N') {
1035 previous_pos += cigar_num;
1036 cigar_num = 0;
1037 } else if (c == 'H' || c == 'P') {
1038 cigar_num = 0;
1039 } else if (isdigit(c)) {
1040 cigar_num = 10 * cigar_num + (c - 48);
1041 } else {
1042 abort();
1043 }
1044 }
1045 }
1046 }
1047 }
1048
1049 fclose(file);
1050 velvetLog("%lu reads found.\n", readCount);
1051 velvetLog("Done\n");
1052 }
1053
readBAMint32(gzFile file)1054 static int readBAMint32(gzFile file)
1055 {
1056 unsigned char buffer[4];
1057 if (gzread(file, buffer, 4) != 4)
1058 exitErrorf(EXIT_FAILURE, false, "BAM file header truncated");
1059
1060 return int32(buffer);
1061 }
1062
readBAMFile(FILE * outfile,char * filename,Category cat,IDnum * sequenceIndex,ReferenceCoordinateTable * refCoords)1063 static void readBAMFile(FILE *outfile, char *filename, Category cat, IDnum *sequenceIndex, ReferenceCoordinateTable * refCoords)
1064 {
1065 size_t seqCapacity = 0;
1066 char *seq = NULL;
1067 char cigar[5000];
1068 char cigar_buffer[5000];
1069 size_t bufferCapacity = 4;
1070 unsigned char *buffer = mallocOrExit(bufferCapacity, unsigned char);
1071 unsigned long recno, readCount;
1072 int i, refCount;
1073 gzFile file;
1074 char previous_qname_pairing[10];
1075 char previous_qname[5000];
1076 char previous_seq[5000];
1077 char previous_cigar[5000];
1078 int previous_rID = 0;
1079 long long previous_pos = -1;
1080 int previous_orientation = 0;
1081 boolean previous_paired = false;
1082 int previous_flagbits = 0;
1083 char ** refNames;
1084 ReferenceCoordinate * refCoord;
1085 char c;
1086 int cigar_index;
1087 long long cigar_num;
1088
1089 if (cat == REFERENCE) {
1090 velvetLog("BAM file %s cannot contain reference sequences.\n", filename);
1091 velvetLog("Please check the command line.\n");
1092 #ifdef DEBUG
1093 abort();
1094 #endif
1095 exit(1);
1096 }
1097
1098 if (strcmp(filename, "-") != 0)
1099 file = gzopen(filename, "rb");
1100 else {
1101 file = gzdopen(fileno(stdin), "rb");
1102 SET_BINARY_MODE(stdin);
1103 }
1104
1105 if (file != NULL)
1106 velvetLog("Reading BAM file %s\n", filename);
1107 else
1108 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1109
1110 if (! (gzread(file, buffer, 4) == 4 && memcmp(buffer, "BAM\1", 4) == 0))
1111 exitErrorf(EXIT_FAILURE, false, "%s is not in BAM format", filename);
1112
1113 // Skip header text
1114 if (gzseek(file, readBAMint32(file), SEEK_CUR) == -1)
1115 exitErrorf(EXIT_FAILURE, false, "gzseek failed");
1116
1117 // Skip header reference list
1118 refCount = readBAMint32(file);
1119 refNames = callocOrExit(refCount, char *);
1120 for (i = 0; i < refCount; i++) {
1121 int strLength;
1122
1123 if (gzread(file, buffer, 4) != 4)
1124 exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1125
1126 strLength = int32(buffer);
1127 refNames[i] = callocOrExit(strLength, char);
1128
1129 if (bufferCapacity < (size_t)(4 + strLength)) {
1130 bufferCapacity = 4 + strLength + 4096;
1131 buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1132 }
1133
1134 if (gzread(file, buffer, 4 + strLength) != 4 + strLength)
1135 exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1136
1137 strcpy(refNames[i], (char *) buffer);
1138 }
1139
1140 readCount = 0;
1141 for (recno = 1; gzread(file, buffer, 4) == 4; recno++) {
1142 int blockSize = int32(buffer);
1143 int readLength;
1144
1145 if (bufferCapacity < (size_t)(4 + blockSize)) {
1146 bufferCapacity = 4 + blockSize + 4096;
1147 buffer = reallocOrExit(buffer, bufferCapacity, unsigned char);
1148 }
1149
1150 if (gzread(file, &buffer[4], blockSize) != blockSize)
1151 exitErrorf(EXIT_FAILURE, false, "BAM alignment record truncated");
1152
1153 readLength = int32(&buffer[20]);
1154 if (readLength == 0) {
1155 velvetFprintf(stderr,
1156 "Record #%lu: ignoring BAM record with omitted SEQ field\n",
1157 recno);
1158 }
1159 else {
1160 int readNameLength = buffer[12];
1161 int flag_nc = int32(&buffer[16]);
1162 int flagbits = flag_nc >> 16;
1163 int cigarLength = flag_nc & 0xffff;
1164 char *qname = (char *)&buffer[36];
1165 uint32_t *rawcigar = (uint32_t *) &buffer[36 + readNameLength];
1166 unsigned char *rawseq =
1167 &buffer[36 + readNameLength + 4 * cigarLength];
1168 int rID = int32(&buffer[4]);
1169 // NOTE: BAM file coords are 0-based, not 1-based like SAM files
1170 // No comment
1171 long long pos = int32(&buffer[8]) + 1;
1172 int orientation = 1;
1173
1174 const char *qname_pairing = "";
1175 if (flagbits & 0x40)
1176 qname_pairing = "/1";
1177 else if (flagbits & 0x80)
1178 qname_pairing = "/2";
1179
1180 strcpy(cigar, "");
1181 for (i = 0; i < cigarLength; i++) {
1182 static const char decode_ops[] = "MIDNSHP=X";
1183 uint32_t packed = *(rawcigar++);
1184 sprintf(cigar_buffer, "%i%c", packed >> 4, decode_ops[packed & 0xf]);
1185 strcat(cigar, cigar_buffer);
1186 }
1187
1188 if (seqCapacity < (size_t)(readLength + 1)) {
1189 seqCapacity = readLength * 2 + 1;
1190 seq = reallocOrExit(seq, seqCapacity, char);
1191 }
1192
1193 for (i = 0; i < readLength; i += 2) {
1194 static const char decode_bases[] = "=ACMGRSVTWYHKDBN";
1195 unsigned int packed = *(rawseq++);
1196 seq[i] = decode_bases[packed >> 4];
1197 seq[i+1] = decode_bases[packed & 0xf];
1198 }
1199 seq[readLength] = '\0';
1200
1201 if (flagbits & 0x10) {
1202 orientation = -1;
1203 reverseComplementSequence(seq);
1204 }
1205
1206 // Determine if paired to previous read
1207 if (readCount > 0) {
1208 if (cat % 2) {
1209 if (previous_paired) {
1210 // Last read paired to penultimate read
1211 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1212 (long) ((*sequenceIndex)++), (int) cat);
1213 writeFastaSequence(outfile, previous_seq);
1214 previous_paired = false;
1215 } else if (strcmp(qname, previous_qname) == 0 && strcmp(qname_pairing, previous_qname_pairing) == 0) {
1216 // New multi-mapping issue
1217 previous_paired = false;
1218 } else if (strcmp(qname, previous_qname) == 0) {
1219 // Last read paired to current reads
1220 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1221 (long) ((*sequenceIndex)++), (int) cat);
1222 writeFastaSequence(outfile, previous_seq);
1223 previous_paired = true;
1224 } else {
1225 // Last read unpaired
1226 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1227 (long) ((*sequenceIndex)++), (int) cat - 1);
1228 writeFastaSequence(outfile, previous_seq);
1229 previous_paired = false;
1230 }
1231 } else {
1232 // Unpaired dataset
1233 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1234 (long) ((*sequenceIndex)++), (int) cat);
1235 writeFastaSequence(outfile, previous_seq);
1236 }
1237
1238 if (!(previous_flagbits & 0x4) && (refCoord = findReferenceCoordinate(refCoords, refNames[previous_rID], (Coordinate) previous_pos, (Coordinate) previous_pos + strlen(previous_seq) - 1, previous_orientation))) {
1239 if (strlen(previous_cigar) == 1 && previous_cigar[0] == '*')
1240 ;
1241 else {
1242 cigar_num = 0;
1243 for (cigar_index = 0; cigar_index < (int)strlen(previous_cigar); cigar_index++) {
1244 c = previous_cigar[cigar_index];
1245 if (c == 'M' || c == '=' || c == 'X') {
1246 if (refCoord->finish < 0 || previous_pos < refCoord->finish) {
1247 if (refCoord->positive_strand) {
1248 velvetFprintf(outfile, "M\t%li\t%lli\n", (long) previous_orientation * refCoord->referenceID, (long long) (previous_pos - refCoord->start));
1249 } else
1250 velvetFprintf(outfile, "M\t%li\t%lli\n", (long) - previous_orientation * refCoord->referenceID, (long long) (refCoord->finish - previous_pos - strlen(previous_seq)));
1251 }
1252 cigar_num = 0;
1253 } else if (c == 'S' || c == 'I') {
1254 previous_pos -= cigar_num;
1255 cigar_num = 0;
1256 } else if (c == 'D' || c == 'N') {
1257 previous_pos += cigar_num;
1258 cigar_num = 0;
1259 } else if (c == 'H' || c == 'P') {
1260 cigar_num = 0;
1261 } else if (isdigit(c)) {
1262 cigar_num = 10 * cigar_num + (c - 48);
1263 } else {
1264 abort();
1265 }
1266 }
1267 }
1268 }
1269 }
1270
1271 strcpy(previous_qname, qname);
1272 strcpy(previous_qname_pairing, qname_pairing);
1273 strcpy(previous_seq, seq);
1274 strcpy(previous_cigar, cigar);
1275 previous_rID = rID;
1276 previous_pos = pos;
1277 previous_orientation = orientation;
1278 previous_flagbits = flagbits;
1279 velvetifySequence(previous_seq);
1280
1281 readCount++;
1282 }
1283 }
1284
1285 if (readCount) {
1286 if (cat % 2) {
1287 if (previous_paired) {
1288 // Last read paired to penultimate read
1289 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1290 (long) ((*sequenceIndex)++), (int) cat);
1291 writeFastaSequence(outfile, previous_seq);
1292 } else {
1293 // Last read unpaired
1294 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1295 (long) ((*sequenceIndex)++), (int) cat - 1);
1296 writeFastaSequence(outfile, previous_seq);
1297 }
1298 } else {
1299 // Unpaired dataset
1300 velvetFprintf(outfile, ">%s%s\t%ld\t%d\n", previous_qname, previous_qname_pairing,
1301 (long) ((*sequenceIndex)++), (int) cat);
1302 writeFastaSequence(outfile, previous_seq);
1303 }
1304
1305 if (!(previous_flagbits & 0x4) && (refCoord = findReferenceCoordinate(refCoords, refNames[previous_rID], (Coordinate) previous_pos, (Coordinate) previous_pos + strlen(previous_seq) - 1, previous_orientation))) {
1306 if (strlen(previous_cigar) == 1 && previous_cigar[0] == '*')
1307 ;
1308 else {
1309 cigar_num = 0;
1310 for (cigar_index = 0; cigar_index < (int)strlen(previous_cigar); cigar_index++) {
1311 c = previous_cigar[cigar_index];
1312 if (c == 'M' || c == '=' || c == 'X') {
1313 if (refCoord->finish < 0 || previous_pos < refCoord->finish) {
1314 if (refCoord->positive_strand) {
1315 velvetFprintf(outfile, "M\t%li\t%lli\n", (long) previous_orientation * refCoord->referenceID, (long long) (previous_pos - refCoord->start));
1316 } else
1317 velvetFprintf(outfile, "M\t%li\t%lli\n", (long) - previous_orientation * refCoord->referenceID, (long long) (refCoord->finish - previous_pos - strlen(previous_seq)));
1318 }
1319 cigar_num = 0;
1320 } else if (c == 'S' || c == 'I') {
1321 previous_pos -= cigar_num;
1322 cigar_num = 0;
1323 } else if (c == 'D' || c == 'N') {
1324 previous_pos += cigar_num;
1325 cigar_num = 0;
1326 } else if (c == 'H' || c == 'P') {
1327 cigar_num = 0;
1328 } else if (isdigit(c)) {
1329 cigar_num = 10 * cigar_num + (c - 48);
1330 } else {
1331 abort();
1332 }
1333 }
1334 }
1335 }
1336 }
1337
1338 free(seq);
1339 free(buffer);
1340
1341 gzclose(file);
1342 velvetLog("%lu reads found.\n", readCount);
1343 velvetLog("Done\n");
1344 }
1345
printUsage()1346 static void printUsage()
1347 {
1348 puts("Usage:");
1349 puts("./velveth directory hash_length {[-file_format][-read_type] filename} [options]");
1350 puts("");
1351 puts("\tdirectory\t\t: directory name for output files");
1352 printf("\thash_length\t\t: odd integer (if even, it will be decremented) <= %i (if above, will be reduced)\n", MAXKMERLENGTH);
1353 puts("\tfilename\t\t: path to sequence file or - for standard input");
1354 puts("");
1355 puts("File format options:");
1356 puts("\t-fasta");
1357 puts("\t-fastq");
1358 puts("\t-raw");
1359 puts("\t-fasta.gz");
1360 puts("\t-fastq.gz");
1361 puts("\t-raw.gz");
1362 puts("\t-sam");
1363 puts("\t-bam");
1364 puts("\t-eland");
1365 puts("\t-gerald");
1366 puts("");
1367 puts("Read type options:");
1368 puts("\t-short");
1369 puts("\t-shortPaired");
1370 puts("\t-short2");
1371 puts("\t-shortPaired2");
1372 puts("\t-long");
1373 puts("\t-longPaired");
1374 puts("\t-reference");
1375 puts("");
1376 puts("Options:");
1377 puts("\t-strand_specific\t: for strand specific transcriptome sequencing data (default: off)");
1378 puts("");
1379 puts("Output:");
1380 puts("\tdirectory/Roadmaps");
1381 puts("\tdirectory/Sequences");
1382 puts("\t\t[Both files are picked up by graph, so please leave them there]");
1383 }
1384
1385 #define FASTQ 1
1386 #define FASTA 2
1387 #define GERALD 3
1388 #define ELAND 4
1389 #define FASTA_GZ 5
1390 #define FASTQ_GZ 6
1391 #define MAQ_GZ 7
1392 #define SAM 8
1393 #define BAM 9
1394 #define RAW 10
1395 #define RAW_GZ 11
1396
1397 // General argument parser for most functions
1398 // Basically a reused portion of toplevel code dumped into here
parseDataAndReadFiles(char * filename,int argc,char ** argv,boolean * double_strand,boolean * noHash)1399 void parseDataAndReadFiles(char * filename, int argc, char **argv, boolean * double_strand, boolean * noHash)
1400 {
1401 int argIndex = 1;
1402 FILE *outfile;
1403 int filetype = FASTA;
1404 Category cat = 0;
1405 IDnum sequenceIndex = 1;
1406 short short_var;
1407 ReferenceCoordinateTable * refCoords = newReferenceCoordinateTable();
1408 boolean reuseSequences = false;
1409
1410 if (argc < 2) {
1411 printUsage();
1412 #ifdef DEBUG
1413 abort();
1414 #endif
1415 exit(1);
1416 }
1417
1418 for (argIndex = 1; argIndex < argc; argIndex++) {
1419 if (strcmp(argv[argIndex], "-strand_specific") == 0) {
1420 *double_strand = false;
1421 reference_coordinate_double_strand = false;
1422 } else if (strcmp(argv[argIndex], "-reuse_Sequences") == 0) {
1423 reuseSequences = true;
1424 } else if (strcmp(argv[argIndex], "-noHash") == 0) {
1425 *noHash = true;
1426 }
1427 }
1428
1429 if (reuseSequences)
1430 return;
1431
1432 outfile = fopen(filename, "w");
1433
1434 for (argIndex = 1; argIndex < argc; argIndex++) {
1435 if (argv[argIndex][0] == '-' && strlen(argv[argIndex]) > 1) {
1436
1437 if (strcmp(argv[argIndex], "-fastq") == 0)
1438 filetype = FASTQ;
1439 else if (strcmp(argv[argIndex], "-fasta") == 0)
1440 filetype = FASTA;
1441 else if (strcmp(argv[argIndex], "-gerald") == 0)
1442 filetype = GERALD;
1443 else if (strcmp(argv[argIndex], "-eland") == 0)
1444 filetype = ELAND;
1445 else if (strcmp(argv[argIndex], "-fastq.gz") == 0)
1446 filetype = FASTQ_GZ;
1447 else if (strcmp(argv[argIndex], "-fasta.gz") == 0)
1448 filetype = FASTA_GZ;
1449 else if (strcmp(argv[argIndex], "-sam") == 0)
1450 filetype = SAM;
1451 else if (strcmp(argv[argIndex], "-bam") == 0)
1452 filetype = BAM;
1453 else if (strcmp(argv[argIndex], "-raw") == 0)
1454 filetype = RAW;
1455 else if (strcmp(argv[argIndex], "-raw.gz") == 0)
1456 filetype = RAW_GZ;
1457 else if (strcmp(argv[argIndex], "-maq.gz") == 0)
1458 filetype = MAQ_GZ;
1459 else if (strcmp(argv[argIndex], "-short") == 0)
1460 cat = 0;
1461 else if (strcmp(argv[argIndex], "-shortPaired") ==
1462 0)
1463 cat = 1;
1464 else if (strncmp
1465 (argv[argIndex], "-shortPaired",
1466 12) == 0) {
1467 sscanf(argv[argIndex], "-shortPaired%hd", &short_var);
1468 cat = (Category) short_var;
1469 if (cat < 1 || cat > CATEGORIES) {
1470 velvetLog("Unknown option: %s\n",
1471 argv[argIndex]);
1472 #ifdef DEBUG
1473 abort();
1474 #endif
1475 exit(1);
1476 }
1477 cat--;
1478 cat *= 2;
1479 cat++;
1480 } else if (strncmp(argv[argIndex], "-short", 6) ==
1481 0) {
1482 sscanf(argv[argIndex], "-short%hd", &short_var);
1483 cat = (Category) short_var;
1484 if (cat < 1 || cat > CATEGORIES) {
1485 velvetLog("Unknown option: %s\n",
1486 argv[argIndex]);
1487 #ifdef DEBUG
1488 abort();
1489 #endif
1490 exit(1);
1491 }
1492 cat--;
1493 cat *= 2;
1494 } else if (strcmp(argv[argIndex], "-long") == 0)
1495 cat = CATEGORIES * 2;
1496 else if (strcmp(argv[argIndex], "-longPaired") ==
1497 0)
1498 cat = CATEGORIES * 2 + 1;
1499 else if (strcmp(argv[argIndex], "-reference") ==
1500 0)
1501 cat = CATEGORIES * 2 + 2;
1502 else if (strcmp(argv[argIndex], "-strand_specific") == 0) {
1503 *double_strand = false;
1504 reference_coordinate_double_strand = false;
1505 } else if (strcmp(argv[argIndex], "-noHash") == 0) {
1506 ;
1507 }
1508 else {
1509 velvetLog("Unknown option: %s\n",
1510 argv[argIndex]);
1511 #ifdef DEBUG
1512 abort();
1513 #endif
1514 exit(1);
1515 }
1516
1517 continue;
1518 }
1519
1520 if (cat == -1)
1521 continue;
1522
1523 switch (filetype) {
1524 case FASTA:
1525 readFastAFile(outfile, argv[argIndex], cat, &sequenceIndex, refCoords);
1526 break;
1527 case FASTQ:
1528 readFastQFile(outfile, argv[argIndex], cat, &sequenceIndex);
1529 break;
1530 case RAW:
1531 readRawFile(outfile, argv[argIndex], cat, &sequenceIndex);
1532 break;
1533 case GERALD:
1534 readSolexaFile(outfile, argv[argIndex], cat, &sequenceIndex);
1535 break;
1536 case ELAND:
1537 readElandFile(outfile, argv[argIndex], cat, &sequenceIndex);
1538 break;
1539 case FASTA_GZ:
1540 readFastAGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1541 break;
1542 case FASTQ_GZ:
1543 readFastQGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1544 break;
1545 case RAW_GZ:
1546 readRawGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1547 break;
1548 case SAM:
1549 readSAMFile(outfile, argv[argIndex], cat, &sequenceIndex, refCoords);
1550 break;
1551 case BAM:
1552 readBAMFile(outfile, argv[argIndex], cat, &sequenceIndex, refCoords);
1553 break;
1554 case MAQ_GZ:
1555 readMAQGZFile(outfile, argv[argIndex], cat, &sequenceIndex);
1556 break;
1557 default:
1558 velvetLog("Screw up in parser... exiting\n");
1559 #ifdef DEBUG
1560 abort();
1561 #endif
1562 exit(1);
1563 }
1564 }
1565
1566 destroyReferenceCoordinateTable(refCoords);
1567 fclose(outfile);
1568 }
1569
createReadPairingArray(ReadSet * reads)1570 void createReadPairingArray(ReadSet* reads)
1571 {
1572 IDnum index;
1573 IDnum *mateReads = mallocOrExit(reads->readCount, IDnum);
1574 Category cat = 0;
1575 int phase = 0;
1576
1577 for (index = 0; index < reads->readCount; index++)
1578 mateReads[index] = -1;
1579
1580 reads->mateReads = mateReads;
1581
1582 for (index = 0; index < reads->readCount; index++)
1583 {
1584 // Paired category
1585 if (cat & 1)
1586 {
1587 // Leaving the paired category
1588 if (reads->categories[index] != cat)
1589 {
1590 if (phase == 1)
1591 {
1592 reads->mateReads[index - 1] = -1;
1593 reads->categories[index - 1]--;
1594 phase = 0;
1595 }
1596 cat = reads->categories[index];
1597 // Into another paired category
1598 if (cat & 1)
1599 {
1600 reads->mateReads[index] = index + 1;
1601 phase = 1;
1602 }
1603 }
1604 else if (phase == 0)
1605 {
1606 reads->mateReads[index] = index + 1;
1607 phase = 1;
1608 }
1609 else
1610 {
1611 reads->mateReads[index] = index - 1;
1612 phase = 0;
1613 }
1614 }
1615 // Leaving an unpaired category
1616 else if (reads->categories[index] != cat)
1617 {
1618 cat = reads->categories[index];
1619 // Into a paired category
1620 if (cat & 1)
1621 {
1622 reads->mateReads[index] = index + 1;
1623 phase = 1;
1624 }
1625 }
1626 }
1627 }
1628
pairedCategories(ReadSet * reads)1629 int pairedCategories(ReadSet * reads)
1630 {
1631 boolean pairedCat[CATEGORIES + 1];
1632 int pairedCatCount = 0;
1633 IDnum index;
1634
1635 for (index = 0; index <= CATEGORIES; index++)
1636 pairedCat[index] = 0;
1637
1638 for (index = 0; index < reads->readCount; index++) {
1639 if (reads->categories[index] & 1 && !pairedCat[reads->categories[index] / 2]) {
1640 pairedCat[reads->categories[index] / 2] = true;
1641 if (pairedCatCount++ == CATEGORIES)
1642 break;
1643 }
1644 }
1645
1646 return pairedCatCount;
1647 }
1648
isSecondInPair(ReadSet * reads,IDnum index)1649 boolean isSecondInPair(ReadSet * reads, IDnum index)
1650 {
1651 return reads->secondInPair[index / 8] & (1 << (index & 7));
1652 }
1653
1654
computeSecondInPair(ReadSet * reads)1655 static void computeSecondInPair(ReadSet * reads)
1656 {
1657 IDnum index;
1658 Category currentCat = 0;
1659 Category previousCat = 0;
1660 int phase = 0;
1661
1662 if (reads->secondInPair)
1663 free (reads->secondInPair);
1664 reads->secondInPair = callocOrExit((reads->readCount + 7) / 8, unsigned char);
1665
1666 for (index = 0; index < reads->readCount; index++)
1667 {
1668 currentCat = reads->categories[index];
1669 if (currentCat & 1)
1670 {
1671 if (previousCat == currentCat)
1672 {
1673 if (phase == 0)
1674 {
1675 phase = 1;
1676 }
1677 else
1678 {
1679 reads->secondInPair[index / 8] |= (1 << (index & 7));
1680 phase = 0;
1681 }
1682 }
1683 else
1684 phase = 1;
1685 }
1686 previousCat = currentCat;
1687 }
1688 }
1689
detachDubiousReads(ReadSet * reads,boolean * dubiousReads)1690 void detachDubiousReads(ReadSet * reads, boolean * dubiousReads)
1691 {
1692 IDnum index;
1693 IDnum pairID;
1694 IDnum sequenceCount = reads->readCount;
1695 IDnum *mateReads = reads->mateReads;
1696
1697 if (dubiousReads == NULL || mateReads == NULL)
1698 return;
1699
1700 for (index = 0; index < sequenceCount; index++) {
1701 if (!dubiousReads[index] || reads->categories[index] % 2 == 0 )
1702 continue;
1703
1704 if (isSecondInPair(reads, index))
1705 pairID = index - 1;
1706 else
1707 pairID = index + 1;
1708
1709 reads->categories[index] = (reads->categories[index] / 2) * 2;
1710 reads->categories[pairID] = (reads->categories[pairID] / 2) * 2;
1711 }
1712 }
1713
importReadSet(char * filename)1714 ReadSet *importReadSet(char *filename)
1715 {
1716 FILE *file = fopen(filename, "r");
1717 char *sequence = NULL;
1718 Coordinate bpCount = 0;
1719 const int maxline = 5000;
1720 char line[5000];
1721 IDnum sequenceCount, sequenceIndex;
1722 ReadSet *reads;
1723 short int temp_short;
1724 int lineLength;
1725
1726 if (file != NULL)
1727 velvetLog("Reading read set file %s;\n", filename);
1728 else
1729 exitErrorf(EXIT_FAILURE, true, "Could not open %s", filename);
1730
1731 reads = newReadSet();
1732
1733 // Count number of separate sequences
1734 sequenceCount = 0;
1735 while (fgets(line, maxline, file) != NULL)
1736 if (line[0] == '>')
1737 sequenceCount++;
1738 fclose(file);
1739 velvetLog("%li sequences found\n", (long) sequenceCount);
1740
1741 reads->readCount = sequenceCount;
1742
1743 if (reads->readCount == 0) {
1744 reads->sequences = NULL;
1745 reads->categories = NULL;
1746 return reads;
1747 }
1748
1749 reads->sequences = callocOrExit(sequenceCount, char *);
1750 reads->categories = callocOrExit(sequenceCount, Category);
1751 // Counting base pair length of each sequence:
1752 file = fopen(filename, "r");
1753 sequenceIndex = -1;
1754 while (fgets(line, maxline, file) != NULL) {
1755 if (line[0] == '>') {
1756
1757 // Reading category info
1758 sscanf(line, "%*[^\t]\t%*[^\t]\t%hd",
1759 &temp_short);
1760 reads->categories[sequenceIndex + 1] = (Category) temp_short;
1761
1762 if (sequenceIndex != -1)
1763 reads->sequences[sequenceIndex] =
1764 mallocOrExit(bpCount + 1, char);
1765 sequenceIndex++;
1766 bpCount = 0;
1767 } if (line[0] == 'M') {;
1768 // Map line
1769 } else {
1770 bpCount += (Coordinate) strlen(line) - 1;
1771
1772 if (sizeof(ShortLength) == sizeof(int16_t) && bpCount > SHRT_MAX) {
1773 velvetLog("Read %li of length %lli, longer than limit %i\n",
1774 (long) sequenceIndex + 1, (long long) bpCount, SHRT_MAX);
1775 velvetLog("You should modify recompile with the LONGSEQUENCES option (cf. manual)\n");
1776 exit(1);
1777 }
1778 }
1779 }
1780
1781 //velvetLog("Sequence %d has length %d\n", sequenceIndex, bpCount);
1782 reads->sequences[sequenceIndex] =
1783 mallocOrExit(bpCount + 1, char);
1784 fclose(file);
1785
1786 // Reopen file and memorize line:
1787 file = fopen(filename, "r");
1788 sequenceIndex = -1;
1789 while (fgets(line, maxline, file)) {
1790 if (line[0] == '>') {
1791 if (sequenceIndex != -1) {
1792 sequence[bpCount] = '\0';
1793 }
1794 sequenceIndex++;
1795 bpCount = 0;
1796 //velvetLog("Starting to read sequence %d\n",
1797 // sequenceIndex);
1798 sequence = reads->sequences[sequenceIndex];
1799 } else if (line[0] == 'M') {;
1800 // Map line
1801 } else {
1802 lineLength = strlen(line) - 1;
1803 strncpy(sequence + bpCount, line, lineLength);
1804 bpCount += (Coordinate) lineLength;
1805 }
1806 }
1807
1808 sequence[bpCount] = '\0';
1809 fclose(file);
1810 computeSecondInPair(reads);
1811
1812 velvetLog("Done\n");
1813 return reads;
1814
1815 }
1816
logInstructions(int argc,char ** argv,char * directory)1817 void logInstructions(int argc, char **argv, char *directory)
1818 {
1819 int index;
1820 char *logFilename =
1821 mallocOrExit(strlen(directory) + 100, char);
1822 FILE *logFile;
1823 time_t date;
1824 char *string;
1825
1826 time(&date);
1827 string = ctime(&date);
1828
1829 strcpy(logFilename, directory);
1830 strcat(logFilename, "/Log");
1831 logFile = fopen(logFilename, "a");
1832
1833 if (logFile == NULL)
1834 exitErrorf(EXIT_FAILURE, true, "Could not write to %s", logFilename);
1835
1836 velvetFprintf(logFile, "%s", string);
1837
1838 for (index = 0; index < argc; index++)
1839 velvetFprintf(logFile, " %s", argv[index]);
1840
1841 velvetFprintf(logFile, "\n");
1842
1843 velvetFprintf(logFile, "Version %i.%i.%2.2i\n", VERSION_NUMBER,
1844 RELEASE_NUMBER, UPDATE_NUMBER);
1845 velvetFprintf(logFile, "Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)\n");
1846 velvetFprintf(logFile, "This is free software; see the source for copying conditions. There is NO\n");
1847 velvetFprintf(logFile, "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n");
1848 velvetFprintf(logFile, "Compilation settings:\n");
1849 velvetFprintf(logFile, "CATEGORIES = %i\n", CATEGORIES);
1850 velvetFprintf(logFile, "MAXKMERLENGTH = %i\n", MAXKMERLENGTH);
1851 #ifdef _OPENMP
1852 velvetFprintf(logFile, "OPENMP\n");
1853 #endif
1854 #ifdef LONGSEQUENCES
1855 velvetFprintf(logFile, "LONGSEQUENCES\n");
1856 #endif
1857 #ifdef BIGASSEMBLY
1858 velvetFprintf(logFile, "BIGASSEMBLY\n");
1859 #endif
1860 #ifdef COLOR
1861 velvetFprintf(logFile, "COLOR\n");
1862 #endif
1863 #ifdef DEBUG
1864 velvetFprintf(logFile, "DEBUG\n");
1865 #endif
1866 velvetFprintf(logFile, "\n");
1867
1868 fclose(logFile);
1869 free(logFilename);
1870 }
1871
destroyReadSet(ReadSet * reads)1872 void destroyReadSet(ReadSet * reads)
1873 {
1874 IDnum index;
1875
1876 if (reads == NULL)
1877 return;
1878
1879 if (reads->sequences != NULL)
1880 {
1881 for (index = 0; index < reads->readCount; index++)
1882 free(reads->sequences[index]);
1883 free(reads->sequences);
1884 }
1885
1886 if (reads->tSequences != NULL)
1887 free (reads->tSequences);
1888
1889 if (reads->tSeqMem != NULL)
1890 free (reads->tSeqMem);
1891
1892 if (reads->labels != NULL)
1893 for (index = 0; index < reads->readCount; index++)
1894 free(reads->labels[index]);
1895
1896 if (reads->confidenceScores != NULL)
1897 for (index = 0; index < reads->readCount; index++)
1898 free(reads->confidenceScores[index]);
1899
1900 if (reads->kmerProbabilities != NULL)
1901 for (index = 0; index < reads->readCount; index++)
1902 free(reads->kmerProbabilities[index]);
1903
1904 free(reads->labels);
1905 free(reads->confidenceScores);
1906 free(reads->kmerProbabilities);
1907 free(reads->mateReads);
1908 free(reads->categories);
1909 free(reads->secondInPair);
1910 free(reads);
1911 }
1912
getSequenceLengths(ReadSet * reads,int wordLength)1913 ShortLength *getSequenceLengths(ReadSet * reads, int wordLength)
1914 {
1915 ShortLength *lengths = callocOrExit(reads->readCount, ShortLength);
1916 ShortLength index;
1917 int lengthOffset = wordLength - 1;
1918
1919 for (index = 0; index < reads->readCount; index++)
1920 lengths[index] =
1921 getLength(getTightStringInArray(reads->tSequences, index)) - lengthOffset;
1922
1923 return lengths;
1924 }
1925