1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
4 This file is part of Velvet.
5
6 Velvet is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 Velvet is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Velvet; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <string.h>
24 #include <time.h>
25 #include <sys/time.h>
26
27 #ifdef _OPENMP
28 #include <omp.h>
29 #endif
30
31 #include "globals.h"
32 #include "readSet.h"
33 #include "splay.h"
34 #include "tightString.h"
35 #include "utility.h"
36 #include "kmer.h"
37 #include "kmerOccurenceTable.h"
38 #include "recycleBin.h"
39
40 typedef struct mask_st Mask;
41
42 struct mask_st {
43 Coordinate start;
44 Coordinate finish;
45 Mask* next;
46 };
47
48 static RecycleBin * maskMemory = NULL;
49
allocateMask()50 static Mask *allocateMask()
51 {
52 if (maskMemory == NULL)
53 maskMemory = newRecycleBin(sizeof(Mask), 10000);
54
55 return (Mask *) allocatePointer(maskMemory);
56 }
57
newMask(Coordinate position)58 static Mask * newMask(Coordinate position)
59 {
60 Mask * mask = allocateMask();
61 mask->start = position;
62 mask->finish = position;
63 mask->next = NULL;
64 return mask;
65 }
66
67 #define HASH_BUCKETS_NB 16777216
68
69 #ifdef _OPENMP
70
71 #define NB_PUSH 32
72 #define BUFFER_SIZE 4096
73
74 static StringBuffer **annotationBuffer = NULL;
75 static StringBuffer **annotationBufferW = NULL;
76 static int *nbPush = NULL;
77 static boolean producing = 1;
78
initAnnotationBuffers(void)79 static void initAnnotationBuffers(void)
80 {
81 int n;
82 int i;
83
84 n = omp_get_max_threads();
85 annotationBuffer = callocOrExit(n, StringBuffer*);
86 annotationBufferW = callocOrExit(n, StringBuffer*);
87 nbPush = callocOrExit(n, int);
88
89 for (i = 0; i < n; i++)
90 {
91 annotationBuffer[i] = newStringBuffer(BUFFER_SIZE);
92 annotationBufferW[i] = newStringBuffer(BUFFER_SIZE);
93 }
94 }
95
destroyAnnotationBuffers(void)96 static void destroyAnnotationBuffers(void)
97 {
98 int n;
99 int i;
100
101 n = omp_get_max_threads();
102
103 for (i = 0; i < n; i++)
104 {
105 destroyStringBuffer(annotationBuffer[i], 1);
106 destroyStringBuffer(annotationBufferW[i], 1);
107 }
108
109 free(annotationBuffer);
110 free(annotationBufferW);
111 free(nbPush);
112 annotationBuffer = NULL;
113 annotationBufferW = NULL;
114 nbPush = NULL;
115 }
116
pushBufferCommit(int thread)117 static void pushBufferCommit(int thread)
118 {
119 StringBuffer *tmp;
120 char *s;
121
122 s = annotationBufferW[thread]->str;
123 do
124 {
125 #pragma omp flush(s)
126 }
127 while (*s);
128 tmp = annotationBufferW[thread];
129 annotationBufferW[thread] = annotationBuffer[thread];
130 annotationBuffer[thread] = tmp;
131 tmp = annotationBufferW[thread];
132 #pragma omp flush(tmp)
133 }
134
pushBuffer(int thread)135 static void pushBuffer(int thread)
136 {
137 if (++nbPush[thread] == NB_PUSH)
138 {
139 nbPush[thread] = 0;
140 pushBufferCommit(thread);
141 }
142 }
143
writeBuffers(FILE * outFile,int nbThreads)144 static void writeBuffers(FILE *outFile, int nbThreads)
145 {
146 int i;
147
148 for (i = 0; i < nbThreads; i++)
149 {
150 StringBuffer *b;
151 char *s;
152
153 b = annotationBufferW[i];
154 #pragma omp flush(b)
155 s = b->str;
156 #pragma omp flush(s)
157 if (*s)
158 {
159 velvetFprintf(outFile, "%s", annotationBufferW[i]->str);
160 resetStringBuffer(annotationBufferW[i]);
161 }
162 }
163 }
164
bufferWritter(FILE * outFile)165 static void bufferWritter(FILE *outFile)
166 {
167 int n;
168
169 n = omp_get_max_threads();
170 #pragma omp flush(producing)
171 while (producing)
172 {
173 writeBuffers(outFile, n);
174 #pragma omp flush(producing)
175 }
176 writeBuffers(outFile, n);
177 }
178
appendLine(char * line,int thread)179 static void appendLine(char *line, int thread)
180 {
181 appendStringBuffer(annotationBuffer[thread], line);
182 }
183 #else
184
185 #define BUFFER_SIZE 1024
186
187 StringBuffer *annotationBuffer = NULL;
188
appendLine(char * line,int thread)189 static void appendLine(char *line, int thread)
190 {
191 appendStringBuffer(annotationBuffer, line);
192 }
193 #endif
194
195 struct splayTable_st {
196 SplayTree **table;
197 #ifdef _OPENMP
198 omp_lock_t *tableLocks;
199 #endif
200 KmerOccurenceTable *kmerOccurenceTable;
201 int WORDLENGTH;
202 boolean double_strand;
203 };
204
newSplayTable(int WORDLENGTH,boolean double_strand)205 SplayTable *newSplayTable(int WORDLENGTH, boolean double_strand)
206 {
207 SplayTable *splayTable = mallocOrExit(1, SplayTable);
208 splayTable->WORDLENGTH = WORDLENGTH;
209 splayTable->table = callocOrExit(HASH_BUCKETS_NB, SplayTree *);
210 splayTable->kmerOccurenceTable = NULL;
211 splayTable->double_strand = double_strand;
212 #ifdef _OPENMP
213 splayTable->tableLocks = mallocOrExit(HASH_BUCKETS_NB, omp_lock_t);
214 int i;
215 #pragma omp parallel for
216 for (i = 0; i < HASH_BUCKETS_NB; i++)
217 omp_init_lock(splayTable->tableLocks + i);
218 initSplayTreeMemory();
219 #endif
220 return splayTable;
221 }
222
destroySplayTable(SplayTable * splayTable)223 void destroySplayTable(SplayTable * splayTable)
224 {
225 velvetLog("Destroying splay table\n");
226
227 destroyAllSplayTrees();
228 free(splayTable->table);
229 destroyKmerOccurenceTable(splayTable->kmerOccurenceTable);
230 free(splayTable);
231
232 velvetLog("Splay table destroyed\n");
233 }
234
hash_kmer(Kmer * kmer)235 static KmerKey hash_kmer(Kmer * kmer)
236 {
237 #if KMER_LONGLONGS
238 KmerKey key = kmer->longlongs[0];
239
240 #if KMER_LONGLONGS > 1
241 key ^= kmer->longlongs[1];
242 #endif
243 #if KMER_LONGLONGS > 2
244 key ^= kmer->longlongs[2];
245 #endif
246
247 key = (~key) + (key << 21);
248 key = key ^ (key >> 24);
249 key = (key + (key << 3)) + (key << 8);
250 key = key ^ (key >> 14);
251 key = (key + (key << 2)) + (key << 4);
252 key = key ^ (key >> 28);
253 key = key + (key << 31);
254
255 return key % HASH_BUCKETS_NB;
256 #elif KMER_LONGS
257 KmerKey key = kmer->longs;
258
259 key += ~(key << 15);
260 key ^= (key >> 10);
261 key += (key << 3);
262 key ^= (key >> 6);
263 key += ~(key << 11);
264 key ^= (key >> 16);
265
266 return key % HASH_BUCKETS_NB;
267
268 #elif KMER_INTS
269 return kmer->ints % HASH_BUCKETS_NB;
270 #elif KMER_CHARS
271 return kmer->chars % HASH_BUCKETS_NB;
272 #endif
273 }
274
getNearestHSPIndex(Coordinate position,IDnum * sequenceIDs,Coordinate sequenceLength)275 static Coordinate getNearestHSPIndex(Coordinate position, IDnum * sequenceIDs, Coordinate sequenceLength) {
276 Coordinate back_offset = -1;
277 Coordinate front_offset = -1;
278
279 for (back_offset = 1; position - back_offset > 0; back_offset++)
280 if (sequenceIDs[position - back_offset])
281 break;
282
283 for (front_offset = 1; position + front_offset < sequenceLength; front_offset++)
284 if (sequenceIDs[position + front_offset])
285 break;
286
287 if (back_offset == position && position + front_offset == sequenceLength)
288 return -1;
289 else if (back_offset == position)
290 return position + front_offset;
291 else if (front_offset + position == sequenceLength)
292 return position - back_offset;
293 else
294 return back_offset < front_offset? position - back_offset : position + front_offset;
295 }
296
getMostAppropriateHit(Coordinate readCoord,Coordinate readLength,boolean direct,KmerOccurence * kmerOccurence,IDnum mapCount,IDnum * mapSequenceID,Coordinate * mapCoord,int wordLength)297 static KmerOccurence * getMostAppropriateHit(Coordinate readCoord, Coordinate readLength, boolean direct, KmerOccurence * kmerOccurence, IDnum mapCount, IDnum * mapSequenceID, Coordinate * mapCoord, int wordLength) {
298 KmerOccurence * current;
299 KmerOccurence * best = NULL;
300 Coordinate expectedPosition;
301 Coordinate positionError;
302 IDnum mapIndex;
303
304 // If only one hit
305 if (!getNextKmerOccurence(kmerOccurence))
306 return kmerOccurence;
307
308 // If multiple hits by unmapped read
309 if (mapCount == 0)
310 return NULL;
311
312 // Compare cases
313 for (current = kmerOccurence; current; current = getNextKmerOccurence(current)) {
314 for (mapIndex = 0; mapIndex < mapCount; mapIndex++) {
315
316 // If wrong sequence or unconsistent orientation
317 if ((direct && getKmerOccurenceNodeID(current) != mapSequenceID[mapIndex])
318 || (!direct && getKmerOccurenceNodeID(current) != -mapSequenceID[mapIndex]))
319 continue;
320
321 // Compute where it is supposed to land on reference
322 if (mapSequenceID[mapIndex] < 0)
323 expectedPosition = mapCoord[mapIndex] + readLength - readCoord - 1;
324 else
325 expectedPosition = mapCoord[mapIndex] + readCoord - wordLength + 1;
326
327 // Compute positional error
328 positionError = getKmerOccurencePosition(current) - expectedPosition;
329
330 // If potential hit record
331 if (positionError < 1 && positionError > -1) {
332 if (best)
333 // If competing hit, give up
334 return NULL;
335 else
336 // Record current hit
337 best = current;
338 }
339 }
340 }
341
342 return best;
343 }
344
345 static inline boolean
doFindOrInsertOccurenceInSplayTree(Kmer * kmer,IDnum * seqID,Coordinate * position,SplayTable * table)346 doFindOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
347 Coordinate * position, SplayTable *table)
348 {
349 #ifdef _OPENMP
350 const KmerKey kmerHash = hash_kmer(kmer);
351 boolean ret;
352
353 omp_set_lock(table->tableLocks + kmerHash);
354 ret = findOrInsertOccurenceInSplayTree(kmer, seqID, position,
355 table->table + kmerHash);
356 omp_unset_lock(table->tableLocks + kmerHash);
357
358 return ret;
359 #else
360 return findOrInsertOccurenceInSplayTree(kmer, seqID, position,
361 &table->table[hash_kmer(kmer)]);
362 #endif
363 }
364
365
findOrInsertOccurenceInSplayTable(Kmer * kmer,IDnum * seqID,Coordinate * position,SplayTable * table,IDnum * sequenceIDs,Coordinate * coords,Coordinate readIndex,Coordinate readLength,boolean direct)366 static boolean findOrInsertOccurenceInSplayTable(Kmer * kmer, IDnum * seqID,
367 Coordinate * position,
368 SplayTable * table, IDnum * sequenceIDs,
369 Coordinate * coords, Coordinate readIndex, Coordinate readLength, boolean direct)
370 {
371 KmerOccurence * hit;
372 Coordinate HSPIndex;
373
374 // Check if previous anchor
375 if (sequenceIDs && sequenceIDs[readIndex]) {
376 if (direct)
377 *seqID = sequenceIDs[readIndex];
378 else
379 *seqID = -sequenceIDs[readIndex];
380 if (sequenceIDs[readIndex] > 0)
381 *position = coords[readIndex] + readIndex;
382 else
383 *position = coords[readIndex] - readIndex + readLength - 1;
384
385 return true;
386 }
387 else if (coords && coords[readIndex])
388 // If in buffer zone:
389 return doFindOrInsertOccurenceInSplayTree(kmer, seqID, position, table);
390
391 // Look up first in reference sequence k-mers
392 if (table->kmerOccurenceTable
393 && (hit = findKmerInKmerOccurenceTable(kmer, table->kmerOccurenceTable))) {
394 if (!getNextKmerOccurence(hit)) {
395 *seqID = getKmerOccurenceNodeID(hit);
396 *position = getKmerOccurencePosition(hit);
397 return true;
398 } else if ((HSPIndex = getNearestHSPIndex(*position, sequenceIDs, readLength)) > 0) {
399 hit = getMostAppropriateHit(readIndex, readLength, direct, hit, 1, &(sequenceIDs[HSPIndex]), &(coords[HSPIndex]), table->WORDLENGTH);
400 if (hit) {
401 *seqID = getKmerOccurenceNodeID(hit);
402 *position = getKmerOccurencePosition(hit);
403 return true;
404 }
405
406 }
407 }
408
409 // If not, go through the novel k-mers
410 return doFindOrInsertOccurenceInSplayTree(kmer, seqID, position, table);
411 }
412
printAnnotations(IDnum * sequenceIDs,Coordinate * coords,TightString * array,SplayTable * table,FILE * file,boolean second_in_pair,IDnum seqID)413 static void printAnnotations(IDnum *sequenceIDs, Coordinate * coords,
414 TightString * array, SplayTable * table,
415 FILE * file, boolean second_in_pair, IDnum seqID)
416 {
417 Coordinate readNucleotideIndex = 0;
418 Coordinate writeNucleotideIndex = 0;
419 Kmer word;
420 Kmer antiWord;
421 boolean annotationClosed = true;
422 IDnum sequenceID;
423 Coordinate coord;
424 boolean found;
425 Coordinate position = 0;
426 Coordinate start = 0;
427 Coordinate finish = 0;
428 IDnum referenceSequenceID = 0;
429 Nucleotide nucleotide;
430 char lineBuffer[MAXLINE];
431 TightString * tString = getTightStringInArray(array, seqID - 1);
432 int thread = 0;
433
434 clearKmer(&word);
435 clearKmer(&antiWord);
436
437 #ifdef _OPENMP
438 thread = omp_get_thread_num();
439 #endif
440
441 sprintf(lineBuffer, "ROADMAP %li\n", (long)seqID);
442 appendLine(lineBuffer, thread);
443
444 // Neglect any string shorter than WORDLENGTH :
445 if (getLength(tString) < table->WORDLENGTH) {
446 #ifdef _OPENMP
447 pushBuffer(thread);
448 #else
449 velvetFprintf(file, "%s", annotationBuffer->str);
450 resetStringBuffer(annotationBuffer);
451 #endif
452 return;
453 }
454
455 // Fill in the initial word :
456 for (readNucleotideIndex = 0;
457 readNucleotideIndex < table->WORDLENGTH - 1;
458 readNucleotideIndex++) {
459 nucleotide = getNucleotide(readNucleotideIndex, tString);
460 pushNucleotide(&word, nucleotide);
461 #ifdef COLOR
462 reversePushNucleotide(&antiWord, nucleotide);
463 #else
464 reversePushNucleotide(&antiWord, 3 - nucleotide);
465 #endif
466 }
467
468 while (readNucleotideIndex < getLength(tString)) {
469 // Shift word:
470 nucleotide = getNucleotide(readNucleotideIndex, tString);
471 pushNucleotide(&word, nucleotide);
472 #ifdef COLOR
473 reversePushNucleotide(&antiWord, nucleotide);
474 #else
475 reversePushNucleotide(&antiWord, 3 - nucleotide);
476 #endif
477
478 sequenceID = seqID;
479 coord = writeNucleotideIndex;
480
481 if (table->double_strand) {
482 if (compareKmers(&word, &antiWord) <= 0) {
483 found =
484 findOrInsertOccurenceInSplayTable(&word,
485 &sequenceID,
486 &coord,
487 table,
488 sequenceIDs,
489 coords,
490 readNucleotideIndex,
491 getLength(tString),
492 true);
493 } else {
494 sequenceID = -sequenceID;
495 found =
496 findOrInsertOccurenceInSplayTable(&antiWord,
497 &sequenceID,
498 &coord,
499 table,
500 sequenceIDs,
501 coords,
502 readNucleotideIndex,
503 getLength(tString),
504 false);
505 sequenceID = -sequenceID;
506 if( sequenceID!=seqID && sequenceID<0 ){
507 printf("*** sequenceID = %d\n", sequenceID);
508 exit(-1);
509 }
510 }
511 } else {
512 if (!second_in_pair) {
513 found =
514 findOrInsertOccurenceInSplayTable(&word,
515 &sequenceID,
516 &coord,
517 table,
518 sequenceIDs,
519 coords,
520 readNucleotideIndex,
521 getLength(tString),
522 true);
523 } else {
524 sequenceID = -sequenceID;
525 found =
526 findOrInsertOccurenceInSplayTable(&antiWord,
527 &sequenceID,
528 &coord,
529 table,
530 sequenceIDs,
531 coords,
532 readNucleotideIndex,
533 getLength(tString),
534 false);
535 sequenceID = -sequenceID;
536 }
537 }
538
539 if (!found) {
540 writeNucleotideIndex++;
541 if (!annotationClosed) {
542 sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
543 (long) referenceSequenceID, (long long) position,
544 (long long) start, (long long) finish);
545 appendLine(lineBuffer, thread);
546 }
547 annotationClosed = true;
548 }
549 // Otherwise create/complete annotation:
550 else {
551 // Forbidden k-mer
552 if (sequenceID == 0) {
553 break;
554 }
555 // Closed/inexistant annotation
556 else if (annotationClosed) {
557 referenceSequenceID = sequenceID;
558 position = writeNucleotideIndex;
559 start = finish = coord;
560
561 if (referenceSequenceID > 0)
562 finish++;
563 else
564 finish--;
565
566 annotationClosed = false;
567 }
568 // Open annotation
569 else if (sequenceID == referenceSequenceID
570 && coord == finish) {
571 if (referenceSequenceID > 0)
572 finish++;
573 else
574 finish--;
575 }
576 // Previous non corresponding annotation
577 else {
578 sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
579 (long) referenceSequenceID, (long long) position,
580 (long long) start, (long long) finish);
581 appendLine(lineBuffer, thread);
582
583 referenceSequenceID = sequenceID;
584 position = writeNucleotideIndex;
585 start = finish = coord;
586
587 if (referenceSequenceID > 0)
588 finish++;
589 else
590 finish--;
591 }
592 }
593
594 readNucleotideIndex++;
595 }
596
597 if (!annotationClosed) {
598 sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
599 (long) referenceSequenceID, (long long) position,
600 (long long) start, (long long) finish);
601 appendLine(lineBuffer, thread);
602 }
603 #ifdef _OPENMP
604 pushBuffer(thread);
605 #else
606 velvetFprintf(file, "%s", annotationBuffer->str);
607 resetStringBuffer(annotationBuffer);
608 #endif
609 return;
610 }
611
computeClearHSPs(TightString * array,FILE * seqFile,boolean second_in_pair,SplayTable * table,IDnum ** sequenceIDs,Coordinate ** coords,IDnum seqID)612 static void computeClearHSPs(TightString * array, FILE * seqFile, boolean second_in_pair, SplayTable * table, IDnum ** sequenceIDs, Coordinate ** coords, IDnum seqID) {
613 Coordinate readNucleotideIndex = 0;
614 Kmer word;
615 Kmer antiWord;
616 Kmer polyA;
617 Nucleotide nucleotide;
618 KmerOccurence * hit;
619 char line[MAXLINE];
620 char* start;
621 char c;
622
623 Coordinate mapCount = 0;
624 Coordinate maxCount = 10;
625 IDnum * mapReferenceIDs = callocOrExit(maxCount, IDnum);
626 Coordinate * mapCoords = callocOrExit(maxCount, Coordinate);
627 long long_var;
628 long long longlong_var;
629 int penalty;
630 TightString * tString;
631 Coordinate length;
632
633 clearKmer(&polyA);
634
635 #ifdef _OPENMP
636 #pragma omp critical
637 {
638 #endif
639 // Get read ID:
640 if (!fgets(line, MAXLINE, seqFile)) {
641 puts("Incomplete Sequences file (computeHSPScores)");
642 #ifdef DEBUG
643 abort();
644 #endif
645 exit(1);
646 }
647 start = strchr(line, '\t');
648 tString = getTightStringInArray(array, seqID - 1);
649 length = getLength(tString);
650 *sequenceIDs = callocOrExit(length, IDnum);
651 *coords = callocOrExit(length, Coordinate);
652
653 // Parse file for mapping info
654 while (seqFile && (c = getc(seqFile)) != EOF) {
655 if (c == '>')
656 break;
657
658 fgets(line, MAXLINE, seqFile);
659
660 if (c == 'M') {
661 sscanf(line,"\t%li\t%lli\n", &long_var, &longlong_var);
662 mapReferenceIDs[mapCount] = (IDnum) long_var;
663 mapCoords[mapCount] = (Coordinate) longlong_var;
664
665 if (++mapCount == maxCount) {
666 maxCount *= 2;
667 mapReferenceIDs = reallocOrExit(mapReferenceIDs, maxCount, IDnum);
668 mapCoords = reallocOrExit(mapCoords, maxCount, Coordinate);
669 }
670 }
671 }
672 #ifdef _OPENMP
673 }
674 #endif
675
676 // First pass for unambiguous hits
677 // Fill in the initial word :
678 clearKmer(&word);
679 clearKmer(&antiWord);
680 for (readNucleotideIndex = 0;
681 readNucleotideIndex < table->WORDLENGTH - 1;
682 readNucleotideIndex++) {
683 nucleotide = getNucleotide(readNucleotideIndex, tString);
684 pushNucleotide(&word, nucleotide);
685 #ifdef COLOR
686 reversePushNucleotide(&antiWord, nucleotide);
687 #else
688 reversePushNucleotide(&antiWord, 3 - nucleotide);
689 #endif
690 }
691
692 // Kill silly poly-T beginnings
693 while (readNucleotideIndex < getLength(tString) && (compareKmers(&antiWord, &polyA) == 0 || compareKmers(&word, &polyA) == 0)) {
694 nucleotide = getNucleotide(readNucleotideIndex++, tString);
695 pushNucleotide(&word, nucleotide);
696 #ifdef COLOR
697 reversePushNucleotide(&antiWord, nucleotide);
698 #else
699 reversePushNucleotide(&antiWord, 3 - nucleotide);
700 #endif
701 }
702
703 while (readNucleotideIndex < getLength(tString)) {
704 // Shift word:
705 nucleotide = getNucleotide(readNucleotideIndex, tString);
706 pushNucleotide(&word, nucleotide);
707
708 #ifdef COLOR
709 reversePushNucleotide(&antiWord, nucleotide);
710 #else
711 reversePushNucleotide(&antiWord, 3 - nucleotide);
712 #endif
713
714 if (table->double_strand) {
715 if (compareKmers(&word, &antiWord) <= 0) {
716 hit = findKmerInKmerOccurenceTable(&word, table->kmerOccurenceTable);
717
718 if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), true, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
719 (*sequenceIDs)[readNucleotideIndex] = getKmerOccurenceNodeID(hit);
720 } else {
721 hit = findKmerInKmerOccurenceTable(&antiWord, table->kmerOccurenceTable);
722
723 if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), false, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
724 (*sequenceIDs)[readNucleotideIndex] = -getKmerOccurenceNodeID(hit);
725 }
726 } else {
727 if (!second_in_pair) {
728 hit = findKmerInKmerOccurenceTable(&word, table->kmerOccurenceTable);
729
730 if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), true, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
731 (*sequenceIDs)[readNucleotideIndex] = getKmerOccurenceNodeID(hit);
732 } else {
733 hit = findKmerInKmerOccurenceTable(&antiWord, table->kmerOccurenceTable);
734
735 if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), false, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
736 (*sequenceIDs)[readNucleotideIndex] = -getKmerOccurenceNodeID(hit);
737 }
738 }
739
740 if ((*sequenceIDs)[readNucleotideIndex]) {
741 if ((*sequenceIDs)[readNucleotideIndex] > 0)
742 (*coords)[readNucleotideIndex] = getKmerOccurencePosition(hit) - readNucleotideIndex;
743 else
744 (*coords)[readNucleotideIndex] = getKmerOccurencePosition(hit) + readNucleotideIndex - getLength(tString) + 1;
745 }
746
747 // Barrier to flip-flopping
748 if ((*sequenceIDs)[readNucleotideIndex - 1] != 0
749 && ((*sequenceIDs)[readNucleotideIndex] != (*sequenceIDs)[readNucleotideIndex - 1]
750 || (*coords)[readNucleotideIndex] != (*coords)[readNucleotideIndex - 1])) {
751 // Break in continuity... skip k positions
752 (*sequenceIDs)[readNucleotideIndex] = 0;
753 (*coords)[readNucleotideIndex] = -1;
754 readNucleotideIndex++;
755
756 for (penalty = 0; penalty < table->WORDLENGTH - 1 && readNucleotideIndex < getLength(tString); penalty++) {
757 nucleotide = getNucleotide(readNucleotideIndex, tString);
758 pushNucleotide(&word, nucleotide);
759
760 #ifdef COLOR
761 reversePushNucleotide(&antiWord, nucleotide);
762 #else
763 reversePushNucleotide(&antiWord, 3 - nucleotide);
764 #endif
765 (*sequenceIDs)[readNucleotideIndex] = 0;
766 (*coords)[readNucleotideIndex] = -1;
767 readNucleotideIndex++;
768 }
769 } else
770 readNucleotideIndex++;
771
772 }
773
774 free(mapReferenceIDs);
775 free(mapCoords);
776 }
777
inputSequenceIntoSplayTable(TightString * array,SplayTable * table,FILE * file,FILE * seqFile,boolean second_in_pair,IDnum seqID)778 void inputSequenceIntoSplayTable(TightString * array,
779 SplayTable * table,
780 FILE * file, FILE * seqFile,
781 boolean second_in_pair,
782 IDnum seqID)
783 {
784 IDnum * sequenceIDs = NULL;
785 Coordinate * coords = NULL;
786
787 // If appropriate, get the HSPs on reference sequences
788 if (table->kmerOccurenceTable)
789 computeClearHSPs(array, seqFile, second_in_pair, table, &sequenceIDs, &coords, seqID);
790
791 // Go through read, eventually with annotations
792 printAnnotations(sequenceIDs, coords, array, table, file, second_in_pair, seqID);
793
794 // Clean up
795 if (sequenceIDs) {
796 free(sequenceIDs);
797 free(coords);
798 }
799 }
800
inputReferenceIntoSplayTable(TightString * tString,SplayTable * table,FILE * file,IDnum seqID,Mask * mask)801 void inputReferenceIntoSplayTable(TightString * tString,
802 SplayTable * table, FILE * file, IDnum seqID, Mask * mask)
803 {
804 IDnum currentIndex;
805 Coordinate readNucleotideIndex = 0;
806 Coordinate kmerIndex = 0;
807 Kmer word;
808 Kmer antiWord;
809 Nucleotide nucleotide;
810 Mask * currentMask = mask;
811 #ifdef _OPENMP
812 char lineBuffer[MAXLINE];
813 #endif
814
815 clearKmer(&word);
816 clearKmer(&antiWord);
817
818 currentIndex = seqID;
819 #ifdef _OPENMP
820 sprintf(lineBuffer, "ROADMAP %li\n", (long)currentIndex);
821 appendLine(lineBuffer, omp_get_thread_num());
822 #else
823 velvetFprintf(file, "ROADMAP %li\n", (long)currentIndex);
824 #endif
825
826 // Neglect any string shorter than WORDLENGTH :
827 if (getLength(tString) < table->WORDLENGTH) {
828 return;
829 }
830
831 // Fill in the initial word :
832 for (readNucleotideIndex = 0;
833 readNucleotideIndex < table->WORDLENGTH - 1;
834 readNucleotideIndex++) {
835 nucleotide = getNucleotide(readNucleotideIndex, tString);
836 pushNucleotide(&word, nucleotide);
837 if (table->double_strand) {
838 #ifdef COLOR
839 reversePushNucleotide(&antiWord, nucleotide);
840 #else
841 reversePushNucleotide(&antiWord, 3 - nucleotide);
842 #endif
843 }
844 }
845
846 while (readNucleotideIndex < getLength(tString)) {
847 // Shift word:
848 nucleotide = getNucleotide(readNucleotideIndex, tString);
849 pushNucleotide(&word, nucleotide);
850
851 if (table->double_strand) {
852 #ifdef COLOR
853 reversePushNucleotide(&antiWord, nucleotide);
854 #else
855 reversePushNucleotide(&antiWord, 3 - nucleotide);
856 #endif
857 }
858
859 // Check for gap masks:
860 if (currentMask && currentMask->start - table->WORDLENGTH + 1 <= readNucleotideIndex) {
861 while(currentMask && currentMask->finish + table->WORDLENGTH - 1 < readNucleotideIndex)
862 currentMask = currentMask->next;
863
864 if (currentMask && currentMask->finish + table->WORDLENGTH - 1 >= readNucleotideIndex) {
865 readNucleotideIndex++;
866 kmerIndex++;
867 continue;
868 }
869 }
870
871 // Record k-mer
872 if (table->double_strand) {
873 if (compareKmers(&word, &antiWord) <= 0)
874 recordKmerOccurence(&word, currentIndex,
875 kmerIndex,
876 table->kmerOccurenceTable);
877 else
878 recordKmerOccurence(&antiWord, -currentIndex,
879 kmerIndex,
880 table->kmerOccurenceTable);
881 } else {
882 recordKmerOccurence(&word, currentIndex,
883 kmerIndex,
884 table->kmerOccurenceTable);
885 }
886 readNucleotideIndex++;
887 kmerIndex++;
888 }
889
890 return;
891 }
892
countReferenceKmers(ReadSet * reads,int wordLength)893 static Coordinate countReferenceKmers(ReadSet * reads, int wordLength) {
894 IDnum readIndex;
895 Coordinate length = 0;
896
897
898 for (readIndex = 0; readIndex < reads->readCount && reads->categories[readIndex] == REFERENCE; readIndex++)
899 {
900 Coordinate tmpLength = getLength(getTightStringInArray(reads->tSequences, readIndex));
901 if (tmpLength >= wordLength)
902 length += tmpLength - wordLength + 1;
903 }
904
905 return length;
906 }
907
scanReferenceSequences(FILE * file,IDnum referenceSequenceCount)908 Mask ** scanReferenceSequences(FILE * file, IDnum referenceSequenceCount) {
909 Mask ** referenceMasks = callocOrExit(referenceSequenceCount, Mask*);
910 IDnum index;
911 char line[MAXLINE];
912 char c;
913
914 // Search sequences for masks
915 for (index = 0; index < referenceSequenceCount; index++) {
916 Mask * current = NULL;
917 Coordinate position = 0;
918 boolean openMask = false;
919
920 // Read through header
921 fgets(line, MAXLINE, file);
922
923 // Read through sequence
924 while ((c = getc(file))) {
925 if (c == EOF || c == '>')
926 break;
927 else if (c == '\r' || c == '\n')
928 continue;
929 else if (c == 'n' || c == 'N') {
930 if (openMask)
931 current->finish++;
932 else if (referenceMasks[index] == NULL) {
933 referenceMasks[index] = newMask(position);
934 current = referenceMasks[index];
935 } else {
936 current->next = newMask(position);
937 current = current->next;
938 }
939 openMask = true;
940 position++;
941 } else {
942 openMask = false;
943 position++;
944 }
945 }
946 }
947
948 return referenceMasks;
949 }
950
inputSequenceArrayIntoSplayTableAndArchive(ReadSet * reads,SplayTable * table,char * filename,char * seqFilename)951 void inputSequenceArrayIntoSplayTableAndArchive(ReadSet * reads,
952 SplayTable * table,
953 char *filename, char* seqFilename)
954 {
955 IDnum index;
956 IDnum sequenceCount = reads->readCount;
957 TightString *array;
958 FILE *outfile = fopen(filename, "w");
959 FILE *seqFile = NULL;
960 IDnum kmerCount;
961 IDnum referenceSequenceCount = 0;
962 struct timeval start, end, diff;
963
964 // DEBUG
965 Mask ** referenceMasks;
966
967 if (outfile == NULL)
968 exitErrorf(EXIT_FAILURE, true, "Couldn't write to file %s", filename);
969 else
970 velvetLog("Writing into roadmap file %s...\n", filename);
971
972 // Count reference sequences
973 for (index = 0; index < reads->readCount && reads->categories[index] == REFERENCE; index++)
974 referenceSequenceCount++;
975
976 velvetFprintf(outfile, "%ld\t%ld\t%i\t%hi\n", (long) sequenceCount, (long) referenceSequenceCount, table->WORDLENGTH, (short) table->double_strand);
977
978 if (reads->tSequences == NULL)
979 convertSequences(reads);
980
981 gettimeofday(&start, NULL);
982 array = reads->tSequences;
983
984 #ifdef _OPENMP
985 if (omp_get_max_threads() == 1)
986 {
987 omp_set_num_threads(2);
988 omp_set_nested(0);
989 }
990 else
991 omp_set_nested(1);
992 initAnnotationBuffers();
993 #else
994 annotationBuffer = newStringBuffer(BUFFER_SIZE);
995 #endif
996
997 if (referenceSequenceCount && (kmerCount = countReferenceKmers(reads, table->WORDLENGTH)) > 0) {
998 table->kmerOccurenceTable = newKmerOccurenceTable(24 , table->WORDLENGTH);
999 allocateKmerOccurences(kmerCount, table->kmerOccurenceTable);
1000 seqFile = fopen(seqFilename, "r");
1001
1002 if (seqFile == NULL)
1003 exitErrorf(EXIT_FAILURE, true, "Couldn't write to file %s", seqFilename);
1004 else
1005 velvetLog("Reading mapping info from file %s\n", seqFilename);
1006
1007 // Skip through reference headers quickly
1008 referenceMasks = scanReferenceSequences(seqFile, referenceSequenceCount);
1009
1010 #ifdef _OPENMP
1011 producing = 1;
1012 #pragma omp parallel sections
1013 {
1014 #pragma omp section
1015 {
1016 bufferWritter(outfile);
1017 }
1018 #pragma omp section
1019 {
1020 #pragma omp parallel for
1021 #endif
1022 for (index = 0; index < referenceSequenceCount; index++)
1023 inputReferenceIntoSplayTable(getTightStringInArray(array, index),
1024 table, outfile, index + 1, referenceMasks[index]);
1025
1026 #ifdef _OPENMP
1027 for (index = omp_get_max_threads() - 1; index >= 0; index--)
1028 pushBufferCommit(index);
1029 producing = 0;
1030 #pragma omp flush(producing)
1031 }
1032 }
1033 #endif
1034
1035 if (maskMemory)
1036 destroyRecycleBin(maskMemory);
1037 sortKmerOccurenceTable(table->kmerOccurenceTable);
1038 }
1039
1040 velvetLog("Inputting sequences...\n");
1041
1042 #ifdef _OPENMP
1043 producing = 1;
1044 #pragma omp parallel sections
1045 {
1046 #pragma omp section
1047 {
1048 bufferWritter(outfile);
1049 }
1050 #pragma omp section
1051 {
1052 #pragma omp parallel for
1053 #endif
1054 for (index = referenceSequenceCount; index < sequenceCount; index++)
1055 {
1056 boolean second_in_pair;
1057
1058 // Progress report on screen
1059 if (index % 1000000 == 0) {
1060 velvetLog("Inputting sequence %li / %li\n",
1061 (long)index, (long)sequenceCount);
1062 fflush(stdout);
1063 }
1064
1065 // Test to make sure that all the reference reads are before all the other reads
1066 if (reads->categories[index] == REFERENCE) {
1067 velvetLog("Reference sequence placed after a non-reference read!\n");
1068 velvetLog(">> Please re-order the filenames in your command line so as "
1069 "to have the reference sequence files before all the others\n");
1070 #ifdef DEBUG
1071 abort();
1072 #endif
1073 exit(0);
1074 }
1075 second_in_pair = reads->categories[index] % 2 && isSecondInPair(reads, index);
1076
1077 // Hashing the reads
1078 inputSequenceIntoSplayTable(array, table,
1079 outfile, seqFile,
1080 second_in_pair, index + 1);
1081 }
1082 #ifdef _OPENMP
1083 for (index = omp_get_max_threads() - 1; index >= 0; index--)
1084 pushBufferCommit(index);
1085 producing = 0;
1086 #pragma omp flush(producing)
1087 }
1088 }
1089 destroyAnnotationBuffers();
1090 #else
1091 destroyStringBuffer(annotationBuffer, 1);
1092 #endif
1093
1094 gettimeofday(&end, NULL);
1095 timersub(&end, &start, &diff);
1096 velvetLog(" === Sequences loaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec);
1097
1098 fclose(outfile);
1099 if (seqFile)
1100 fclose(seqFile);
1101
1102 //free(reads->tSequences);
1103 //reads->tSequences = NULL;
1104 //destroyReadSet(reads);
1105 velvetLog("Done inputting sequences\n");
1106 }
1107