1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3 
4     This file is part of Velvet.
5 
6     Velvet is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10 
11     Velvet is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with Velvet; if not, write to the Free Software
18     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <string.h>
24 #include <time.h>
25 #include <sys/time.h>
26 
27 #ifdef _OPENMP
28 #include <omp.h>
29 #endif
30 
31 #include "globals.h"
32 #include "readSet.h"
33 #include "splay.h"
34 #include "tightString.h"
35 #include "utility.h"
36 #include "kmer.h"
37 #include "kmerOccurenceTable.h"
38 #include "recycleBin.h"
39 
40 typedef struct mask_st Mask;
41 
42 struct mask_st {
43 	Coordinate start;
44 	Coordinate finish;
45 	Mask* next;
46 };
47 
48 static RecycleBin * maskMemory = NULL;
49 
allocateMask()50 static Mask *allocateMask()
51 {
52 	if (maskMemory == NULL)
53 		maskMemory = newRecycleBin(sizeof(Mask), 10000);
54 
55 	return (Mask *) allocatePointer(maskMemory);
56 }
57 
newMask(Coordinate position)58 static Mask * newMask(Coordinate position)
59 {
60 	Mask * mask = allocateMask();
61 	mask->start = position;
62 	mask->finish = position;
63 	mask->next = NULL;
64 	return mask;
65 }
66 
67 #define HASH_BUCKETS_NB 16777216
68 
69 #ifdef _OPENMP
70 
71 #define NB_PUSH 32
72 #define BUFFER_SIZE 4096
73 
74 static StringBuffer **annotationBuffer = NULL;
75 static StringBuffer **annotationBufferW = NULL;
76 static int *nbPush = NULL;
77 static boolean producing = 1;
78 
initAnnotationBuffers(void)79 static void initAnnotationBuffers(void)
80 {
81 	int n;
82 	int i;
83 
84 	n = omp_get_max_threads();
85 	annotationBuffer = callocOrExit(n, StringBuffer*);
86 	annotationBufferW = callocOrExit(n, StringBuffer*);
87 	nbPush = callocOrExit(n, int);
88 
89 	for (i = 0; i < n; i++)
90 	{
91 		annotationBuffer[i] = newStringBuffer(BUFFER_SIZE);
92 		annotationBufferW[i] = newStringBuffer(BUFFER_SIZE);
93 	}
94 }
95 
destroyAnnotationBuffers(void)96 static void destroyAnnotationBuffers(void)
97 {
98 	int n;
99 	int i;
100 
101 	n = omp_get_max_threads();
102 
103 	for (i = 0; i < n; i++)
104 	{
105 		destroyStringBuffer(annotationBuffer[i], 1);
106 		destroyStringBuffer(annotationBufferW[i], 1);
107 	}
108 
109 	free(annotationBuffer);
110 	free(annotationBufferW);
111 	free(nbPush);
112 	annotationBuffer = NULL;
113 	annotationBufferW = NULL;
114 	nbPush = NULL;
115 }
116 
pushBufferCommit(int thread)117 static void pushBufferCommit(int thread)
118 {
119 	StringBuffer *tmp;
120 	char *s;
121 
122 	s = annotationBufferW[thread]->str;
123 	do
124 	{
125 		#pragma omp flush(s)
126 	}
127 	while (*s);
128 	tmp = annotationBufferW[thread];
129 	annotationBufferW[thread] = annotationBuffer[thread];
130 	annotationBuffer[thread] = tmp;
131 	tmp = annotationBufferW[thread];
132 	#pragma omp flush(tmp)
133 }
134 
pushBuffer(int thread)135 static void pushBuffer(int thread)
136 {
137 	if (++nbPush[thread] == NB_PUSH)
138 	{
139 		nbPush[thread] = 0;
140 		pushBufferCommit(thread);
141 	}
142 }
143 
writeBuffers(FILE * outFile,int nbThreads)144 static void writeBuffers(FILE *outFile, int nbThreads)
145 {
146 	int i;
147 
148 	for (i = 0; i < nbThreads; i++)
149 	{
150 		StringBuffer *b;
151 		char *s;
152 
153 		b = annotationBufferW[i];
154 		#pragma omp flush(b)
155 		s = b->str;
156 		#pragma omp flush(s)
157 		if (*s)
158 		{
159 			velvetFprintf(outFile, "%s", annotationBufferW[i]->str);
160 			resetStringBuffer(annotationBufferW[i]);
161 		}
162 	}
163 }
164 
bufferWritter(FILE * outFile)165 static void bufferWritter(FILE *outFile)
166 {
167 	int n;
168 
169 	n = omp_get_max_threads();
170 	#pragma omp flush(producing)
171 	while (producing)
172 	{
173 		writeBuffers(outFile, n);
174 		#pragma omp flush(producing)
175 	}
176 	writeBuffers(outFile, n);
177 }
178 
appendLine(char * line,int thread)179 static void appendLine(char *line, int thread)
180 {
181 	appendStringBuffer(annotationBuffer[thread], line);
182 }
183 #else
184 
185 #define BUFFER_SIZE 1024
186 
187 StringBuffer *annotationBuffer = NULL;
188 
appendLine(char * line,int thread)189 static void appendLine(char *line, int thread)
190 {
191 	appendStringBuffer(annotationBuffer, line);
192 }
193 #endif
194 
195 struct splayTable_st {
196 	SplayTree **table;
197 #ifdef _OPENMP
198 	omp_lock_t *tableLocks;
199 #endif
200 	KmerOccurenceTable *kmerOccurenceTable;
201 	int WORDLENGTH;
202 	boolean double_strand;
203 };
204 
newSplayTable(int WORDLENGTH,boolean double_strand)205 SplayTable *newSplayTable(int WORDLENGTH, boolean double_strand)
206 {
207 	SplayTable *splayTable = mallocOrExit(1, SplayTable);
208 	splayTable->WORDLENGTH = WORDLENGTH;
209 	splayTable->table = callocOrExit(HASH_BUCKETS_NB, SplayTree *);
210 	splayTable->kmerOccurenceTable = NULL;
211 	splayTable->double_strand = double_strand;
212 #ifdef _OPENMP
213 	splayTable->tableLocks = mallocOrExit(HASH_BUCKETS_NB, omp_lock_t);
214 	int i;
215 	#pragma omp parallel for
216 	for (i = 0; i < HASH_BUCKETS_NB; i++)
217 		omp_init_lock(splayTable->tableLocks + i);
218 	initSplayTreeMemory();
219 #endif
220 	return splayTable;
221 }
222 
destroySplayTable(SplayTable * splayTable)223 void destroySplayTable(SplayTable * splayTable)
224 {
225 	velvetLog("Destroying splay table\n");
226 
227 	destroyAllSplayTrees();
228 	free(splayTable->table);
229 	destroyKmerOccurenceTable(splayTable->kmerOccurenceTable);
230 	free(splayTable);
231 
232 	velvetLog("Splay table destroyed\n");
233 }
234 
hash_kmer(Kmer * kmer)235 static KmerKey hash_kmer(Kmer * kmer)
236 {
237 #if KMER_LONGLONGS
238 	KmerKey key = kmer->longlongs[0];
239 
240 #if KMER_LONGLONGS > 1
241 	key ^= kmer->longlongs[1];
242 #endif
243 #if KMER_LONGLONGS > 2
244 	key ^= kmer->longlongs[2];
245 #endif
246 
247 	key = (~key) + (key << 21);
248 	key = key ^ (key >> 24);
249 	key = (key + (key << 3)) + (key << 8);
250 	key = key ^ (key >> 14);
251 	key = (key + (key << 2)) + (key << 4);
252 	key = key ^ (key >> 28);
253 	key = key + (key << 31);
254 
255 	return key % HASH_BUCKETS_NB;
256 #elif KMER_LONGS
257 	KmerKey key = kmer->longs;
258 
259 	key += ~(key << 15);
260 	key ^= (key >> 10);
261 	key += (key << 3);
262 	key ^= (key >> 6);
263 	key += ~(key << 11);
264 	key ^= (key >> 16);
265 
266 	return key % HASH_BUCKETS_NB;
267 
268 #elif KMER_INTS
269 	return kmer->ints % HASH_BUCKETS_NB;
270 #elif KMER_CHARS
271 	return kmer->chars % HASH_BUCKETS_NB;
272 #endif
273 }
274 
getNearestHSPIndex(Coordinate position,IDnum * sequenceIDs,Coordinate sequenceLength)275 static Coordinate getNearestHSPIndex(Coordinate position, IDnum * sequenceIDs, Coordinate sequenceLength) {
276 	Coordinate back_offset = -1;
277 	Coordinate front_offset = -1;
278 
279 	for (back_offset = 1; position - back_offset > 0; back_offset++)
280 		if (sequenceIDs[position - back_offset])
281 			break;
282 
283 	for (front_offset = 1; position + front_offset < sequenceLength; front_offset++)
284 		if (sequenceIDs[position + front_offset])
285 			break;
286 
287 	if (back_offset == position && position + front_offset == sequenceLength)
288 		return -1;
289 	else if (back_offset == position)
290 		return position + front_offset;
291 	else if (front_offset + position == sequenceLength)
292 		return position - back_offset;
293 	else
294 		return back_offset < front_offset? position - back_offset : position + front_offset;
295 }
296 
getMostAppropriateHit(Coordinate readCoord,Coordinate readLength,boolean direct,KmerOccurence * kmerOccurence,IDnum mapCount,IDnum * mapSequenceID,Coordinate * mapCoord,int wordLength)297 static KmerOccurence * getMostAppropriateHit(Coordinate readCoord, Coordinate readLength, boolean direct, KmerOccurence * kmerOccurence, IDnum mapCount, IDnum * mapSequenceID, Coordinate * mapCoord, int wordLength) {
298 	KmerOccurence * current;
299 	KmerOccurence * best = NULL;
300 	Coordinate expectedPosition;
301 	Coordinate positionError;
302 	IDnum mapIndex;
303 
304 	// If only one hit
305 	if (!getNextKmerOccurence(kmerOccurence))
306 		return kmerOccurence;
307 
308 	// If multiple hits by unmapped read
309 	if (mapCount == 0)
310 		return NULL;
311 
312 	// Compare cases
313 	for (current = kmerOccurence; current; current = getNextKmerOccurence(current)) {
314 		for (mapIndex = 0; mapIndex < mapCount; mapIndex++) {
315 
316 			// If wrong sequence or unconsistent orientation
317 			if ((direct && getKmerOccurenceNodeID(current) != mapSequenceID[mapIndex])
318 			    || (!direct && getKmerOccurenceNodeID(current) != -mapSequenceID[mapIndex]))
319 				continue;
320 
321 			// Compute where it is supposed to land on reference
322 			if (mapSequenceID[mapIndex] < 0)
323 				expectedPosition = mapCoord[mapIndex] + readLength - readCoord - 1;
324 			else
325 				expectedPosition = mapCoord[mapIndex] + readCoord - wordLength + 1;
326 
327 			// Compute positional error
328 			positionError = getKmerOccurencePosition(current) - expectedPosition;
329 
330 			// If potential hit record
331 			if (positionError < 1 && positionError > -1) {
332 				if (best)
333 					// If competing hit, give up
334 					return NULL;
335 				else
336 					// Record current hit
337 					best = current;
338 			}
339 		}
340 	}
341 
342 	return best;
343 }
344 
345 static inline boolean
doFindOrInsertOccurenceInSplayTree(Kmer * kmer,IDnum * seqID,Coordinate * position,SplayTable * table)346 doFindOrInsertOccurenceInSplayTree(Kmer * kmer, IDnum * seqID,
347 				   Coordinate * position, SplayTable *table)
348 {
349 #ifdef _OPENMP
350 	const KmerKey kmerHash = hash_kmer(kmer);
351 	boolean ret;
352 
353 	omp_set_lock(table->tableLocks + kmerHash);
354 	ret =  findOrInsertOccurenceInSplayTree(kmer, seqID, position,
355 						table->table + kmerHash);
356 	omp_unset_lock(table->tableLocks + kmerHash);
357 
358 	return ret;
359 #else
360 	return findOrInsertOccurenceInSplayTree(kmer, seqID, position,
361 						&table->table[hash_kmer(kmer)]);
362 #endif
363 }
364 
365 
findOrInsertOccurenceInSplayTable(Kmer * kmer,IDnum * seqID,Coordinate * position,SplayTable * table,IDnum * sequenceIDs,Coordinate * coords,Coordinate readIndex,Coordinate readLength,boolean direct)366 static boolean findOrInsertOccurenceInSplayTable(Kmer * kmer, IDnum * seqID,
367 					      Coordinate * position,
368 					      SplayTable * table, IDnum * sequenceIDs,
369 						 Coordinate * coords, Coordinate readIndex, Coordinate readLength, boolean direct)
370 {
371 	KmerOccurence * hit;
372 	Coordinate HSPIndex;
373 
374 	// Check if previous anchor
375 	if (sequenceIDs && sequenceIDs[readIndex]) {
376 		if (direct)
377 			*seqID = sequenceIDs[readIndex];
378 		else
379 			*seqID = -sequenceIDs[readIndex];
380 		if (sequenceIDs[readIndex] > 0)
381 			*position = coords[readIndex] + readIndex;
382 		else
383 			*position = coords[readIndex] - readIndex + readLength - 1;
384 
385 		return true;
386 	}
387 	else if (coords && coords[readIndex])
388 		// If in buffer zone:
389 		return doFindOrInsertOccurenceInSplayTree(kmer, seqID, position, table);
390 
391 	// Look up first in reference sequence k-mers
392 	if (table->kmerOccurenceTable
393 	    && (hit = findKmerInKmerOccurenceTable(kmer, table->kmerOccurenceTable))) {
394 		if (!getNextKmerOccurence(hit)) {
395 			*seqID = getKmerOccurenceNodeID(hit);
396 			*position = getKmerOccurencePosition(hit);
397 			return true;
398 		} else if ((HSPIndex = getNearestHSPIndex(*position, sequenceIDs, readLength)) > 0) {
399 	    		hit = getMostAppropriateHit(readIndex, readLength, direct, hit, 1, &(sequenceIDs[HSPIndex]), &(coords[HSPIndex]), table->WORDLENGTH);
400 			if (hit) {
401 				*seqID = getKmerOccurenceNodeID(hit);
402 				*position = getKmerOccurencePosition(hit);
403 				return true;
404 			}
405 
406 		}
407 	}
408 
409 	// If not, go through the novel k-mers
410 	return doFindOrInsertOccurenceInSplayTree(kmer, seqID, position, table);
411 }
412 
printAnnotations(IDnum * sequenceIDs,Coordinate * coords,TightString * array,SplayTable * table,FILE * file,boolean second_in_pair,IDnum seqID)413 static void printAnnotations(IDnum *sequenceIDs, Coordinate * coords,
414 			     TightString * array, SplayTable * table,
415 			     FILE * file, boolean second_in_pair, IDnum seqID)
416 {
417 	Coordinate readNucleotideIndex = 0;
418 	Coordinate writeNucleotideIndex = 0;
419 	Kmer word;
420 	Kmer antiWord;
421 	boolean annotationClosed = true;
422 	IDnum sequenceID;
423 	Coordinate coord;
424 	boolean found;
425 	Coordinate position = 0;
426 	Coordinate start = 0;
427 	Coordinate finish = 0;
428 	IDnum referenceSequenceID = 0;
429 	Nucleotide nucleotide;
430 	char lineBuffer[MAXLINE];
431 	TightString * tString = getTightStringInArray(array, seqID - 1);
432 	int thread = 0;
433 
434 	clearKmer(&word);
435 	clearKmer(&antiWord);
436 
437 #ifdef _OPENMP
438 	thread = omp_get_thread_num();
439 #endif
440 
441 	sprintf(lineBuffer, "ROADMAP %li\n", (long)seqID);
442 	appendLine(lineBuffer, thread);
443 
444 	// Neglect any string shorter than WORDLENGTH :
445 	if (getLength(tString) < table->WORDLENGTH) {
446 #ifdef _OPENMP
447 		pushBuffer(thread);
448 #else
449 		velvetFprintf(file, "%s", annotationBuffer->str);
450 		resetStringBuffer(annotationBuffer);
451 #endif
452 		return;
453 	}
454 
455 	// Fill in the initial word :
456 	for (readNucleotideIndex = 0;
457 	     readNucleotideIndex < table->WORDLENGTH - 1;
458 	     readNucleotideIndex++) {
459 		nucleotide = getNucleotide(readNucleotideIndex, tString);
460 		pushNucleotide(&word, nucleotide);
461 #ifdef COLOR
462 		reversePushNucleotide(&antiWord, nucleotide);
463 #else
464 		reversePushNucleotide(&antiWord, 3 - nucleotide);
465 #endif
466 	}
467 
468 	while (readNucleotideIndex < getLength(tString)) {
469 		// Shift word:
470 		nucleotide = getNucleotide(readNucleotideIndex, tString);
471 		pushNucleotide(&word, nucleotide);
472 #ifdef COLOR
473 		reversePushNucleotide(&antiWord, nucleotide);
474 #else
475 		reversePushNucleotide(&antiWord, 3 - nucleotide);
476 #endif
477 
478 		sequenceID = seqID;
479 		coord = writeNucleotideIndex;
480 
481 		if (table->double_strand) {
482 			if (compareKmers(&word, &antiWord) <= 0) {
483 				found =
484 				    findOrInsertOccurenceInSplayTable(&word,
485 								      &sequenceID,
486 								      &coord,
487 								      table,
488 								      sequenceIDs,
489 								      coords,
490 								      readNucleotideIndex,
491 								      getLength(tString),
492 								      true);
493 			} else {
494 				sequenceID = -sequenceID;
495 				found =
496 				    findOrInsertOccurenceInSplayTable(&antiWord,
497 								      &sequenceID,
498 								      &coord,
499 								      table,
500 								      sequenceIDs,
501 								      coords,
502 								      readNucleotideIndex,
503 								      getLength(tString),
504 								      false);
505 				sequenceID = -sequenceID;
506 				if( sequenceID!=seqID && sequenceID<0 ){
507 				  printf("*** sequenceID = %d\n", sequenceID);
508 				  exit(-1);
509 				}
510 			}
511 		} else {
512 			if (!second_in_pair) {
513 				found =
514 				    findOrInsertOccurenceInSplayTable(&word,
515 								      &sequenceID,
516 								      &coord,
517 								      table,
518 								      sequenceIDs,
519 								      coords,
520 								      readNucleotideIndex,
521 								      getLength(tString),
522 								      true);
523 			} else {
524 				sequenceID = -sequenceID;
525 				found =
526 				    findOrInsertOccurenceInSplayTable(&antiWord,
527 								      &sequenceID,
528 								      &coord,
529 								      table,
530 								      sequenceIDs,
531 								      coords,
532 								      readNucleotideIndex,
533 								      getLength(tString),
534 								      false);
535 				sequenceID = -sequenceID;
536 			}
537 		}
538 
539 		if (!found) {
540 			writeNucleotideIndex++;
541 			if (!annotationClosed) {
542 				sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
543 					(long) referenceSequenceID, (long long) position,
544 					(long long) start, (long long) finish);
545 				appendLine(lineBuffer, thread);
546 			}
547 			annotationClosed = true;
548 		}
549 		// Otherwise create/complete annotation:
550 		else {
551 			// Forbidden k-mer
552 			if (sequenceID == 0) {
553 				break;
554 			}
555 			// Closed/inexistant annotation
556 			else if (annotationClosed) {
557 				referenceSequenceID = sequenceID;
558 				position = writeNucleotideIndex;
559 				start = finish = coord;
560 
561 				if (referenceSequenceID > 0)
562 					finish++;
563 				else
564 					finish--;
565 
566 				annotationClosed = false;
567 			}
568 			// Open annotation
569 			else if (sequenceID == referenceSequenceID
570 				 && coord == finish) {
571 				if (referenceSequenceID > 0)
572 					finish++;
573 				else
574 					finish--;
575 			}
576 			// Previous non corresponding annotation
577 			else {
578 				sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
579 					(long) referenceSequenceID, (long long) position,
580 					(long long) start, (long long) finish);
581 				appendLine(lineBuffer, thread);
582 
583 				referenceSequenceID = sequenceID;
584 				position = writeNucleotideIndex;
585 				start = finish = coord;
586 
587 				if (referenceSequenceID > 0)
588 					finish++;
589 				else
590 					finish--;
591 			}
592 		}
593 
594 		readNucleotideIndex++;
595 	}
596 
597 	if (!annotationClosed) {
598 		sprintf(lineBuffer, "%ld\t%lld\t%lld\t%lld\n",
599 			(long) referenceSequenceID, (long long) position,
600 			(long long) start, (long long) finish);
601 		appendLine(lineBuffer, thread);
602 	}
603 #ifdef _OPENMP
604 	pushBuffer(thread);
605 #else
606 	velvetFprintf(file, "%s", annotationBuffer->str);
607 	resetStringBuffer(annotationBuffer);
608 #endif
609 	return;
610 }
611 
computeClearHSPs(TightString * array,FILE * seqFile,boolean second_in_pair,SplayTable * table,IDnum ** sequenceIDs,Coordinate ** coords,IDnum seqID)612 static void computeClearHSPs(TightString * array, FILE * seqFile, boolean second_in_pair, SplayTable * table, IDnum ** sequenceIDs, Coordinate ** coords, IDnum seqID) {
613 	Coordinate readNucleotideIndex = 0;
614 	Kmer word;
615 	Kmer antiWord;
616 	Kmer polyA;
617 	Nucleotide nucleotide;
618 	KmerOccurence * hit;
619 	char line[MAXLINE];
620         char* start;
621 	char c;
622 
623 	Coordinate mapCount = 0;
624 	Coordinate maxCount = 10;
625 	IDnum * mapReferenceIDs = callocOrExit(maxCount, IDnum);
626 	Coordinate * mapCoords = callocOrExit(maxCount, Coordinate);
627 	long long_var;
628 	long long longlong_var;
629 	int penalty;
630 	TightString * tString;
631 	Coordinate length;
632 
633 	clearKmer(&polyA);
634 
635 #ifdef _OPENMP
636 	#pragma omp critical
637 	{
638 #endif
639 		// Get read ID:
640 		if (!fgets(line, MAXLINE, seqFile)) {
641 			puts("Incomplete Sequences file (computeHSPScores)");
642 #ifdef DEBUG
643 			abort();
644 #endif
645 			exit(1);
646 		}
647                 start = strchr(line, '\t');
648 		tString = getTightStringInArray(array, seqID - 1);
649 		length = getLength(tString);
650 		*sequenceIDs = callocOrExit(length, IDnum);
651 		*coords = callocOrExit(length, Coordinate);
652 
653 		// Parse file for mapping info
654 		while (seqFile && (c = getc(seqFile)) != EOF) {
655 			if (c == '>')
656 				break;
657 
658 			fgets(line, MAXLINE, seqFile);
659 
660 			if (c == 'M') {
661 				sscanf(line,"\t%li\t%lli\n", &long_var, &longlong_var);
662 				mapReferenceIDs[mapCount] = (IDnum) long_var;
663 				mapCoords[mapCount] = (Coordinate) longlong_var;
664 
665 				if (++mapCount == maxCount) {
666 					maxCount *= 2;
667 					mapReferenceIDs = reallocOrExit(mapReferenceIDs, maxCount, IDnum);
668 					mapCoords = reallocOrExit(mapCoords, maxCount, Coordinate);
669 				}
670 			}
671 		}
672 #ifdef _OPENMP
673 	}
674 #endif
675 
676 	// First pass for unambiguous hits
677 	// Fill in the initial word :
678 	clearKmer(&word);
679 	clearKmer(&antiWord);
680 	for (readNucleotideIndex = 0;
681 	     readNucleotideIndex < table->WORDLENGTH - 1;
682 	     readNucleotideIndex++) {
683 		nucleotide = getNucleotide(readNucleotideIndex, tString);
684 		pushNucleotide(&word, nucleotide);
685 #ifdef COLOR
686 		reversePushNucleotide(&antiWord, nucleotide);
687 #else
688 		reversePushNucleotide(&antiWord, 3 - nucleotide);
689 #endif
690 	}
691 
692 	// Kill silly poly-T beginnings
693 	while (readNucleotideIndex < getLength(tString) && (compareKmers(&antiWord, &polyA) == 0 || compareKmers(&word, &polyA) == 0)) {
694 		nucleotide = getNucleotide(readNucleotideIndex++, tString);
695 		pushNucleotide(&word, nucleotide);
696 #ifdef COLOR
697 		reversePushNucleotide(&antiWord, nucleotide);
698 #else
699 		reversePushNucleotide(&antiWord, 3 - nucleotide);
700 #endif
701 	}
702 
703 	while (readNucleotideIndex < getLength(tString)) {
704 		// Shift word:
705 		nucleotide = getNucleotide(readNucleotideIndex, tString);
706 		pushNucleotide(&word, nucleotide);
707 
708 #ifdef COLOR
709 		reversePushNucleotide(&antiWord, nucleotide);
710 #else
711 		reversePushNucleotide(&antiWord, 3 - nucleotide);
712 #endif
713 
714 		if (table->double_strand) {
715 			if (compareKmers(&word, &antiWord) <= 0) {
716 				hit = findKmerInKmerOccurenceTable(&word, table->kmerOccurenceTable);
717 
718 				if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), true, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
719 					(*sequenceIDs)[readNucleotideIndex] = getKmerOccurenceNodeID(hit);
720 			} else {
721 				hit = findKmerInKmerOccurenceTable(&antiWord, table->kmerOccurenceTable);
722 
723 				if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), false, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
724 					(*sequenceIDs)[readNucleotideIndex] = -getKmerOccurenceNodeID(hit);
725 			}
726 		} else {
727 			if (!second_in_pair) {
728 				hit = findKmerInKmerOccurenceTable(&word, table->kmerOccurenceTable);
729 
730 				if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), true, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
731 					(*sequenceIDs)[readNucleotideIndex] = getKmerOccurenceNodeID(hit);
732 			} else {
733 				hit = findKmerInKmerOccurenceTable(&antiWord, table->kmerOccurenceTable);
734 
735 				if (hit && (hit = getMostAppropriateHit(readNucleotideIndex, getLength(tString), false, hit, mapCount, mapReferenceIDs, mapCoords, table->WORDLENGTH)))
736 					(*sequenceIDs)[readNucleotideIndex] = -getKmerOccurenceNodeID(hit);
737 			}
738 		}
739 
740 		if ((*sequenceIDs)[readNucleotideIndex]) {
741 			if ((*sequenceIDs)[readNucleotideIndex] > 0)
742 				(*coords)[readNucleotideIndex] = getKmerOccurencePosition(hit) - readNucleotideIndex;
743 			else
744 				(*coords)[readNucleotideIndex] = getKmerOccurencePosition(hit) + readNucleotideIndex - getLength(tString) + 1;
745 		}
746 
747 		// Barrier to flip-flopping
748 		if ((*sequenceIDs)[readNucleotideIndex - 1] != 0
749 		    && ((*sequenceIDs)[readNucleotideIndex] != (*sequenceIDs)[readNucleotideIndex - 1]
750 			|| (*coords)[readNucleotideIndex] != (*coords)[readNucleotideIndex - 1])) {
751 			// Break in continuity... skip k positions
752 			(*sequenceIDs)[readNucleotideIndex] = 0;
753 			(*coords)[readNucleotideIndex] = -1;
754 			readNucleotideIndex++;
755 
756 			for (penalty = 0; penalty < table->WORDLENGTH  - 1 && readNucleotideIndex < getLength(tString); penalty++) {
757 				nucleotide = getNucleotide(readNucleotideIndex, tString);
758 				pushNucleotide(&word, nucleotide);
759 
760 #ifdef COLOR
761 				reversePushNucleotide(&antiWord, nucleotide);
762 #else
763 				reversePushNucleotide(&antiWord, 3 - nucleotide);
764 #endif
765 				(*sequenceIDs)[readNucleotideIndex] = 0;
766 				(*coords)[readNucleotideIndex] = -1;
767 				readNucleotideIndex++;
768 			}
769 		} else
770 			readNucleotideIndex++;
771 
772 	}
773 
774 	free(mapReferenceIDs);
775 	free(mapCoords);
776 }
777 
inputSequenceIntoSplayTable(TightString * array,SplayTable * table,FILE * file,FILE * seqFile,boolean second_in_pair,IDnum seqID)778 void inputSequenceIntoSplayTable(TightString * array,
779 				 SplayTable * table,
780 				 FILE * file, FILE * seqFile,
781 				 boolean second_in_pair,
782 				 IDnum seqID)
783 {
784 	IDnum * sequenceIDs = NULL;
785 	Coordinate * coords = NULL;
786 
787 	// If appropriate, get the HSPs on reference sequences
788 	if (table->kmerOccurenceTable)
789 		computeClearHSPs(array, seqFile, second_in_pair, table, &sequenceIDs, &coords, seqID);
790 
791 	// Go through read, eventually with annotations
792 	printAnnotations(sequenceIDs, coords, array, table, file, second_in_pair, seqID);
793 
794 	// Clean up
795 	if (sequenceIDs) {
796 		free(sequenceIDs);
797 		free(coords);
798 	}
799 }
800 
inputReferenceIntoSplayTable(TightString * tString,SplayTable * table,FILE * file,IDnum seqID,Mask * mask)801 void inputReferenceIntoSplayTable(TightString * tString,
802 				 SplayTable * table, FILE * file, IDnum seqID, Mask * mask)
803 {
804 	IDnum currentIndex;
805 	Coordinate readNucleotideIndex = 0;
806 	Coordinate kmerIndex = 0;
807 	Kmer word;
808 	Kmer antiWord;
809 	Nucleotide nucleotide;
810 	Mask * currentMask = mask;
811 #ifdef _OPENMP
812 	char lineBuffer[MAXLINE];
813 #endif
814 
815 	clearKmer(&word);
816 	clearKmer(&antiWord);
817 
818 	currentIndex = seqID;
819 #ifdef _OPENMP
820 	sprintf(lineBuffer, "ROADMAP %li\n", (long)currentIndex);
821 	appendLine(lineBuffer, omp_get_thread_num());
822 #else
823 	velvetFprintf(file, "ROADMAP %li\n", (long)currentIndex);
824 #endif
825 
826 	// Neglect any string shorter than WORDLENGTH :
827 	if (getLength(tString) < table->WORDLENGTH) {
828 		return;
829 	}
830 
831 	// Fill in the initial word :
832 	for (readNucleotideIndex = 0;
833 	     readNucleotideIndex < table->WORDLENGTH - 1;
834 	     readNucleotideIndex++) {
835 		nucleotide = getNucleotide(readNucleotideIndex, tString);
836 		pushNucleotide(&word, nucleotide);
837 		if (table->double_strand) {
838 #ifdef COLOR
839 			reversePushNucleotide(&antiWord, nucleotide);
840 #else
841 			reversePushNucleotide(&antiWord, 3 - nucleotide);
842 #endif
843 		}
844 	}
845 
846 	while (readNucleotideIndex < getLength(tString)) {
847 		// Shift word:
848 		nucleotide = getNucleotide(readNucleotideIndex, tString);
849 		pushNucleotide(&word, nucleotide);
850 
851 		if (table->double_strand) {
852 #ifdef COLOR
853 			reversePushNucleotide(&antiWord, nucleotide);
854 #else
855 			reversePushNucleotide(&antiWord, 3 - nucleotide);
856 #endif
857 		}
858 
859 		// Check for gap masks:
860 		if (currentMask && currentMask->start - table->WORDLENGTH + 1 <= readNucleotideIndex) {
861 			while(currentMask && currentMask->finish + table->WORDLENGTH - 1 < readNucleotideIndex)
862 				currentMask = currentMask->next;
863 
864 			if (currentMask && currentMask->finish + table->WORDLENGTH - 1 >= readNucleotideIndex) {
865 				readNucleotideIndex++;
866 				kmerIndex++;
867 				continue;
868 			}
869 		}
870 
871 		// Record k-mer
872 		if (table->double_strand) {
873 			if (compareKmers(&word, &antiWord) <= 0)
874 				recordKmerOccurence(&word, currentIndex,
875 						    kmerIndex,
876 						    table->kmerOccurenceTable);
877 			else
878 				recordKmerOccurence(&antiWord, -currentIndex,
879 							kmerIndex,
880 							table->kmerOccurenceTable);
881 		} else {
882 			recordKmerOccurence(&word, currentIndex,
883 					    kmerIndex,
884 					    table->kmerOccurenceTable);
885 		}
886 		readNucleotideIndex++;
887 		kmerIndex++;
888 	}
889 
890 	return;
891 }
892 
countReferenceKmers(ReadSet * reads,int wordLength)893 static Coordinate countReferenceKmers(ReadSet * reads, int wordLength) {
894 	IDnum readIndex;
895 	Coordinate length = 0;
896 
897 
898 	for (readIndex = 0; readIndex < reads->readCount && reads->categories[readIndex] == REFERENCE; readIndex++)
899 	{
900 		Coordinate tmpLength = getLength(getTightStringInArray(reads->tSequences, readIndex));
901 		if (tmpLength >= wordLength)
902 			length += tmpLength - wordLength + 1;
903 	}
904 
905 	return length;
906 }
907 
scanReferenceSequences(FILE * file,IDnum referenceSequenceCount)908 Mask ** scanReferenceSequences(FILE * file, IDnum referenceSequenceCount) {
909 	Mask ** referenceMasks = callocOrExit(referenceSequenceCount, Mask*);
910 	IDnum index;
911 	char line[MAXLINE];
912 	char c;
913 
914 	// Search sequences for masks
915 	for (index = 0; index < referenceSequenceCount; index++) {
916 		Mask * current = NULL;
917 		Coordinate position = 0;
918 		boolean openMask = false;
919 
920 		// Read through header
921 		fgets(line, MAXLINE, file);
922 
923 		// Read through sequence
924 		while ((c = getc(file))) {
925 			if (c == EOF || c == '>')
926 				break;
927 			else if (c == '\r' || c == '\n')
928 				continue;
929 			else if (c == 'n' || c == 'N') {
930 				if (openMask)
931 					current->finish++;
932 				else if (referenceMasks[index] == NULL) {
933 					referenceMasks[index] = newMask(position);
934 					current = referenceMasks[index];
935 				} else {
936 					current->next = newMask(position);
937 					current = current->next;
938 				}
939 				openMask = true;
940 				position++;
941 			} else {
942 				openMask = false;
943 				position++;
944 			}
945 		}
946 	}
947 
948 	return referenceMasks;
949 }
950 
inputSequenceArrayIntoSplayTableAndArchive(ReadSet * reads,SplayTable * table,char * filename,char * seqFilename)951 void inputSequenceArrayIntoSplayTableAndArchive(ReadSet * reads,
952 						SplayTable * table,
953 						char *filename, char* seqFilename)
954 {
955 	IDnum index;
956 	IDnum sequenceCount = reads->readCount;
957 	TightString *array;
958 	FILE *outfile = fopen(filename, "w");
959 	FILE *seqFile = NULL;
960 	IDnum kmerCount;
961 	IDnum referenceSequenceCount = 0;
962 	struct timeval start, end, diff;
963 
964 	// DEBUG
965 	Mask ** referenceMasks;
966 
967 	if (outfile == NULL)
968 		exitErrorf(EXIT_FAILURE, true, "Couldn't write to file %s", filename);
969 	else
970 		velvetLog("Writing into roadmap file %s...\n", filename);
971 
972 	// Count reference sequences
973 	for (index = 0; index < reads->readCount && reads->categories[index] == REFERENCE; index++)
974 		referenceSequenceCount++;
975 
976 	velvetFprintf(outfile, "%ld\t%ld\t%i\t%hi\n", (long) sequenceCount, (long) referenceSequenceCount, table->WORDLENGTH, (short) table->double_strand);
977 
978 	if (reads->tSequences == NULL)
979 		convertSequences(reads);
980 
981 	gettimeofday(&start, NULL);
982 	array = reads->tSequences;
983 
984 #ifdef _OPENMP
985 	if (omp_get_max_threads() == 1)
986 	{
987 		omp_set_num_threads(2);
988 		omp_set_nested(0);
989 	}
990 	else
991 		omp_set_nested(1);
992 	initAnnotationBuffers();
993 #else
994 	annotationBuffer = newStringBuffer(BUFFER_SIZE);
995 #endif
996 
997 	if (referenceSequenceCount && (kmerCount = countReferenceKmers(reads, table->WORDLENGTH)) > 0) {
998 		table->kmerOccurenceTable = newKmerOccurenceTable(24 , table->WORDLENGTH);
999 		allocateKmerOccurences(kmerCount, table->kmerOccurenceTable);
1000 		seqFile = fopen(seqFilename, "r");
1001 
1002 		if (seqFile == NULL)
1003 			exitErrorf(EXIT_FAILURE, true, "Couldn't write to file %s", seqFilename);
1004 		else
1005 			velvetLog("Reading mapping info from file %s\n", seqFilename);
1006 
1007 		// Skip through reference headers quickly
1008 		referenceMasks = scanReferenceSequences(seqFile, referenceSequenceCount);
1009 
1010 #ifdef _OPENMP
1011 		producing = 1;
1012 		#pragma omp parallel sections
1013 		{
1014 			#pragma omp section
1015 			{
1016 				bufferWritter(outfile);
1017 			}
1018 			#pragma omp section
1019 			{
1020 				#pragma omp parallel for
1021 #endif
1022 				for (index = 0; index < referenceSequenceCount; index++)
1023 					inputReferenceIntoSplayTable(getTightStringInArray(array, index),
1024 								     table, outfile, index + 1, referenceMasks[index]);
1025 
1026 #ifdef _OPENMP
1027 				for (index = omp_get_max_threads() - 1; index >= 0; index--)
1028 					pushBufferCommit(index);
1029 				producing = 0;
1030 				#pragma omp flush(producing)
1031 			}
1032 		}
1033 #endif
1034 
1035 		if (maskMemory)
1036 			destroyRecycleBin(maskMemory);
1037 		sortKmerOccurenceTable(table->kmerOccurenceTable);
1038 	}
1039 
1040 	velvetLog("Inputting sequences...\n");
1041 
1042 #ifdef _OPENMP
1043 	producing = 1;
1044 	#pragma omp parallel sections
1045 	{
1046 		#pragma omp section
1047 		{
1048 			bufferWritter(outfile);
1049 		}
1050 		#pragma omp section
1051 		{
1052 			#pragma omp parallel for
1053 #endif
1054 			for (index = referenceSequenceCount; index < sequenceCount; index++)
1055 			{
1056 				boolean second_in_pair;
1057 
1058 				// Progress report on screen
1059 				if (index % 1000000 == 0) {
1060 					velvetLog("Inputting sequence %li / %li\n",
1061 						  (long)index, (long)sequenceCount);
1062 					fflush(stdout);
1063 				}
1064 
1065 				// Test to make sure that all the reference reads are before all the other reads
1066 				if (reads->categories[index] == REFERENCE) {
1067 					velvetLog("Reference sequence placed after a non-reference read!\n");
1068 					velvetLog(">> Please re-order the filenames in your command line so as "
1069 						  "to have the reference sequence files before all the others\n");
1070 #ifdef DEBUG
1071 					abort();
1072 #endif
1073 					exit(0);
1074 				}
1075 				second_in_pair = reads->categories[index] % 2 && isSecondInPair(reads, index);
1076 
1077 				// Hashing the reads
1078 				inputSequenceIntoSplayTable(array, table,
1079 							    outfile, seqFile,
1080 							    second_in_pair, index + 1);
1081 			}
1082 #ifdef _OPENMP
1083 			for (index = omp_get_max_threads() - 1; index >= 0; index--)
1084 				pushBufferCommit(index);
1085 			producing = 0;
1086 			#pragma omp flush(producing)
1087 		}
1088 	}
1089 	destroyAnnotationBuffers();
1090 #else
1091 	destroyStringBuffer(annotationBuffer, 1);
1092 #endif
1093 
1094 	gettimeofday(&end, NULL);
1095 	timersub(&end, &start, &diff);
1096 	velvetLog(" === Sequences loaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec);
1097 
1098 	fclose(outfile);
1099 	if (seqFile)
1100 		fclose(seqFile);
1101 
1102 	//free(reads->tSequences);
1103 	//reads->tSequences = NULL;
1104 	//destroyReadSet(reads);
1105 	velvetLog("Done inputting sequences\n");
1106 }
1107