1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3 
4     This file is part of Velvet.
5 
6     Velvet is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10 
11     Velvet is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with Velvet; if not, write to the Free Software
18     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <string.h>
25 #include <limits.h>
26 #include <sys/time.h>
27 
28 #ifdef _OPENMP
29 #include <omp.h>
30 #endif
31 
32 #include "globals.h"
33 #include "graph.h"
34 #include "passageMarker.h"
35 #include "readSet.h"
36 #include "tightString.h"
37 #include "recycleBin.h"
38 #include "utility.h"
39 #include "kmer.h"
40 #include "kmerOccurenceTable.h"
41 #include "roadMap.h"
42 
43 #define ADENINE 0
44 #define CYTOSINE 1
45 #define GUANINE 2
46 #define THYMINE 3
47 
48 
49 //////////////////////////////////////////////////////////
50 // Node Locking
51 //////////////////////////////////////////////////////////
52 
53 #ifdef _OPENMP
54 
55 /* Array of per-node locks */
56 
57 static omp_lock_t *nodeLocks = NULL;
58 
59 static void
createNodeLocks(Graph * graph)60 createNodeLocks(Graph *graph)
61 {
62 	IDnum nbNodes;
63 	IDnum nodeIndex;
64 
65 	nbNodes = nodeCount(graph) + 1;
66 	if (nodeLocks)
67 		free (nodeLocks);
68 	nodeLocks = mallocOrExit(nbNodes, omp_lock_t);
69 
70 	#pragma omp parallel for
71 	for (nodeIndex = 0; nodeIndex < nbNodes; nodeIndex++)
72 		omp_init_lock(nodeLocks + nodeIndex);
73 }
74 
lockNode(Node * node)75 static inline void lockNode(Node *node)
76 {
77 	IDnum nodeID = getNodeID(node);
78 
79 	if (nodeID < 0)
80 		nodeID = -nodeID;
81 	omp_set_lock (nodeLocks + nodeID);
82 }
83 
84 /* Assumes node is already locked */
lockTwoNodes(Node * node,Node * node2)85 static inline void lockTwoNodes(Node *node, Node *node2)
86 {
87 	IDnum nodeID = getNodeID(node);
88 	IDnum node2ID = getNodeID(node2);
89 
90 	if (nodeID < 0)
91 		nodeID = -nodeID;
92 	if (node2ID < 0)
93 		node2ID = -node2ID;
94 
95 	if (nodeID == node2ID)
96 		return;
97 
98 	/* Lock lowest ID first to avoid deadlocks */
99 	if (nodeID < node2ID)
100 	{
101 		omp_set_lock (nodeLocks + node2ID);
102 	}
103 	else if (!omp_test_lock (nodeLocks + node2ID))
104 	{
105 		omp_unset_lock (nodeLocks + nodeID);
106 		omp_set_lock (nodeLocks + node2ID);
107 		omp_set_lock (nodeLocks + nodeID);
108 	}
109 }
110 
unLockTwoNodes(Node * node,Node * node2)111 static inline void unLockTwoNodes(Node *node, Node *node2)
112 {
113 	IDnum nodeID = getNodeID(node);
114 	IDnum node2ID = getNodeID(node2);
115 
116 	if (nodeID < 0)
117 		nodeID = -nodeID;
118 	if (node2ID < 0)
119 		node2ID = -node2ID;
120 
121 	omp_unset_lock (nodeLocks + nodeID);
122 	if (nodeID != node2ID)
123 		omp_unset_lock (nodeLocks + node2ID);
124 }
125 
unLockNode(Node * node)126 static inline void unLockNode(Node *node)
127 {
128 	IDnum nodeID = getNodeID(node);
129 
130 	if (nodeID < 0)
131 		nodeID = -nodeID;
132 	omp_unset_lock (nodeLocks + nodeID);
133 }
134 
135 #endif
136 
137 //////////////////////////////////////////////////////////
138 // Node Lists
139 //////////////////////////////////////////////////////////
140 typedef struct smallNodeList_st SmallNodeList;
141 
142 struct smallNodeList_st {
143 	Node *node;
144 	SmallNodeList *next;
145 } ATTRIBUTE_PACKED;
146 
147 static RecycleBin *smallNodeListMemory = NULL;
148 
149 #define BLOCKSIZE 1000
150 
151 #ifdef _OPENMP
initSmallNodeListMemory(void)152 static void initSmallNodeListMemory(void)
153 {
154 	int n = omp_get_max_threads();
155 
156 #pragma omp critical
157 	{
158 		if (smallNodeListMemory == NULL)
159 			smallNodeListMemory = newRecycleBinArray(n, sizeof(SmallNodeList), BLOCKSIZE);
160 	}
161 }
162 #endif
163 
allocateSmallNodeList()164 static SmallNodeList *allocateSmallNodeList()
165 {
166 #ifdef _OPENMP
167 #ifdef DEBUG
168 	if (smallNodeListMemory == NULL)
169 	{
170 		velvetLog("The memory for small nodes seems uninitialised, "
171 				"this is probably a bug, aborting.\n");
172 		abort();
173 	}
174 #endif
175 	return allocatePointer(getRecycleBinInArray(smallNodeListMemory,
176 				omp_get_thread_num()));
177 #else
178 	if (smallNodeListMemory == NULL)
179 		smallNodeListMemory = newRecycleBin(sizeof(SmallNodeList), BLOCKSIZE);
180 
181 	return allocatePointer(smallNodeListMemory);
182 #endif
183 }
184 
deallocateSmallNodeList(SmallNodeList * smallNodeList)185 static void deallocateSmallNodeList(SmallNodeList * smallNodeList)
186 {
187 #ifdef _OPENMP
188 	deallocatePointer(getRecycleBinInArray(smallNodeListMemory,
189 					       omp_get_thread_num()),
190 			  smallNodeList);
191 #else
192 	deallocatePointer(smallNodeListMemory, smallNodeList);
193 #endif
194 }
195 
destroySmallNodeListMemmory(void)196 static void destroySmallNodeListMemmory(void)
197 {
198 	if (smallNodeListMemory != NULL)
199 	{
200 #ifdef _OPENMP
201 		destroyRecycleBinArray(smallNodeListMemory);
202 #else
203 		destroyRecycleBin(smallNodeListMemory);
204 #endif
205 		smallNodeListMemory = NULL;
206 	}
207 }
208 
memorizeNode(Node * node,SmallNodeList ** nodePile)209 static inline void memorizeNode(Node * node, SmallNodeList ** nodePile)
210 {
211 	SmallNodeList *list = allocateSmallNodeList();
212 	list->node = node;
213 	list->next = *nodePile;
214 	*nodePile = list;
215 #ifndef _OPENMP
216    	setSingleNodeStatus(node, true);
217 #endif
218 }
219 
isNodeMemorized(Node * node,SmallNodeList * nodePile)220 static inline boolean isNodeMemorized(Node * node, SmallNodeList * nodePile)
221 {
222 #ifdef _OPENMP
223 	/* SF TODO There must be a faster way to do this: bit mask, hash table, tree, ... ? */
224 	SmallNodeList * list;
225 
226 	for (list = nodePile; list; list = list->next)
227 		if (list->node == node)
228 			return true;
229 
230 	return false;
231 #else
232 	return getNodeStatus(node);
233 #endif
234 }
235 
unMemorizeNodes(SmallNodeList ** nodePile)236 static void unMemorizeNodes(SmallNodeList ** nodePile)
237 {
238 	SmallNodeList * list;
239 
240 	while (*nodePile) {
241 		list = *nodePile;
242 		*nodePile = list->next;
243 #ifndef _OPENMP
244    		setSingleNodeStatus(list->node, false);
245 #endif
246 		deallocateSmallNodeList(list);
247 	}
248 }
249 
250 ///////////////////////////////////////////////////////////
251 // Reference Mappings
252 ///////////////////////////////////////////////////////////
253 typedef struct referenceMapping_st ReferenceMapping;
254 
255 struct referenceMapping_st {
256 	IDnum referenceStart;
257 	IDnum nodeStart;
258 	IDnum length;
259 	IDnum referenceID;
260 	IDnum nodeID;
261 } ATTRIBUTE_PACKED;
262 
countMappings(char * preGraphFilename)263 static IDnum countMappings(char * preGraphFilename) {
264 	FILE *file = fopen(preGraphFilename, "r");
265 	const int maxline = MAXLINE;
266 	char line[MAXLINE];
267 	IDnum count = 0;
268 
269 	// Go past NODE blocks
270 	while(fgets(line, maxline, file))
271 		if (line[0] == 'S')
272 			break;
273 
274 	// Count relevant lines
275 	while(fgets(line, maxline, file))
276 		if (line[0] != 'S')
277 			count++;
278 
279 	fclose(file);
280 	return count;
281 }
282 
recordReferenceMappings(char * preGraphFilename,IDnum arrayLength)283 static ReferenceMapping * recordReferenceMappings(char * preGraphFilename, IDnum arrayLength) {
284 	ReferenceMapping * mappings = callocOrExit(arrayLength, ReferenceMapping);
285 	FILE *file = fopen(preGraphFilename, "r");
286 	const int maxline = MAXLINE;
287 	char line[MAXLINE];
288 	ReferenceMapping * current = mappings;
289 	IDnum referenceID;
290 	long long_var;
291 	long long coord1, coord2, coord3;
292 
293 	// Go past NODE blocks
294 	while(fgets(line, maxline, file))
295 		if (line[0] == 'S')
296 			break;
297 
298 	sscanf(line, "SEQ\t%li\n", &long_var);
299 	referenceID = long_var;
300 
301 	// Go relevant lines
302 	while(fgets(line, maxline, file)) {
303 		if (line[0] != 'S') {
304 			sscanf(line, "%li\t%lli\t%lli\t%lli\n", &long_var, &coord1, &coord2, &coord3);
305 			current->referenceID = referenceID;
306 			current->nodeID = long_var;
307 			current->nodeStart = coord1;
308 			current->referenceStart = coord2;
309 			current->length = coord3;
310 			current++;
311 		} else {
312 			sscanf(line, "SEQ\t%li\n", &long_var);
313 			referenceID = long_var;
314 		}
315 	}
316 
317 	fclose(file);
318 	return mappings;
319 }
320 
compareRefMaps(const void * ptrA,const void * ptrB)321 static int compareRefMaps(const void * ptrA, const void * ptrB) {
322 	ReferenceMapping * A = (ReferenceMapping *) ptrA;
323 	ReferenceMapping * B = (ReferenceMapping *) ptrB;
324 
325 	if (A->referenceID > B->referenceID)
326 		return 1;
327 	else if (A->referenceID < B->referenceID)
328 		return -1;
329 	else {
330 		if (A->referenceStart >= B->referenceStart + B->length)
331 			return 1;
332 		else if (A->referenceStart + A->length <= B->referenceStart)
333 			return -1;
334 		else
335 			return 0;
336 	}
337 }
338 
computeReferenceMappings(char * preGraphFilename,ReadSet * reads,Coordinate * referenceMappingLength,IDnum * referenceCount)339 static ReferenceMapping * computeReferenceMappings(char * preGraphFilename, ReadSet * reads, Coordinate * referenceMappingLength, IDnum * referenceCount) {
340 	IDnum index;
341 	ReferenceMapping * referenceMappings;
342 
343 	for(index = 0; index < reads->readCount && reads->categories[index] == 2 * CATEGORIES + 2; index++)
344 		(*referenceCount)++;
345 
346 	if (*referenceCount == 0) {
347 		*referenceMappingLength = 0;
348 		return NULL;
349 	}
350 
351 	*referenceMappingLength = countMappings(preGraphFilename);
352 
353 	if (*referenceMappingLength == 0)
354 		return NULL;
355 
356 	referenceMappings = recordReferenceMappings(preGraphFilename, *referenceMappingLength);
357 	qsort(referenceMappings, *referenceMappingLength, sizeof(ReferenceMapping), compareRefMaps);
358 
359 	return referenceMappings;
360 }
361 
findReferenceMapping(IDnum seqID,Coordinate refCoord,ReferenceMapping * referenceMappings,Coordinate referenceMappingCount)362 static ReferenceMapping * findReferenceMapping(IDnum seqID, Coordinate refCoord, ReferenceMapping * referenceMappings, Coordinate referenceMappingCount) {
363 	IDnum positive_seqID;
364 	Coordinate leftIndex = 0;
365 	Coordinate rightIndex = referenceMappingCount - 1;
366 	Coordinate middleIndex;
367 	ReferenceMapping refMap;
368 	int comparison;
369 
370 	if (seqID > 0)
371 		positive_seqID = seqID;
372 	else
373 		positive_seqID = -seqID;
374 
375 	refMap.referenceID = positive_seqID;
376 	refMap.referenceStart = refCoord;
377 	refMap.length = 1;
378 	refMap.nodeStart = 0;
379 	refMap.nodeID = 0;
380 
381 	if (compareRefMaps(&(referenceMappings[leftIndex]), &refMap) == 0)
382 		return &(referenceMappings[leftIndex]);
383 	if (compareRefMaps(&(referenceMappings[rightIndex]), &refMap) == 0)
384 		return &(referenceMappings[rightIndex]);
385 
386 	while (true) {
387 		middleIndex = (rightIndex + leftIndex) / 2;
388 		comparison = compareRefMaps(&(referenceMappings[middleIndex]), &refMap);
389 
390 		if (leftIndex >= rightIndex)
391 			return NULL;
392 		else if (comparison == 0)
393 			return &(referenceMappings[middleIndex]);
394 		else if (leftIndex == middleIndex)
395 			return NULL;
396 		else if (comparison > 0)
397 			rightIndex = middleIndex;
398 		else
399 			leftIndex = middleIndex;
400 	}
401 }
402 
403 ///////////////////////////////////////////////////////////
404 // Node Mask
405 ///////////////////////////////////////////////////////////
406 
407 typedef struct nodeMask_st NodeMask;
408 
409 struct nodeMask_st {
410 	IDnum nodeID;
411 	IDnum start;
412 	IDnum finish;
413 } ATTRIBUTE_PACKED;
414 
compareNodeMasks(const void * ptrA,const void * ptrB)415 static int compareNodeMasks(const void * ptrA, const void * ptrB) {
416 	NodeMask * A = (NodeMask *) ptrA;
417 	NodeMask * B = (NodeMask *) ptrB;
418 
419 	if (A->nodeID < B->nodeID)
420 		return -1;
421 	else if (A->nodeID > B->nodeID)
422 		return 1;
423 	else {
424 		if (A->start < B->start)
425 			return -1;
426 		else if (A->start > B->start)
427 			return 1;
428 		else
429 			return 0;
430 	}
431 }
432 
computeNodeMasks(ReferenceMapping * referenceMappings,Coordinate arrayLength,Graph * graph)433 static NodeMask * computeNodeMasks(ReferenceMapping * referenceMappings, Coordinate arrayLength, Graph * graph) {
434 	NodeMask * nodeMasks;
435 	NodeMask * currentMask;
436 	ReferenceMapping * currentMapping = referenceMappings;
437 	Coordinate index;
438 
439 	if (referenceMappings == NULL)
440 		return NULL;
441 
442 	nodeMasks = callocOrExit(arrayLength, NodeMask);
443 	currentMask = nodeMasks;
444 
445 	for (index = 0; index < arrayLength; index++) {
446 		if (currentMapping->nodeID > 0) {
447 			currentMask->nodeID = currentMapping->nodeID;
448 		} else {
449 			currentMask->nodeID = -currentMapping->nodeID;
450 		}
451 		currentMask->start = currentMapping->nodeStart;
452 		currentMask->finish = currentMapping->nodeStart + currentMapping->length;
453 		currentMask++;
454 		currentMapping++;
455 	}
456 
457 	qsort(nodeMasks, arrayLength, sizeof(NodeMask), compareNodeMasks);
458 
459 	return nodeMasks;
460 }
461 
462 ///////////////////////////////////////////////////////////
463 // Process
464 ///////////////////////////////////////////////////////////
465 
referenceGraphKmers(char * preGraphFilename,short int accelerationBits,Graph * graph,boolean double_strand,NodeMask * nodeMasks,Coordinate nodeMaskCount)466 static KmerOccurenceTable *referenceGraphKmers(char *preGraphFilename,
467 					       short int accelerationBits, Graph * graph, boolean double_strand, NodeMask * nodeMasks, Coordinate nodeMaskCount)
468 {
469 	FILE *file = fopen(preGraphFilename, "r");
470 	const int maxline = MAXLINE;
471 	char line[MAXLINE];
472 	char c;
473 	int wordLength;
474 	Coordinate lineLength, kmerCount;
475 	Kmer word;
476 	Kmer antiWord;
477 	KmerOccurenceTable *kmerTable;
478 	IDnum index;
479 	IDnum nodeID = 0;
480 	Nucleotide nucleotide;
481 	NodeMask * nodeMask = nodeMasks;
482 	Coordinate nodeMaskIndex = 0;
483 
484 	if (file == NULL)
485 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);
486 
487 	// Count kmers
488 	velvetLog("Scanning pre-graph file %s for k-mers\n",
489 		  preGraphFilename);
490 
491 	// First  line
492 	if (!fgets(line, maxline, file))
493 		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
494 	sscanf(line, "%*i\t%*i\t%i\n", &wordLength);
495 
496 	kmerTable = newKmerOccurenceTable(accelerationBits, wordLength);
497 
498 	// Read nodes
499 	if (!fgets(line, maxline, file))
500 		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
501 	kmerCount = 0;
502 	while (line[0] == 'N') {
503 		lineLength = 0;
504 		while ((c = getc(file)) != EOF && c != '\n')
505 			lineLength++;
506 		kmerCount += lineLength - wordLength + 1;
507 		if (fgets(line, maxline, file) == NULL)
508 			break;
509 	}
510 
511 	velvetLog("%li kmers found\n", (long) kmerCount);
512 
513 	for(nodeMaskIndex = 0; nodeMaskIndex < nodeMaskCount; nodeMaskIndex++) {
514 		kmerCount -= nodeMasks[nodeMaskIndex].finish -
515 nodeMasks[nodeMaskIndex].start;
516 	}
517 
518 	nodeMaskIndex = 0;
519 
520 	fclose(file);
521 
522 	// Create table
523 	allocateKmerOccurences(kmerCount, kmerTable);
524 
525 	// Fill table
526 	file = fopen(preGraphFilename, "r");
527 	if (file == NULL)
528 		exitErrorf(EXIT_FAILURE, true, "Could not open %s", preGraphFilename);
529 
530 	if (!fgets(line, maxline, file))
531 		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
532 
533 	// Read nodes
534 	if (!fgets(line, maxline, file))
535 		exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
536 	while (line[0] == 'N') {
537 		nodeID++;
538 
539 		// Fill in the initial word :
540 		clearKmer(&word);
541 		clearKmer(&antiWord);
542 
543 		for (index = 0; index < wordLength - 1; index++) {
544 			c = getc(file);
545 			if (c == 'A')
546 				nucleotide = ADENINE;
547 			else if (c == 'C')
548 				nucleotide = CYTOSINE;
549 			else if (c == 'G')
550 				nucleotide = GUANINE;
551 			else if (c == 'T')
552 				nucleotide = THYMINE;
553 			else if (c == '\n')
554 				exitErrorf(EXIT_FAILURE, true, "PreGraph file incomplete");
555 			else
556 				nucleotide = ADENINE;
557 
558 
559 			pushNucleotide(&word, nucleotide);
560 			if (double_strand) {
561 #ifdef COLOR
562 				reversePushNucleotide(&antiWord, nucleotide);
563 #else
564 				reversePushNucleotide(&antiWord, 3 - nucleotide);
565 #endif
566 			}
567 		}
568 
569 		// Scan through node
570 		index = 0;
571 		while((c = getc(file)) != '\n' && c != EOF) {
572 			if (c == 'A')
573 				nucleotide = ADENINE;
574 			else if (c == 'C')
575 				nucleotide = CYTOSINE;
576 			else if (c == 'G')
577 				nucleotide = GUANINE;
578 			else if (c == 'T')
579 				nucleotide = THYMINE;
580 			else
581 				nucleotide = ADENINE;
582 
583 			pushNucleotide(&word, nucleotide);
584 			if (double_strand) {
585 #ifdef COLOR
586 				reversePushNucleotide(&antiWord, nucleotide);
587 #else
588 				reversePushNucleotide(&antiWord, 3 - nucleotide);
589 #endif
590 			}
591 
592 			// Update mask if necessary
593 			if (nodeMask) {
594 				if (nodeMask->nodeID < nodeID || (nodeMask->nodeID == nodeID && index >= nodeMask->finish)) {
595 					if (++nodeMaskIndex == nodeMaskCount)
596 						nodeMask = NULL;
597 					else
598 						nodeMask++;
599 				}
600 			}
601 
602 			// Check if not masked!
603 			if (nodeMask) {
604 				if (nodeMask->nodeID == nodeID && index >= nodeMask->start && index < nodeMask->finish) {
605 					index++;
606 					continue;
607 				}
608 			}
609 
610 			if (!double_strand || compareKmers(&word, &antiWord) <= 0)
611 				recordKmerOccurence(&word, nodeID, index, kmerTable);
612 			else
613 				recordKmerOccurence(&antiWord, -nodeID, getNodeLength(getNodeInGraph(graph, nodeID)) - 1 - index, kmerTable);
614 
615 			index++;
616 		}
617 
618 		if (fgets(line, maxline, file) == NULL)
619 			break;
620 	}
621 
622 	fclose(file);
623 
624 	// Sort table
625 	sortKmerOccurenceTable(kmerTable);
626 
627 	return kmerTable;
628 }
629 
ghostThreadSequenceThroughGraph(TightString * tString,KmerOccurenceTable * kmerTable,Graph * graph,IDnum seqID,Category category,boolean readTracking,boolean double_strand,ReferenceMapping * referenceMappings,Coordinate referenceMappingCount,IDnum refCount,Annotation * annotations,IDnum annotationCount,boolean second_in_pair)630 static void ghostThreadSequenceThroughGraph(TightString * tString,
631 					    KmerOccurenceTable *
632 					    kmerTable, Graph * graph,
633 					    IDnum seqID, Category category,
634 					    boolean readTracking,
635 					    boolean double_strand,
636 					    ReferenceMapping * referenceMappings,
637 					    Coordinate referenceMappingCount,
638 					    IDnum refCount,
639 					    Annotation * annotations,
640 					    IDnum annotationCount,
641 					    boolean second_in_pair)
642 {
643 	Kmer word;
644 	Kmer antiWord;
645 	Coordinate readNucleotideIndex;
646 	KmerOccurence *kmerOccurence;
647 	int wordLength = getWordLength(graph);
648 	Nucleotide nucleotide;
649 	IDnum refID;
650 	Coordinate refCoord;
651 	ReferenceMapping * refMap = NULL;
652 	Coordinate uniqueIndex = 0;
653 	Coordinate annotIndex = 0;
654 	IDnum annotCount = 0;
655 	boolean reversed;
656 	SmallNodeList * nodePile = NULL;
657 	Annotation * annotation = annotations;
658 
659 	Node *node = NULL;
660 	Node *previousNode = NULL;
661 
662 	// Neglect any read which will not be short paired
663 	if ((!readTracking && category % 2 == 0)
664 	    || category / 2 >= CATEGORIES)
665 		return;
666 
667 	// Neglect any string shorter than WORDLENGTH :
668 	if (getLength(tString) < wordLength)
669 		return;
670 
671 	// Verify that all short reads are reasonnably short
672 	if (getLength(tString) > USHRT_MAX) {
673 		velvetLog("Short read of length %lli, longer than limit %i\n",
674 			  (long long) getLength(tString), SHRT_MAX);
675 		velvetLog("You should better declare this sequence as long, because it genuinely is!\n");
676 		exit(1);
677 	}
678 
679 	clearKmer(&word);
680 	clearKmer(&antiWord);
681 
682 	// Fill in the initial word :
683 	for (readNucleotideIndex = 0;
684 	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
685 		nucleotide = getNucleotide(readNucleotideIndex, tString);
686 		pushNucleotide(&word, nucleotide);
687 		if (double_strand || second_in_pair) {
688 #ifdef COLOR
689 			reversePushNucleotide(&antiWord, nucleotide);
690 #else
691 			reversePushNucleotide(&antiWord, 3 - nucleotide);
692 #endif
693 		}
694 	}
695 
696 	// Go through sequence
697 	while (readNucleotideIndex < getLength(tString)) {
698 		// Shift word:
699 		nucleotide = getNucleotide(readNucleotideIndex++, tString);
700 		pushNucleotide(&word, nucleotide);
701 		if (double_strand || second_in_pair) {
702 #ifdef COLOR
703 			reversePushNucleotide(&antiWord, nucleotide);
704 #else
705 			reversePushNucleotide(&antiWord, 3 - nucleotide);
706 #endif
707 		}
708 
709 		// Update annotation if necessary
710 		if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) {
711 			annotation = getNextAnnotation(annotation);
712 			annotCount++;
713 			annotIndex = 0;
714 		}
715 
716 		// Search for reference mapping
717  		if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) {
718 			refID = getAnnotSequenceID(annotation);
719 			if (refID > 0)
720 				refCoord = getStart(annotation) + annotIndex;
721 			else
722 				refCoord = getStart(annotation) - annotIndex;
723 
724 			refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
725 			// If success
726 			if (refMap) {
727 				if (refID > 0)
728 					node = getNodeInGraph(graph, refMap->nodeID);
729 				else
730 					node = getNodeInGraph(graph, -refMap->nodeID);
731 			} else  {
732 				node = NULL;
733 				if (previousNode)
734 					break;
735 			}
736 		}
737 		// if not.. look in table
738 		else {
739 			reversed = false;
740 			if (double_strand) {
741 				if (compareKmers(&word, &antiWord) <= 0) {
742 					kmerOccurence =
743 					findKmerInKmerOccurenceTable(&word,
744 								       kmerTable);
745 				} else {
746 					kmerOccurence =
747 					       findKmerInKmerOccurenceTable(&antiWord,
748 						kmerTable);
749 					reversed = true;
750 				}
751 			} else {
752 				if (!second_in_pair) {
753 					kmerOccurence =
754 					findKmerInKmerOccurenceTable(&word,
755 								       kmerTable);
756 				} else {
757 					kmerOccurence =
758 					       findKmerInKmerOccurenceTable(&antiWord,
759 						kmerTable);
760 					reversed = true;
761 				}
762 			}
763 
764 			if (kmerOccurence) {
765 				if (!reversed)
766 					node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
767 				else
768 					node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
769 			} else {
770 				node = NULL;
771 				if (previousNode)
772 					break;
773 			}
774 
775 		}
776 
777 		if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation))
778 			annotIndex++;
779 		else
780 			uniqueIndex++;
781 
782 		previousNode = node;
783 
784 		// Fill in graph
785 		if (node && !isNodeMemorized(node, nodePile))
786 		{
787 #ifdef _OPENMP
788 			lockNode(node);
789 #endif
790 			incrementReadStartCount(node, graph);
791 #ifdef _OPENMP
792 			unLockNode(node);
793 #endif
794 			memorizeNode(node, &nodePile);
795 		}
796 	}
797 
798 	unMemorizeNodes(&nodePile);
799 }
800 
threadSequenceThroughGraph(TightString * tString,KmerOccurenceTable * kmerTable,Graph * graph,IDnum seqID,Category category,boolean readTracking,boolean double_strand,ReferenceMapping * referenceMappings,Coordinate referenceMappingCount,IDnum refCount,Annotation * annotations,IDnum annotationCount,boolean second_in_pair)801 static void threadSequenceThroughGraph(TightString * tString,
802 				       KmerOccurenceTable * kmerTable,
803 				       Graph * graph,
804 				       IDnum seqID, Category category,
805 				       boolean readTracking,
806 				       boolean double_strand,
807 				       ReferenceMapping * referenceMappings,
808 				       Coordinate referenceMappingCount,
809 				       IDnum refCount,
810 				       Annotation * annotations,
811 				       IDnum annotationCount,
812 				       boolean second_in_pair)
813 {
814 	Kmer word;
815 	Kmer antiWord;
816 	Coordinate readNucleotideIndex;
817 	Coordinate kmerIndex;
818 	KmerOccurence *kmerOccurence;
819 	int wordLength = getWordLength(graph);
820 
821 	PassageMarkerI marker = NULL_IDX;
822 	PassageMarkerI previousMarker = NULL_IDX;
823 	Node *node = NULL;
824 	Node *previousNode = NULL;
825 	Coordinate coord = 0;
826 	Coordinate previousCoord = 0;
827 	Nucleotide nucleotide;
828 	boolean reversed;
829 
830 	IDnum refID;
831 	Coordinate refCoord = 0;
832 	ReferenceMapping * refMap;
833 	Annotation * annotation = annotations;
834 	Coordinate index = 0;
835 	Coordinate uniqueIndex = 0;
836 	Coordinate annotIndex = 0;
837 	IDnum annotCount = 0;
838 	SmallNodeList * nodePile = NULL;
839 
840 	// Neglect any string shorter than WORDLENGTH :
841 	if (getLength(tString) < wordLength)
842 		return;
843 
844 	clearKmer(&word);
845 	clearKmer(&antiWord);
846 
847 	// Fill in the initial word :
848 	for (readNucleotideIndex = 0;
849 	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
850 		nucleotide = getNucleotide(readNucleotideIndex, tString);
851 		pushNucleotide(&word, nucleotide);
852 		if (double_strand || second_in_pair) {
853 #ifdef COLOR
854 			reversePushNucleotide(&antiWord, nucleotide);
855 #else
856 			reversePushNucleotide(&antiWord, 3 - nucleotide);
857 #endif
858 		}
859 	}
860 
861 	// Go through sequence
862 	// printf("len %d\n", getLength(tString));
863 	while (readNucleotideIndex < getLength(tString)) {
864 		nucleotide = getNucleotide(readNucleotideIndex++, tString);
865 		pushNucleotide(&word, nucleotide);
866 		if (double_strand || second_in_pair) {
867 #ifdef COLOR
868 			reversePushNucleotide(&antiWord, nucleotide);
869 #else
870 			reversePushNucleotide(&antiWord, 3 - nucleotide);
871 #endif
872 		}
873 
874 		// Update annotation if necessary
875 		if (annotCount < annotationCount && annotIndex == getAnnotationLength(annotation)) {
876 			annotation = getNextAnnotation(annotation);
877 			annotCount++;
878 			annotIndex = 0;
879 		}
880 
881 		// Search for reference mapping
882 		if (category == REFERENCE) {
883 			if (referenceMappings)
884 				refMap = findReferenceMapping(seqID, index, referenceMappings, referenceMappingCount);
885 			else
886 				refMap = NULL;
887 
888 			if (refMap) {
889 				node = getNodeInGraph(graph, refMap->nodeID);
890 				if (refMap->nodeID > 0) {
891 					coord = refMap->nodeStart + (index - refMap->referenceStart);
892 				} else {
893 					coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (index - refMap->referenceStart);
894 				}
895 			} else  {
896 				node = NULL;
897 			}
898 		}
899 		// Search for reference-based mapping
900 		else if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation) && getAnnotSequenceID(annotation) <= refCount && getAnnotSequenceID(annotation) >= -refCount) {
901 			refID = getAnnotSequenceID(annotation);
902 			if (refID > 0)
903 				refCoord = getStart(annotation) + annotIndex;
904 			else
905 				refCoord = getStart(annotation) - annotIndex;
906 
907 			refMap = findReferenceMapping(refID, refCoord, referenceMappings, referenceMappingCount);
908 			// If success
909 			if (refMap) {
910 				if (refID > 0) {
911 					node = getNodeInGraph(graph, refMap->nodeID);
912 					if (refMap->nodeID > 0) {
913 						coord = refMap->nodeStart + (refCoord - refMap->referenceStart);
914 					} else {
915 						coord = getNodeLength(node) - refMap->nodeStart - refMap->length + (refCoord - refMap->referenceStart);
916 					}
917 				} else {
918 					node = getNodeInGraph(graph, -refMap->nodeID);
919 					if (refMap->nodeID > 0) {
920 						coord =  getNodeLength(node) - refMap->nodeStart - (refCoord - refMap->referenceStart) - 1;
921 					} else {
922 						coord = refMap->nodeStart + refMap->length - (refCoord - refMap->referenceStart) - 1;
923 					}
924 				}
925 			} else  {
926 				node = NULL;
927 				if (previousNode)
928 					break;
929 			}
930 		}
931 		// Search in table
932 		else {
933 			reversed = false;
934 			if (double_strand) {
935 				if (compareKmers(&word, &antiWord) <= 0) {
936 					kmerOccurence =
937 					findKmerInKmerOccurenceTable(&word,
938 								       kmerTable);
939 				} else {
940 					kmerOccurence =
941 					       findKmerInKmerOccurenceTable(&antiWord,
942 						kmerTable);
943 					reversed = true;
944 				}
945 			} else {
946 				if (!second_in_pair) {
947 					kmerOccurence =
948 					findKmerInKmerOccurenceTable(&word,
949 								       kmerTable);
950 				} else {
951 					kmerOccurence =
952 					       findKmerInKmerOccurenceTable(&antiWord,
953 						kmerTable);
954 					reversed = true;
955 				}
956 			}
957 
958 			if (kmerOccurence) {
959 				if (!reversed) {
960 					node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
961 					coord = getKmerOccurencePosition(kmerOccurence);
962 				} else {
963 					node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
964 					coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1;
965 				}
966 			} else {
967 				node = NULL;
968 				if (previousNode)
969 					break;
970 			}
971 		}
972 
973 		// Increment positions
974 		if (annotCount < annotationCount && uniqueIndex >= getPosition(annotation))
975 			annotIndex++;
976 		else
977 			uniqueIndex++;
978 
979 		// Fill in graph
980 		if (node)
981 		{
982 #ifdef _OPENMP
983 			lockNode(node);
984 #endif
985 			kmerIndex = readNucleotideIndex - wordLength;
986 
987 			if (previousNode == node
988 			    && previousCoord == coord - 1) {
989 				if (category / 2 >= CATEGORIES) {
990 					setPassageMarkerFinish(marker,
991 							       kmerIndex +
992 							       1);
993 					setFinishOffset(marker,
994 							getNodeLength(node)
995 							- coord - 1);
996 				} else {
997 #ifndef SINGLE_COV_CAT
998 					incrementVirtualCoverage(node, category / 2, 1);
999 					incrementOriginalVirtualCoverage(node, category / 2, 1);
1000 #else
1001 					incrementVirtualCoverage(node, 1);
1002 #endif
1003 				}
1004 #ifdef _OPENMP
1005 				unLockNode(node);
1006 #endif
1007 			} else {
1008 				if (category / 2 >= CATEGORIES) {
1009 					marker =
1010 					    newPassageMarker(seqID,
1011 							     kmerIndex,
1012 							     kmerIndex + 1,
1013 							     coord,
1014 							     getNodeLength
1015 							     (node) -
1016 							     coord - 1);
1017 					transposePassageMarker(marker,
1018 							       node);
1019 					connectPassageMarkers
1020 					    (previousMarker, marker,
1021 					     graph);
1022 					previousMarker = marker;
1023 				} else {
1024 					if (readTracking) {
1025 						if (!isNodeMemorized(node, nodePile)) {
1026 							addReadStart(node,
1027 								     seqID,
1028 								     coord,
1029 								     graph,
1030 								     kmerIndex);
1031 							memorizeNode(node, &nodePile);
1032 						} else {
1033 							blurLastShortReadMarker
1034 							    (node, graph);
1035 						}
1036 					}
1037 
1038 #ifndef SINGLE_COV_CAT
1039 					incrementVirtualCoverage(node, category / 2, 1);
1040 					incrementOriginalVirtualCoverage(node, category / 2, 1);
1041 #else
1042 					incrementVirtualCoverage(node, 1);
1043 #endif
1044 				}
1045 #ifdef _OPENMP
1046 				lockTwoNodes(node, previousNode);
1047 #endif
1048 				if (category != REFERENCE)
1049 					createArc(previousNode, node, graph);
1050 #ifdef _OPENMP
1051 				unLockTwoNodes(node, previousNode);
1052 #endif
1053 			}
1054 
1055 			previousNode = node;
1056 			previousCoord = coord;
1057 		}
1058 		index++;
1059 	}
1060 	// printKmer(&word);
1061 
1062 	if (readTracking && category / 2 < CATEGORIES)
1063 		unMemorizeNodes(&nodePile);
1064 }
1065 
fillUpGraph(ReadSet * reads,KmerOccurenceTable * kmerTable,Graph * graph,boolean readTracking,boolean double_strand,ReferenceMapping * referenceMappings,Coordinate referenceMappingCount,IDnum refCount,char * roadmapFilename)1066 static void fillUpGraph(ReadSet * reads,
1067 			KmerOccurenceTable * kmerTable,
1068 			Graph * graph,
1069 			boolean readTracking,
1070 			boolean double_strand,
1071 			ReferenceMapping * referenceMappings,
1072 			Coordinate referenceMappingCount,
1073 			IDnum refCount,
1074 			char * roadmapFilename)
1075 {
1076 	IDnum readIndex;
1077 	RoadMapArray *roadmap = NULL;
1078 	Coordinate *annotationOffset = NULL;
1079 	struct timeval start, end, diff;
1080 
1081 	if (referenceMappings)
1082 	{
1083 		roadmap = importRoadMapArray(roadmapFilename);
1084 		annotationOffset = callocOrExit(reads->readCount, Coordinate);
1085 		for (readIndex = 1; readIndex < reads->readCount; readIndex++)
1086 			annotationOffset[readIndex] = annotationOffset[readIndex - 1]
1087 						      + getAnnotationCount(getRoadMapInArray(roadmap, readIndex - 1));
1088 	}
1089 
1090 	resetNodeStatus(graph);
1091 	// Allocate memory for the read pairs
1092 	if (!readStartsAreActivated(graph))
1093 		activateReadStarts(graph);
1094 
1095 	gettimeofday(&start, NULL);
1096 #ifdef _OPENMP
1097 	initSmallNodeListMemory();
1098 	createNodeLocks(graph);
1099 	#pragma omp parallel for
1100 #endif
1101 	for (readIndex = refCount; readIndex < reads->readCount; readIndex++)
1102 	{
1103 		Annotation * annotations = NULL;
1104 		IDnum annotationCount = 0;
1105 		Category category;
1106 		boolean second_in_pair;
1107 
1108 		if (readIndex % 1000000 == 0)
1109 			velvetLog("Ghost Threading through reads %ld / %ld\n",
1110 				  (long) readIndex, (long) reads->readCount);
1111 
1112 		category = reads->categories[readIndex];
1113 		second_in_pair = reads->categories[readIndex] & 1 && isSecondInPair(reads, readIndex);
1114 
1115 		if (referenceMappings)
1116 		{
1117 			annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex));
1118 			annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]);
1119 		}
1120 
1121 		ghostThreadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex),
1122 						kmerTable,
1123 						graph, readIndex + 1,
1124 						category,
1125 						readTracking, double_strand,
1126 						referenceMappings, referenceMappingCount,
1127 					  	refCount, annotations, annotationCount,
1128 						second_in_pair);
1129 	}
1130 	createNodeReadStartArrays(graph);
1131 	gettimeofday(&end, NULL);
1132 	timersub(&end, &start, &diff);
1133 	velvetLog(" === Ghost-Threaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec);
1134 
1135 	gettimeofday(&start, NULL);
1136 #ifdef _OPENMP
1137 	int threads = omp_get_max_threads();
1138 	if (threads > 32)
1139 		threads = 32;
1140 
1141 	#pragma omp parallel for num_threads(threads)
1142 #endif
1143 	for (readIndex = 0; readIndex < reads->readCount; readIndex++)
1144 	{
1145 		Annotation * annotations = NULL;
1146 		IDnum annotationCount = 0;
1147 		Category category;
1148 		boolean second_in_pair;
1149 
1150 		if (readIndex % 1000000 == 0)
1151 			velvetLog("Threading through reads %li / %li\n",
1152 				  (long) readIndex, (long) reads->readCount);
1153 
1154 		category = reads->categories[readIndex];
1155 		second_in_pair = reads->categories[readIndex] % 2 && isSecondInPair(reads, readIndex);
1156 
1157 		if (referenceMappings)
1158 		{
1159 			annotationCount = getAnnotationCount(getRoadMapInArray(roadmap, readIndex));
1160 			annotations = getAnnotationInArray(roadmap->annotations, annotationOffset[readIndex]);
1161 		}
1162 
1163 		threadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex),
1164 					   kmerTable,
1165 					   graph, readIndex + 1, category,
1166 					   readTracking, double_strand,
1167 					   referenceMappings, referenceMappingCount,
1168 					   refCount, annotations, annotationCount, second_in_pair);
1169 	}
1170 	gettimeofday(&end, NULL);
1171 	timersub(&end, &start, &diff);
1172 	velvetLog(" === Threaded in %ld.%06ld s\n", (long) diff.tv_sec, (long) diff.tv_usec);
1173 
1174 #ifdef _OPENMP
1175 	free(nodeLocks);
1176 	nodeLocks = NULL;
1177 #endif
1178 
1179 	if (referenceMappings)
1180 	{
1181 		destroyRoadMapArray(roadmap);
1182 		free (annotationOffset);
1183 	}
1184 
1185 	orderNodeReadStartArrays(graph);
1186 
1187 	destroySmallNodeListMemmory();
1188 
1189 	destroyKmerOccurenceTable(kmerTable);
1190 }
1191 
importPreGraph(char * preGraphFilename,ReadSet * reads,char * roadmapFilename,boolean readTracking,short int accelerationBits)1192 Graph *importPreGraph(char *preGraphFilename, ReadSet * reads, char * roadmapFilename,
1193 		      boolean readTracking, short int accelerationBits)
1194 {
1195 	boolean double_strand = false;
1196 	Graph *graph = readPreGraphFile(preGraphFilename, &double_strand);
1197 	Coordinate referenceMappingCount = 0;
1198 	IDnum referenceCount = 0;
1199 
1200 	if (nodeCount(graph) == 0)
1201 		return graph;
1202 
1203 	// If necessary compile reference -> node
1204 	ReferenceMapping * referenceMappings = computeReferenceMappings(preGraphFilename, reads, &referenceMappingCount, &referenceCount);
1205 	// Node -> reference maps
1206 	NodeMask * nodeMasks = computeNodeMasks(referenceMappings, referenceMappingCount, graph);
1207 
1208 	// Map k-mers to nodes
1209 	KmerOccurenceTable *kmerTable =
1210 	    referenceGraphKmers(preGraphFilename, accelerationBits, graph, double_strand, nodeMasks, referenceMappingCount);
1211 
1212 	free(nodeMasks);
1213 
1214 	// Map sequences -> kmers -> nodes
1215 	fillUpGraph(reads, kmerTable, graph, readTracking, double_strand, referenceMappings, referenceMappingCount, referenceCount, roadmapFilename);
1216 
1217 	free(referenceMappings);
1218 
1219 	return graph;
1220 }
1221 
addReadsToGraph(TightString * tString,KmerOccurenceTable * kmerTable,Graph * graph,IDnum seqID,Category category,boolean readTracking,boolean double_strand,boolean second_in_pair)1222 static void addReadsToGraph(TightString * tString,
1223 				       KmerOccurenceTable * kmerTable,
1224 				       Graph * graph,
1225 				       IDnum seqID, Category category,
1226 				       boolean readTracking,
1227 				       boolean double_strand,
1228 				       boolean second_in_pair)
1229 {
1230 	Kmer word;
1231 	Kmer antiWord;
1232 	Coordinate readNucleotideIndex;
1233 	Coordinate kmerIndex;
1234 	KmerOccurence *kmerOccurence;
1235 	int wordLength = getWordLength(graph);
1236 
1237 	Node *node = NULL;
1238 	Node *previousNode = NULL;
1239 	Coordinate coord = 0;
1240 	Coordinate previousCoord = 0;
1241 	Nucleotide nucleotide;
1242 	boolean reversed;
1243 
1244 	Coordinate index = 0;
1245 	SmallNodeList * nodePile = NULL;
1246 
1247 	// Neglect any read which will not be short paired
1248 	if (category / 2 >= CATEGORIES)
1249 		return;
1250 
1251 	// Neglect any string shorter than WORDLENGTH :
1252 	if (getLength(tString) < wordLength)
1253 		return;
1254 
1255 	clearKmer(&word);
1256 	clearKmer(&antiWord);
1257 
1258 	// Fill in the initial word :
1259 	for (readNucleotideIndex = 0;
1260 	     readNucleotideIndex < wordLength - 1; readNucleotideIndex++) {
1261 		nucleotide = getNucleotide(readNucleotideIndex, tString);
1262 		pushNucleotide(&word, nucleotide);
1263 		if (double_strand || second_in_pair) {
1264 #ifdef COLOR
1265 			reversePushNucleotide(&antiWord, nucleotide);
1266 #else
1267 			reversePushNucleotide(&antiWord, 3 - nucleotide);
1268 #endif
1269 		}
1270 	}
1271 
1272 	// Go through sequence
1273 	// printf("len %d\n", getLength(tString));
1274 	while (readNucleotideIndex < getLength(tString)) {
1275 		nucleotide = getNucleotide(readNucleotideIndex++, tString);
1276 		pushNucleotide(&word, nucleotide);
1277 		if (double_strand || second_in_pair) {
1278 #ifdef COLOR
1279 			reversePushNucleotide(&antiWord, nucleotide);
1280 #else
1281 			reversePushNucleotide(&antiWord, 3 - nucleotide);
1282 #endif
1283 		}
1284 
1285 		// Search in table
1286 		reversed = false;
1287 		if (double_strand) {
1288 			if (compareKmers(&word, &antiWord) <= 0) {
1289 				kmerOccurence =
1290 					findKmerInKmerOccurenceTable(&word,
1291 							kmerTable);
1292 			} else {
1293 				kmerOccurence =
1294 					findKmerInKmerOccurenceTable(&antiWord,
1295 							kmerTable);
1296 				reversed = true;
1297 			}
1298 		} else {
1299 			if (!second_in_pair) {
1300 				kmerOccurence =
1301 					findKmerInKmerOccurenceTable(&word,
1302 							kmerTable);
1303 			} else {
1304 				kmerOccurence =
1305 					findKmerInKmerOccurenceTable(&antiWord,
1306 							kmerTable);
1307 				reversed = true;
1308 			}
1309 		}
1310 
1311 		if (kmerOccurence) {
1312 			if (!reversed) {
1313 				node = getNodeInGraph(graph, getKmerOccurenceNodeID(kmerOccurence));
1314 				coord = getKmerOccurencePosition(kmerOccurence);
1315 			} else {
1316 				node = getNodeInGraph(graph, -getKmerOccurenceNodeID(kmerOccurence));
1317 				coord = getNodeLength(node) - getKmerOccurencePosition(kmerOccurence) - 1;
1318 			}
1319 		} else {
1320 			node = NULL;
1321 			if (previousNode)
1322 				break;
1323 		}
1324 
1325 		// Fill in graph
1326 		if (node)
1327 		{
1328 #ifdef _OPENMP
1329 			lockNode(node);
1330 #endif
1331 			kmerIndex = readNucleotideIndex - wordLength;
1332 
1333 			if (previousNode != node || previousCoord != coord -1) {
1334 				if (!isNodeMemorized(node, nodePile)) {
1335 					addReadStart(node,
1336 							seqID,
1337 							coord,
1338 							graph,
1339 							kmerIndex);
1340 					memorizeNode(node, &nodePile);
1341 				} else {
1342 					blurLastShortReadMarker
1343 						(node, graph);
1344 				}
1345 			}
1346 #ifdef _OPENMP
1347 			unLockNode(node);
1348 #endif
1349 			previousNode = node;
1350 			previousCoord = coord;
1351 		}
1352 		index++;
1353 	}
1354 	// printKmer(&word);
1355 
1356 	if (category / 2 < CATEGORIES)
1357 		unMemorizeNodes(&nodePile);
1358 }
1359 
fillUpConnectedGraph(ReadSet * reads,KmerOccurenceTable * kmerTable,Graph * graph,boolean readTracking,boolean double_strand)1360 static void fillUpConnectedGraph(ReadSet * reads,
1361 			KmerOccurenceTable * kmerTable,
1362 			Graph * graph,
1363 			boolean readTracking,
1364 			boolean double_strand)
1365 {
1366 	IDnum refCount = 0;   // refs not present in connected graphs
1367 	IDnum readIndex;
1368 	struct timeval start, end, diff;
1369 
1370 	resetNodeStatus(graph);
1371 	// Allocate memory for the read pairs
1372 	if (!readStartsAreActivated(graph))
1373 		activateReadStarts(graph);
1374 
1375 	gettimeofday(&start, NULL);
1376 #ifdef _OPENMP
1377 	initSmallNodeListMemory();
1378 	createNodeLocks(graph);
1379 	#pragma omp parallel for
1380 #endif
1381 	for (readIndex = refCount; readIndex < reads->readCount; readIndex++)
1382 	{
1383 		Category category;
1384 		boolean second_in_pair;
1385 
1386 		if (readIndex % 1000000 == 0)
1387 			velvetLog("Ghost Threading through reads %ld / %ld\n",
1388 				  (long) readIndex, (long) reads->readCount);
1389 
1390 		category = reads->categories[readIndex];
1391 		second_in_pair = reads->categories[readIndex] & 1 && isSecondInPair(reads, readIndex);
1392 
1393 		// referenceMappings = NULL, referenceMappingCount = 0
1394 		// refCount = 0, annotations = NULL, annotationCount = 0
1395 		ghostThreadSequenceThroughGraph(getTightStringInArray(reads->tSequences, readIndex),
1396 						kmerTable,
1397 						graph, readIndex + 1,
1398 						category,
1399 						readTracking, double_strand,
1400 						NULL, 0,
1401 					  	0, NULL, 0,
1402 						second_in_pair);
1403 	}
1404 	createNodeReadStartArrays(graph);
1405 	gettimeofday(&end, NULL);
1406 	timersub(&end, &start, &diff);
1407 	velvetLog(" === Ghost-Threaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec);
1408 
1409 	gettimeofday(&start, NULL);
1410 #ifdef _OPENMP
1411 	int threads = omp_get_max_threads();
1412 	if (threads > 32)
1413 		threads = 32;
1414 
1415 	#pragma omp parallel for num_threads(threads)
1416 #endif
1417 	for (readIndex = 0; readIndex < reads->readCount; readIndex++)
1418 	{
1419 		Category category;
1420 		boolean second_in_pair;
1421 
1422 		if (readIndex % 1000000 == 0)
1423 			velvetLog("Adding reads %li / %li\n",
1424 				  (long) readIndex, (long) reads->readCount);
1425 
1426 		category = reads->categories[readIndex];
1427 		second_in_pair = reads->categories[readIndex] % 2 && isSecondInPair(reads, readIndex);
1428 
1429 		addReadsToGraph(getTightStringInArray(reads->tSequences, readIndex),
1430 					   kmerTable,
1431 					   graph, readIndex + 1, category,
1432 					   readTracking, double_strand, second_in_pair);
1433 	}
1434 	gettimeofday(&end, NULL);
1435 	timersub(&end, &start, &diff);
1436 	velvetLog(" === Threaded in %ld.%06ld s\n", diff.tv_sec, diff.tv_usec);
1437 
1438 #ifdef _OPENMP
1439 	free(nodeLocks);
1440 	nodeLocks = NULL;
1441 #endif
1442 
1443 	orderNodeReadStartArrays(graph);
1444 
1445 	destroySmallNodeListMemmory();
1446 
1447 	destroyKmerOccurenceTable(kmerTable);
1448 }
1449 
importConnectedGraph(char * connectedGraphFilename,ReadSet * reads,char * roadmapFilename,boolean readTracking,short int accelerationBits)1450 Graph *importConnectedGraph(char *connectedGraphFilename, ReadSet * reads, char * roadmapFilename,
1451 		      boolean readTracking, short int accelerationBits)
1452 {
1453 	boolean double_strand = false;
1454 	Graph *graph = readConnectedGraphFile(connectedGraphFilename, &double_strand);
1455 
1456 	if (nodeCount(graph) == 0)
1457 		return graph;
1458 
1459 	if (readTracking) {
1460 		Coordinate referenceMappingCount = 0;
1461 		NodeMask * nodeMasks = NULL;
1462 
1463 		// Map k-mers to nodes
1464 		KmerOccurenceTable *kmerTable =
1465 			referenceGraphKmers(connectedGraphFilename, accelerationBits, graph, doubleStrandedGraph(graph), nodeMasks, referenceMappingCount);
1466 
1467 		// Map sequences -> kmers -> nodes
1468 		fillUpConnectedGraph(reads, kmerTable, graph, readTracking, double_strand);
1469 	}
1470 
1471 	return graph;
1472 }
1473