1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3 
4     This file is part of Velvet.
5 
6     Velvet is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10 
11     Velvet is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with Velvet; if not, write to the Free Software
18     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <math.h>
24 #include <string.h>
25 
26 #include "globals.h"
27 #include "graph.h"
28 #include "readSet.h"
29 #include "tightString.h"
30 #include "passageMarker.h"
31 #include "concatenatedGraph.h"
32 #include "readCoherentGraph.h"
33 #include "fibHeap.h"
34 #include "utility.h"
35 // Original
36 #include "shortReadPairs.h"
37 // Original
38 
39 // Original
40 #define LEN_HISTO_X 10000
41 // Original
42 
surveyPath(PassageMarker * marker)43 static void surveyPath(PassageMarker * marker)
44 {
45 	Coordinate length = 0;
46 	Coordinate realLength = 0;
47 	PassageMarker *current = marker;
48 
49 	if (passageMarkerDirection(current) < 0)
50 		current = getTwinMarker(current);
51 
52 	for (; current != NULL; current = getNextInSequence(current)) {
53 		length +=
54 		    getNodeLength(getNode(current)) -
55 		    getStartOffset(current) - getFinishOffset(current);
56 		if (getPassageMarkerFinish(current) > 0)
57 			realLength = getPassageMarkerFinish(current);
58 	}
59 
60 	printf("SURVEY %ld %lld %lld\n", (long) getAbsolutePassMarkerSeqID(marker),
61 	       (long long) realLength, (long long) length);
62 }
63 
surveyPaths(Graph * graph)64 void surveyPaths(Graph * graph)
65 {
66 	IDnum ID;
67 	PassageMarker *marker;
68 
69 	for (ID = 1; ID <= nodeCount(graph); ID++)
70 		for (marker = getMarker(getNodeInGraph(graph, ID));
71 		     marker != NULL; marker = getNextInNode(marker))
72 			if ((passageMarkerDirection(marker) > 0
73 			     && isInitial(marker))
74 			    || (passageMarkerDirection(marker) < 0
75 				&& isTerminal(marker)))
76 				surveyPath(marker);
77 }
78 
copyMarkers(Node * node)79 static PassageMarkerList *copyMarkers(Node * node)
80 {
81 	PassageMarkerList *list = NULL;
82 	PassageMarkerList *new;
83 	PassageMarker *currentMarker;
84 
85 	for (currentMarker = getMarker(node); currentMarker != NULL;
86 	     currentMarker = getNextInNode(currentMarker)) {
87 		new = newPassageMarkerList(currentMarker, list);
88 		list = new;
89 	}
90 
91 	return list;
92 }
93 
removeDead(PassageMarkerList ** list)94 static boolean removeDead(PassageMarkerList ** list)
95 {
96 	PassageMarkerList *current, *next;
97 	boolean removed = false;
98 
99 	if (*list == NULL)
100 		return false;
101 
102 	current = *list;
103 
104 	while (current->next != NULL) {
105 		next = current->next;
106 
107 		if (isTerminal(next->marker)) {
108 			removed = true;
109 			current->next = next->next;
110 			deallocatePassageMarkerList(next);
111 		} else
112 			current = current->next;
113 	}
114 
115 	current = *list;
116 	if (isTerminal(current->marker)) {
117 		removed = true;
118 		*list = current->next;
119 		deallocatePassageMarkerList(current);
120 	}
121 
122 	return removed;
123 }
124 
chooseDestination(PassageMarkerList * list)125 static Node *chooseDestination(PassageMarkerList * list)
126 {
127 	PassageMarkerList *current = list;
128 	Node *destination;
129 
130 	destination = getNode(getNextInSequence(current->marker));
131 	while (current != NULL) {
132 		if (getNode(getNextInSequence(current->marker)) !=
133 		    destination)
134 			return NULL;
135 		current = current->next;
136 	}
137 
138 	return destination;
139 }
140 
destroyPassageMarkerList(PassageMarkerList ** list)141 static void destroyPassageMarkerList(PassageMarkerList ** list)
142 {
143 	PassageMarkerList *ptr;
144 
145 	while (*list != NULL) {
146 		ptr = *list;
147 		*list = ptr->next;
148 		deallocatePassageMarkerList(ptr);
149 	}
150 }
151 
updateMarkers(PassageMarkerList * list)152 static void updateMarkers(PassageMarkerList * list)
153 {
154 	PassageMarkerList *current;
155 
156 	for (current = list; current != NULL; current = current->next)
157 		current->marker = getNextInSequence(current->marker);
158 }
159 
computeSubsequentNodesLength(Node * node)160 Coordinate computeSubsequentNodesLength(Node * node)
161 {
162 	PassageMarkerList *list;
163 	Node *nextNode;
164 	Coordinate totalLength = 0;
165 	boolean uncertain = false;
166 
167 	list = copyMarkers(node);
168 
169 	while (true) {
170 		if (removeDead(&list))
171 			uncertain = true;
172 
173 		if (uncertain && simpleArcCount(node) > 1) {
174 			destroyPassageMarkerList(&list);
175 			return totalLength;
176 		}
177 
178 		if (list == NULL)
179 			return totalLength;
180 
181 		nextNode = chooseDestination(list);
182 		if (nextNode == NULL) {
183 			destroyPassageMarkerList(&list);
184 			return totalLength;
185 		}
186 
187 		totalLength += getNodeLength(nextNode);
188 
189 		updateMarkers(list);
190 	}
191 
192 	// Impossible instruction
193 	return -1;
194 }
195 
computeVirtualNodeLength(Node * node)196 Coordinate computeVirtualNodeLength(Node * node)
197 {
198 	Coordinate virtualLength;
199 
200 	if (node == NULL)
201 		return 0;
202 
203 	virtualLength = getNodeLength(node);
204 
205 	virtualLength += computeSubsequentNodesLength(node);
206 	virtualLength += computeSubsequentNodesLength(getTwinNode(node));
207 
208 	return virtualLength;
209 }
210 
testForBizarreMarkers(Graph * graph)211 void testForBizarreMarkers(Graph * graph)
212 {
213 	IDnum index;
214 	Node *node;
215 	PassageMarker *marker;
216 
217 	for (index = 1; index < nodeCount(graph); index++) {
218 		node = getNodeInGraph(graph, index);
219 		if (node == NULL)
220 			continue;
221 
222 		for (marker = getMarker(node); marker != NULL;
223 		     marker = getNextInNode(marker)) {
224 			if (getTwinMarker(marker) == NULL)
225 				exitErrorf(EXIT_FAILURE, false, "Bizarre marker %s",
226 				       readPassageMarker(marker));
227 		}
228 	}
229 }
230 
231 // COunts how many nodes are dead-ends
countSinksAndSources(Graph * graph)232 IDnum countSinksAndSources(Graph * graph)
233 {
234 	IDnum nodeIndex;
235 	IDnum result = 0;
236 	Node *node;
237 
238 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
239 		node = getNodeInGraph(graph, nodeIndex);
240 		if (arcCount(node) == 0
241 		    || arcCount(getTwinNode(node)) == 0)
242 			result++;
243 	}
244 
245 	return result;
246 }
247 
248 // Counts how many nodes have several arcs either going in or coming out
countTangles(Graph * graph)249 IDnum countTangles(Graph * graph)
250 {
251 	IDnum nodeIndex;
252 	IDnum result = 0;
253 	Node *node;
254 
255 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
256 		node = getNodeInGraph(graph, nodeIndex);
257 		if (arcCount(node) > 1 || arcCount(getTwinNode(node)) > 1)
258 			result++;
259 	}
260 	return result;
261 }
262 
263 // Counts nodes with exactly one incoming and one outgoing arc
countRepeats(Graph * graph)264 IDnum countRepeats(Graph * graph)
265 {
266 	IDnum nodeIndex;
267 	IDnum result = 0;
268 	Node *node;
269 
270 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
271 		node = getNodeInGraph(graph, nodeIndex);
272 		if (arcCount(node) == 1
273 		    && arcCount(getTwinNode(getDestination(getArc(node))))
274 		    == 1)
275 			if (getNodeID
276 			    (getTwinNode(getDestination(getArc(node)))) < 0
277 			    ||
278 			    getNodeID(getTwinNode
279 				      (getDestination(getArc(node)))) >
280 			    getNodeID(node))
281 				result++;
282 	}
283 	return result;
284 
285 }
286 
287 // Counts the number of markers for one node
nodeGenomicMultiplicity(Node * node,IDnum firstStrain)288 int nodeGenomicMultiplicity(Node * node, IDnum firstStrain)
289 {
290 	int counter = 0;
291 	PassageMarker *marker;
292 
293 	if (node == NULL)
294 		return 0;
295 
296 	for (marker = getMarker(node); marker != NULL;
297 	     marker = getNextInNode(marker))
298 		if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
299 			counter++;
300 
301 	return counter;
302 }
303 
304 // Counts the number of markers for one node
nodeMultiplicity(Node * node)305 IDnum nodeMultiplicity(Node * node)
306 {
307 	int counter = 0;
308 	PassageMarker *marker;
309 
310 	if (node == NULL)
311 		return 0;
312 
313 	marker = getMarker(node);
314 	while (marker != NULL) {
315 		counter++;
316 		marker = getNextInNode(marker);
317 	}
318 	return counter;
319 }
320 
321 // Prints out a set of predefined statistics for one node
nodeStatistics(Node * node)322 char *nodeStatistics(Node * node)
323 {
324 	char *s = mallocOrExit(100, char);
325 	sprintf(s, "NODE %ld\t%lld\t%i\t%i\t%ld", (long) getNodeID(node),
326 		(long long) getNodeLength(node), simpleArcCount(node),
327 		simpleArcCount(getTwinNode(node)), (long) nodeMultiplicity(node));
328 	return s;
329 }
330 
331 // Prints out a table of statistics for all the nodes of the graph
displayGraphStatistics(Graph * graph)332 void displayGraphStatistics(Graph * graph)
333 {
334 	IDnum nodeIndex;
335 	printf("NODE ID\tlgth\tFwd\tBck\tMult\n");
336 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++)
337 		printf("%s\n",
338 		       nodeStatistics(getNodeInGraph(graph, nodeIndex)));
339 }
340 
displayNodeStatisticsSelective(Node * node,IDnum first)341 void displayNodeStatisticsSelective(Node * node, IDnum first)
342 {
343 	PassageMarker *marker;
344 	boolean originalGenome;
345 	boolean strain;
346 
347 	if (node == NULL)
348 		return;
349 
350 	marker = getMarker(node);
351 
352 	originalGenome = false;
353 	strain = false;
354 	while (marker != NULL) {
355 		if (getAbsolutePassMarkerSeqID(marker) < first)
356 			originalGenome = true;
357 
358 		if (getAbsolutePassMarkerSeqID(marker) >= first)
359 			strain = true;
360 
361 		marker = getNextInNode(marker);
362 	}
363 
364 	printf("%s", nodeStatistics(node));
365 	if (originalGenome && !strain)
366 		printf("\tTRUE");
367 	else
368 		printf("\tFALSE");
369 
370 	if (originalGenome && strain)
371 		printf("\tTRUE");
372 	else
373 		printf("\tFALSE");
374 
375 
376 	if (strain && !originalGenome)
377 		puts("\tTRUE");
378 	else
379 		puts("\tFALSE");
380 
381 }
382 
displayGraphStatisticsSelective(Graph * graph,IDnum first)383 void displayGraphStatisticsSelective(Graph * graph, IDnum first)
384 {
385 	IDnum index;
386 
387 	for (index = 1; index <= nodeCount(graph); index++)
388 		displayNodeStatisticsSelective(getNodeInGraph
389 					       (graph, index), first);
390 
391 }
392 
isOnlyGenome(Node * node,IDnum firstStrain)393 boolean isOnlyGenome(Node * node, IDnum firstStrain)
394 {
395 	PassageMarker *marker;
396 
397 	for (marker = getMarker(node); marker != NULL;
398 	     marker = getNextInNode(marker))
399 		if (getAbsolutePassMarkerSeqID(marker) >= firstStrain)
400 			return false;
401 
402 	return true;
403 }
404 
isOnlyStrain(Node * node,IDnum firstStrain)405 boolean isOnlyStrain(Node * node, IDnum firstStrain)
406 {
407 	PassageMarker *marker;
408 
409 	for (marker = getMarker(node); marker != NULL;
410 	     marker = getNextInNode(marker))
411 		if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
412 			return false;
413 
414 	return true;
415 }
416 
isSNP(Node * node,IDnum firstStrain,int WORDLENGTH)417 boolean isSNP(Node * node, IDnum firstStrain, int WORDLENGTH)
418 {
419 	IDnum sequence;
420 	Coordinate position;
421 
422 	if (getNodeLength(node) != WORDLENGTH)
423 		return false;
424 
425 	if (getMarker(node) == NULL)
426 		return false;
427 
428 	if (getAbsolutePassMarkerSeqID(getMarker(node)) >= firstStrain)
429 		return false;
430 
431 	if (getNextInNode(getMarker(node)) != NULL)
432 		return false;
433 
434 	if (arcCount(node) != 1)
435 		return false;
436 
437 	if (arcCount(getTwinNode(node)) != 1)
438 		return false;
439 
440 	if (isOnlyGenome(getDestination(getArc(node)), firstStrain))
441 		return false;
442 
443 	if (isOnlyGenome
444 	    (getDestination(getArc(getTwinNode(node))), firstStrain))
445 		return false;
446 
447 	sequence = getPassageMarkerSequenceID(getMarker(node));
448 
449 	if (sequence >= 0)
450 		position = getPassageMarkerStart(getMarker(node));
451 	else {
452 		sequence = -sequence;
453 		position = getPassageMarkerFinish(getMarker(node));
454 	}
455 
456 	printf("SNP\t%lld\t%ld\n", (long long) position, (long) sequence);
457 
458 	return true;
459 }
460 
strainMarkerCount(Node * node,IDnum firstStrain)461 IDnum strainMarkerCount(Node * node, IDnum firstStrain)
462 {
463 	PassageMarker *marker;
464 	IDnum counter = 0;
465 
466 	for (marker = getMarker(node); marker != NULL;
467 	     marker = getNextInNode(marker))
468 		if (getAbsolutePassMarkerSeqID(marker) >= firstStrain)
469 			counter++;
470 
471 	return counter;
472 }
473 
isError(Node * node,IDnum firstStrain)474 boolean isError(Node * node, IDnum firstStrain)
475 {
476 	return (strainMarkerCount(node, firstStrain) < 5);
477 }
478 
removeStrainMarkers(Node * node,IDnum firstStrain)479 void removeStrainMarkers(Node * node, IDnum firstStrain)
480 {
481 	PassageMarker *marker;
482 	PassageMarker *tmp = NULL;
483 
484 	marker = getMarker(node);
485 	while (marker != NULL) {
486 		tmp = getNextInNode(marker);
487 
488 		if (getAbsolutePassMarkerSeqID(marker) >= firstStrain)
489 			destroyPassageMarker(marker);
490 		marker = tmp;
491 	}
492 
493 }
494 
chainSawCorrection(Graph * graph,int minMult)495 void chainSawCorrection(Graph * graph, int minMult)
496 {
497 	IDnum nodeIndex;
498 	IDnum removed = 0;
499 
500 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
501 		if (markerCount(getNodeInGraph(graph, nodeIndex)) <
502 		    minMult) {
503 			destroyNode(getNodeInGraph(graph, nodeIndex),
504 				    graph);
505 			removed++;
506 		}
507 	}
508 
509 	printf("%d dubious nodes removed\n", removed);
510 	concatenateGraph(graph);
511 	printf("%d node in the end\n", nodeCount(graph));
512 }
513 
grossErrorRemoval(Graph * graph,IDnum firstStrain)514 void grossErrorRemoval(Graph * graph, IDnum firstStrain)
515 {
516 	IDnum nodeIndex;
517 	IDnum removed = 0;
518 
519 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
520 		if (isError(getNodeInGraph(graph, nodeIndex), firstStrain)) {
521 			if (isOnlyStrain
522 			    (getNodeInGraph(graph, nodeIndex),
523 			     firstStrain)) {
524 				destroyNode(getNodeInGraph
525 					    (graph, nodeIndex), graph);
526 				removed++;
527 			} else
528 				removeStrainMarkers(getNodeInGraph
529 						    (graph, nodeIndex),
530 						    firstStrain);
531 		}
532 	}
533 
534 	printf("%d dubious nodes removed\n", removed);
535 	concatenateGraph(graph);
536 	printf("%d node in the end\n", nodeCount(graph));
537 }
538 
countSNPs(Graph * graph,IDnum firstStrain,int WORDLENGTH)539 IDnum countSNPs(Graph * graph, IDnum firstStrain, int WORDLENGTH)
540 {
541 	IDnum index;
542 	IDnum counter = 0;
543 
544 	for (index = 1; index < nodeCount(graph); index++)
545 		if (isSNP
546 		    (getNodeInGraph(graph, index), firstStrain,
547 		     WORDLENGTH))
548 			counter++;
549 
550 	return counter;
551 }
552 
commonLength(Node * node,IDnum firstStrain)553 Coordinate commonLength(Node * node, IDnum firstStrain)
554 {
555 	PassageMarker *marker = getMarker(node);
556 	int orig = 0;
557 	int strain = 0;
558 
559 	while (marker != NULL) {
560 		if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
561 			orig++;
562 		else
563 			strain++;
564 		marker = getNextInNode(marker);
565 	}
566 
567 	if (orig == 0 || strain == 0)
568 		return 0;
569 
570 	return (Coordinate) orig *getNodeLength(node);
571 }
572 
countCommonLength(Graph * graph,IDnum firstStrain)573 Coordinate countCommonLength(Graph * graph, IDnum firstStrain)
574 {
575 	IDnum index;
576 	Coordinate res = 0;
577 
578 	for (index = 1; index <= nodeCount(graph); index++)
579 		res +=
580 		    commonLength(getNodeInGraph(graph, index),
581 				 firstStrain);
582 
583 	return res;
584 }
585 
isMixed(Node * node,IDnum firstStrain)586 boolean isMixed(Node * node, IDnum firstStrain)
587 {
588 	return !isOnlyStrain(node, firstStrain)
589 	    && !isOnlyGenome(node, firstStrain);
590 }
591 
countLocalBreakpoints(PassageMarker * marker,IDnum firstStrain)592 int countLocalBreakpoints(PassageMarker * marker, IDnum firstStrain)
593 {
594 	PassageMarker *localMarker;
595 	IDnum sequenceID = getAbsolutePassMarkerSeqID(marker);
596 	IDnum localSeqID;
597 	Coordinate start = getPassageMarkerStart(marker);
598 	Node *localNode = getNode(marker);
599 	Node *destination;
600 	Arc *arc;
601 	int arcCount = 0;
602 	int arcIndex;
603 	boolean *arcStatus;
604 	int counter = 0;
605 
606 	if (!isMixed(localNode, firstStrain))
607 		return 0;
608 
609 	// Count arcs
610 	for (arc = getArc(localNode); arc != NULL; arc = getNextArc(arc))
611 		arcCount++;
612 	arcStatus = callocOrExit(arcCount, boolean);
613 	// Check for other genomic markers in node
614 	for (localMarker = getMarker(localNode); localMarker != NULL;
615 	     localMarker = getNextInNode(localMarker)) {
616 		localSeqID = getAbsolutePassMarkerSeqID(localMarker);
617 		if (localSeqID >= firstStrain)
618 			continue;
619 
620 		if (localSeqID < sequenceID)
621 			return 0;
622 
623 		if (localSeqID == sequenceID
624 		    && getPassageMarkerStart(localMarker) < start)
625 			return 0;
626 
627 		destination = getNode(getNextInSequence(localMarker));
628 
629 		// Enter into table:
630 		arcIndex = 0;
631 		for (arc = getArc(localNode);
632 		     getDestination(arc) != destination;
633 		     arc = getNextArc(arc))
634 			arcIndex++;
635 		arcStatus[arcIndex] = true;
636 	}
637 
638 	// Check other nodes
639 	arcIndex = 0;
640 	for (arc = getArc(localNode); arc != NULL; arc = getNextArc(arc)) {
641 		if (!arcStatus[arcIndex]
642 		    && isMixed(getDestination(arc), firstStrain))
643 			counter++;
644 		arcIndex++;
645 	}
646 
647 	free(arcStatus);
648 	return counter;
649 }
650 
countBreakpoints(Graph * graph,IDnum firstStrain)651 IDnum countBreakpoints(Graph * graph, IDnum firstStrain)
652 {
653 	PassageMarker *marker;
654 	IDnum seqIndex;
655 	IDnum total = 0;
656 
657 	for (seqIndex = 1; seqIndex < firstStrain; seqIndex++) {
658 		marker = getMarker(getNodeInGraph(graph, seqIndex));
659 		while (marker != NULL) {
660 			total +=
661 			    countLocalBreakpoints(marker, firstStrain);
662 			marker = getNextInSequence(marker);
663 		}
664 	}
665 
666 	return total;
667 }
668 
countStrainOnlyNodes(Graph * graph,IDnum firstStrain)669 IDnum countStrainOnlyNodes(Graph * graph, IDnum firstStrain)
670 {
671 	IDnum index;
672 	IDnum total = 0;
673 
674 	for (index = 1; index <= nodeCount(graph); index++)
675 		if (isOnlyStrain
676 		    (getNodeInGraph(graph, index), firstStrain))
677 			total++;
678 
679 	return total;
680 }
681 
countStrainOnlyBp(Graph * graph,IDnum firstStrain)682 Coordinate countStrainOnlyBp(Graph * graph, IDnum firstStrain)
683 {
684 	IDnum index;
685 	Coordinate total = 0;
686 	Node *node;
687 	Arc *arc;
688 	Coordinate local;
689 
690 	for (index = 1; index <= nodeCount(graph); index++) {
691 		if (isOnlyStrain
692 		    (getNodeInGraph(graph, index), firstStrain)) {
693 			node = getNodeInGraph(graph, index);
694 			local = getNodeLength(node);
695 
696 			for (arc = getArc(node); arc != NULL;
697 			     arc = getNextArc(arc)) {
698 				if (!isOnlyStrain
699 				    (getDestination(arc), firstStrain)) {
700 					local -= 24;
701 					break;
702 				}
703 			}
704 
705 			for (arc = getArc(getTwinNode(node)); arc != NULL;
706 			     arc = getNextArc(arc)) {
707 				if (!isOnlyStrain
708 				    (getDestination(arc), firstStrain)) {
709 					local -= 24;
710 					break;
711 				}
712 			}
713 
714 			if (local < 0)
715 				local = 1;
716 
717 			total += local;
718 		}
719 	}
720 
721 	return total;
722 }
723 
genomeMarkerCount(Node * node,IDnum firstStrain)724 IDnum genomeMarkerCount(Node * node, IDnum firstStrain)
725 {
726 	PassageMarker *marker;
727 	IDnum counter = 0;
728 
729 	for (marker = getMarker(node); marker != NULL;
730 	     marker = getNextInNode(marker))
731 		if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
732 			counter++;
733 
734 	return counter;
735 }
736 
readCoverage(Node * node)737 Coordinate readCoverage(Node * node)
738 {
739 	PassageMarker *marker;
740 	Coordinate sum = 0;
741 
742 	for (marker = getMarker(node); marker != NULL;
743 	     marker = getNextInNode(marker)) {
744 		if (getTwinMarker(marker) == NULL) {
745 			printf("Node %d screwed up\n", getNodeID(node));
746 			printf("Sequence %d\n",
747 			       getPassageMarkerSequenceID(marker));
748 			abort();
749 		}
750 		sum += getPassageMarkerLength(marker);
751 	}
752 
753 	return sum;
754 }
755 
refReadCoverage(Node * node,IDnum firstStrain)756 Coordinate refReadCoverage(Node * node, IDnum firstStrain)
757 {
758 	PassageMarker *marker;
759 	Coordinate sum = 0;
760 
761 	for (marker = getMarker(node); marker != NULL;
762 	     marker = getNextInNode(marker))
763 		if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
764 			sum += getPassageMarkerLength(marker);
765 
766 	return sum;
767 }
768 
newReadCoverage(Node * node,IDnum firstStrain)769 Coordinate newReadCoverage(Node * node, IDnum firstStrain)
770 {
771 	PassageMarker *marker;
772 	Coordinate sum = 0;
773 
774 	for (marker = getMarker(node); marker != NULL;
775 	     marker = getNextInNode(marker))
776 		if (getAbsolutePassMarkerSeqID(marker) >= firstStrain) {
777 			sum += getPassageMarkerLength(marker);
778 			if (getPassageMarkerLength(marker) < 0)
779 				printf("Bizarre marker %d at node %d\n",
780 				       getPassageMarkerSequenceID(marker),
781 				       getNodeID(node));
782 		}
783 
784 	return sum;
785 }
786 
readStarts(Node * node)787 IDnum readStarts(Node * node)
788 {
789 	PassageMarker *marker;
790 	IDnum sum = 0;
791 
792 	if (node == NULL)
793 		return 0;
794 
795 	for (marker = getMarker(node); marker != NULL;
796 	     marker = getNextInNode(marker)) {
797 		if (getPassageMarkerSequenceID(marker) > 0
798 		    && isInitial(marker))
799 			sum++;
800 		else if (getPassageMarkerSequenceID(marker) < 0
801 			 && isTerminal(marker))
802 			sum++;
803 	}
804 
805 	return sum;
806 }
807 
printShortCounts(FILE * outfile,Node * node,Graph * graph,ReadSet * reads)808 static void printShortCounts(FILE * outfile, Node * node, Graph * graph, ReadSet * reads) {
809 	IDnum counts[CATEGORIES];
810 	Category cat;
811 	IDnum shortReadIndex;
812 	IDnum readID;
813 	IDnum shortReadCount;
814 	ShortReadMarker *array;
815 	ShortReadMarker *marker;
816 
817 	if (!readStartsAreActivated(graph)) {
818 		for (cat = 0; cat < CATEGORIES; cat++)
819 			fprintf(outfile, "\tN/A");
820 		return;
821 	}
822 
823 	shortReadCount = getNodeReadCount(node, graph);
824 	array = getNodeReads(node, graph);
825 
826 	for (cat = 0; cat < CATEGORIES; cat++)
827 		counts[cat] = 0;
828 
829 	for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
830 		marker = getShortReadMarkerAtIndex(array, shortReadIndex);
831 		readID = getShortReadMarkerID(marker);
832 		cat = reads->categories[readID - 1] / 2;
833 		counts[cat]++;
834 	}
835 
836 	for (cat = 0; cat < CATEGORIES; cat++)
837 		fprintf(outfile, "\t%li", (long) counts[cat]);
838 }
839 
displayGeneralStatistics(Graph * graph,char * filename,ReadSet * reads)840 void displayGeneralStatistics(Graph * graph, char *filename, ReadSet * reads)
841 {
842 	IDnum nodeIndex;
843 	Node *node;
844 	Category cat;
845 	FILE *outfile;
846 
847 	outfile = fopen(filename, "w");
848 	if (outfile == NULL) {
849 		printf("Couldn't open file %s, sorry\n", filename);
850 		return;
851 	} else
852 		printf("Writing into stats file %s...\n", filename);
853 
854 	fprintf(outfile, "ID\tlgth\tout\tin\tlong_cov");
855 
856 	for (cat = 0; cat < CATEGORIES; cat++) {
857 		fprintf(outfile, "\tshort%i_cov", (int) (cat + 1));
858 		fprintf(outfile, "\tshort%i_Ocov", (int) (cat + 1));
859 	}
860 
861 	fprintf(outfile, "\tlong_nb");
862 	for (cat = 0; cat < CATEGORIES; cat++) {
863 		fprintf(outfile, "\tshort%i_nb", (int) (cat + 1));
864 	}
865 
866 	fprintf(outfile, "\n");
867 
868 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
869 		node = getNodeInGraph(graph, nodeIndex);
870 		if (node == NULL)
871 			continue;
872 		fprintf
873 		    (outfile, "%ld\t%lld\t%i\t%i",
874 		     (long) getNodeID(node), (long long) getNodeLength(node), arcCount(node),
875 		     arcCount(getTwinNode(node)));
876 
877 		if (getNodeLength(node) > 0) {
878 			fprintf(outfile, "\t%f",
879 				readCoverage(node) /
880 				(double) getNodeLength(node));
881 			for (cat = 0; cat < CATEGORIES; cat++) {
882 				fprintf(outfile, "\t%f",
883 					getVirtualCoverage(node,
884 							   cat) /
885 					(double) getNodeLength(node));
886 				fprintf(outfile, "\t%f",
887 					getOriginalVirtualCoverage(node,
888 								   cat) /
889 					(double) getNodeLength(node));
890 			}
891 		} else {
892 			fprintf(outfile, "\tInf");
893 			for (cat = 0; cat < CATEGORIES; cat++)
894 				fprintf(outfile, "\tInf\tInf");
895 		}
896 
897 		fprintf(outfile, "\t%li", (long) markerCount(node));
898 		printShortCounts(outfile, node, graph, reads);
899 
900 		fprintf(outfile, "\n");
901 	}
902 
903 	fclose(outfile);
904 }
905 
destroyStrainSpecificIslands(Graph * graph,IDnum firstStrain)906 void destroyStrainSpecificIslands(Graph * graph, IDnum firstStrain)
907 {
908 	IDnum index;
909 	Arc *arc;
910 	boolean isModified = true;
911 	Node *node;
912 	IDnum counter = 0;
913 
914 	resetNodeStatus(graph);
915 
916 	puts("Destroying disconnected strain specific sub-graphs");
917 
918 	// Mark all genomic nodes
919 	for (index = 1; index <= nodeCount(graph); index++) {
920 		node = getNodeInGraph(graph, index);
921 		if (!isOnlyStrain(node, firstStrain))
922 			setNodeStatus(node, true);
923 	}
924 
925 	// Mark nodes connected to genomic nodes
926 	while (isModified) {
927 		isModified = false;
928 
929 		for (index = 1; index <= nodeCount(graph); index++) {
930 			node = getNodeInGraph(graph, index);
931 
932 			if (getNodeStatus(node))
933 				continue;
934 
935 			for (arc = getArc(node); arc != NULL;
936 			     arc = getNextArc(arc)) {
937 				if (getNodeStatus(getDestination(arc))) {
938 					isModified = true;
939 					setNodeStatus(node, true);
940 				}
941 			}
942 
943 			for (arc = getArc(getTwinNode(node)); arc != NULL;
944 			     arc = getNextArc(arc)) {
945 				if (getNodeStatus(getDestination(arc))) {
946 					isModified = true;
947 					setNodeStatus(node, true);
948 				}
949 			}
950 		}
951 	}
952 
953 	// Remove all unmarked nodes
954 	for (index = 1; index <= nodeCount(graph); index++) {
955 		node = getNodeInGraph(graph, index);
956 		if (!getNodeStatus(node)) {
957 			destroyNode(node, graph);
958 			counter++;
959 		}
960 	}
961 
962 	// Renumber graph nodes
963 	printf("Removed %d nodes \n", counter);
964 	renumberNodes(graph);
965 }
966 
displayStrainOnlySequences(Graph * graph,IDnum firstStrain,char * inputFilename,char * filename,int WORDLENGTH)967 void displayStrainOnlySequences(Graph * graph, IDnum firstStrain,
968 				char *inputFilename, char *filename,
969 				int WORDLENGTH)
970 {
971 	IDnum nodeIndex;
972 	Node *node;
973 	FILE *outfile = fopen(filename, "w");
974 	Coordinate start, finish;
975 	char str[100];
976 	TightString *tString;
977 	IDnum readID;
978 	Coordinate readCoord;
979 
980 	if (outfile == NULL) {
981 		printf("Could not write into %s, sorry\n", filename);
982 		return;
983 	}
984 
985 	destroyStrainSpecificIslands(graph, firstStrain);
986 
987 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
988 		node = getNodeInGraph(graph, nodeIndex);
989 		if (isOnlyStrain(node, firstStrain)
990 		    && getNodeLength(node) > 500) {
991 			tString = expandNode(node, WORDLENGTH);
992 			readID =
993 			    getPassageMarkerSequenceID(getMarker(node));
994 			readCoord = getPassageMarkerStart(getMarker(node));
995 			fprintf(outfile, "> UNIQUE SEQUENCE %ld; %lld\n",
996 				(long) readID, (long long) readCoord);
997 
998 			start = 0;
999 			while (start <= getLength(tString)) {
1000 				finish = start + 60;
1001 				readTightStringFragment(tString, start,
1002 							finish, str);
1003 				fprintf(outfile, "%s\n", str);
1004 				start = finish;
1005 			}
1006 		}
1007 	}
1008 
1009 	fclose(outfile);
1010 }
1011 
displayStrainOnlyDescriptors(Graph * graph,IDnum firstStrain)1012 void displayStrainOnlyDescriptors(Graph * graph, IDnum firstStrain)
1013 {
1014 	IDnum nodeIndex;
1015 	Node *node;
1016 	char *str;
1017 
1018 	destroyStrainSpecificIslands(graph, firstStrain);
1019 
1020 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
1021 		printf("node %d from %d\n", nodeIndex, nodeCount(graph));
1022 		node = getNodeInGraph(graph, nodeIndex);
1023 
1024 		if (isOnlyStrain(node, firstStrain)) {
1025 			str = readNode(node);
1026 			printf("> UNIQUE SEQUENCE %s\n", str);
1027 			free(str);
1028 		}
1029 	}
1030 }
1031 
displayLocalBreakpoint(PassageMarker * strainMarker,IDnum firstStrain,PassageMarker * genomeMarker,Node ** genomeDestination,Node ** strainDestination,IDnum * counter,IDnum nodeCount)1032 void displayLocalBreakpoint(PassageMarker * strainMarker,
1033 			    IDnum firstStrain,
1034 			    PassageMarker * genomeMarker,
1035 			    Node ** genomeDestination,
1036 			    Node ** strainDestination, IDnum * counter,
1037 			    IDnum nodeCount)
1038 {
1039 	boolean isTranslocation;
1040 	PassageMarker *marker;
1041 	Node *destination, *destinationA;
1042 	Node *destination2, *destination2A;
1043 	Node *node1, *node2;
1044 	IDnum localID = getNodeID(getNode(strainMarker));
1045 
1046 	// Eliminate genomic markers
1047 	if (strainMarker == genomeMarker)
1048 		return;
1049 
1050 	destinationA = getNode(getNextInSequence(strainMarker));
1051 
1052 	if (destinationA == NULL)
1053 		return;
1054 
1055 	// Eliminate those that follow some local strain
1056 	if (isDestinationToMarker(genomeMarker, destinationA)) {
1057 //              puts("Parallel paths");
1058 		return;
1059 	}
1060 
1061 	destination2A = getNode(getNextInSequence(genomeMarker));
1062 
1063 	if (destination2A == NULL)
1064 		return;
1065 
1066 	printf("Lengths %lld %lld\n", (long long) getNodeLength(destinationA),
1067 	       (long long) getNodeLength(destination2A));
1068 
1069 	// Hop to another genomic node
1070 //      if (getNodeLength(destinationA) > 24) {
1071 	//printf("wrong length %d %d\n", getNodeLength(destination) , getNodeID(destination));
1072 //              return;
1073 //      }
1074 
1075 	destination =
1076 	    getNode(getNextInSequence(getNextInSequence(strainMarker)));
1077 
1078 	if (destination == NULL)
1079 		return;
1080 
1081 	// Eliminate those that point to uniquely strain sequences
1082 	if (nodeGenomicMultiplicity(destination, firstStrain) != 1) {
1083 //              puts("Multiple genome reads");
1084 		return;
1085 	}
1086 	// Hop to another genomic node
1087 //      if (getNodeLength(destination2A) != 24) {
1088 	//puts("wrong length 2");
1089 //              return;
1090 //      }
1091 
1092 	destination2 =
1093 	    getNode(getNextInSequence(getNextInSequence(genomeMarker)));
1094 
1095 	if (destination2 == NULL)
1096 		return;
1097 
1098 
1099 	if (destination == destination2)
1100 		return;
1101 
1102 	// Eliminate those that point to uniquely strain sequences
1103 	if (isOnlyGenome(destination2, firstStrain))
1104 		return;
1105 
1106 	setSingleNodeStatus(getNode(strainMarker), true);
1107 	strainDestination[localID + nodeCount] = destination;
1108 	genomeDestination[localID + nodeCount] = destination2;
1109 
1110 //      printf("Assigning %p and %p to %d\n", destination, destination2, localID);
1111 	printf("lengths %lld\t%lld\n", (long long) getNodeLength(destinationA),
1112 	       (long long) getNodeLength(destination2A));
1113 
1114 	// Detect translocation
1115 	isTranslocation = true;
1116 	for (marker = getMarker(destination); marker != NULL;
1117 	     marker = getNextInNode(marker))
1118 		if (getAbsolutePassMarkerSeqID(marker) ==
1119 		    getAbsolutePassMarkerSeqID(genomeMarker)) {
1120 			isTranslocation = false;
1121 			break;
1122 		}
1123 
1124 	if (isTranslocation) {
1125 		printf("BREAK TRANS\t%ld\t%lld\t%lld\t%lld\n",
1126 		       (long) getAbsolutePassMarkerSeqID(genomeMarker),
1127 		       (long long) getPassageMarkerStart(genomeMarker),
1128 		       (long long) getNodeLength(destinationA),
1129 		       (long long) getNodeLength(destination2A));
1130 		counter[2]++;
1131 		return;
1132 	}
1133 	// Detect breakpoint
1134 	printf("BREAK INTRA\t%ld\t%lld\t%lld\t%lld\n",
1135 	       (long) getAbsolutePassMarkerSeqID(genomeMarker),
1136 	       (long long) getPassageMarkerStart(genomeMarker),
1137 	       (long long) getNodeLength(destinationA), (long long) getNodeLength(destination2A));
1138 	counter[1]++;
1139 
1140 	// Check for inversion
1141 	if (getPassageMarkerSequenceID(marker) !=
1142 	    -getPassageMarkerSequenceID(genomeMarker))
1143 		return;
1144 
1145 //      puts("potential!!");
1146 
1147 	node1 = getTwinNode(destination);
1148 
1149 	if (getNodeStatus(node1)) {
1150 		node2 =
1151 		    getTwinNode(genomeDestination
1152 				[getNodeID(node1) + nodeCount]);
1153 		if (getNodeStatus(node2))
1154 			if (strainDestination[getNodeID(node2) + nodeCount]
1155 			    == destination2) {
1156 //                                      puts("Safe");
1157 				counter[1] -= 4;
1158 				counter[0]++;
1159 			} else;
1160 //                              puts("stopped 3");
1161 		else;
1162 //                      puts("stopped 2");
1163 	} else;
1164 //              puts("stopped 1");
1165 }
1166 
displayBreakpoints(Graph * graph,IDnum firstStrain)1167 void displayBreakpoints(Graph * graph, IDnum firstStrain)
1168 {
1169 	IDnum nodeIndex;
1170 	Node *node;
1171 	PassageMarker *strainMarker, *genomeMarker;
1172 	Node **genomeDestination =
1173 	    callocOrExit(2 * nodeCount(graph) + 1, Node *);
1174 	Node **strainDestination =
1175 	    callocOrExit(2 * nodeCount(graph) + 1, Node *);
1176 	IDnum counters[3];
1177 
1178 	counters[0] = 0;
1179 	counters[1] = 0;
1180 	counters[2] = 0;
1181 
1182 	resetNodeStatus(graph);
1183 
1184 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
1185 		node = getNodeInGraph(graph, nodeIndex);
1186 
1187 		if (arcCount(node) <= 1
1188 		    && arcCount(getTwinNode(node)) <= 1) {
1189 			continue;
1190 		}
1191 
1192 		if (nodeGenomicMultiplicity(node, firstStrain) != 1) {
1193 			continue;
1194 		}
1195 
1196 		if (isOnlyGenome(node, firstStrain)) {
1197 			continue;
1198 		}
1199 
1200 		for (genomeMarker = getMarker(node); genomeMarker != NULL;
1201 		     genomeMarker = getNextInNode(genomeMarker))
1202 			if (getAbsolutePassMarkerSeqID(genomeMarker) <
1203 			    firstStrain)
1204 				break;
1205 
1206 		// Go through all strain passage marker
1207 		for (strainMarker = getMarker(node); strainMarker != NULL;
1208 		     strainMarker = getNextInNode(strainMarker)) {
1209 			displayLocalBreakpoint(strainMarker, firstStrain,
1210 					       genomeMarker,
1211 					       genomeDestination,
1212 					       strainDestination, counters,
1213 					       nodeCount(graph));
1214 			displayLocalBreakpoint(getTwinMarker(strainMarker),
1215 					       firstStrain,
1216 					       getTwinMarker(genomeMarker),
1217 					       genomeDestination,
1218 					       strainDestination, counters,
1219 					       nodeCount(graph));
1220 		}
1221 	}
1222 
1223 
1224 	printf("%d\t%d\t%d\n", counters[0], counters[1], counters[2]);
1225 	free(strainDestination);
1226 	free(genomeDestination);
1227 }
1228 
genomeMarker(Node * node,IDnum firstStrain)1229 PassageMarker *genomeMarker(Node * node, IDnum firstStrain)
1230 {
1231 	PassageMarker *marker;
1232 
1233 	if (genomeMarkerCount(node, firstStrain) != 1)
1234 		return NULL;
1235 
1236 	for (marker = getMarker(node); marker != NULL;
1237 	     marker = getNextInNode(marker))
1238 		if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
1239 			return marker;
1240 
1241 	return NULL;
1242 }
1243 
exportArcSequence(Arc * arc,FILE * outfile,int WORDLENGTH,TightString ** sequences)1244 void exportArcSequence(Arc * arc, FILE * outfile, int WORDLENGTH,
1245 		       TightString ** sequences)
1246 {
1247 	char *str;
1248 	TightString *output =
1249 	    newTightString(getNodeLength(getOrigin(arc)) +
1250 			   getNodeLength(getDestination(arc)));
1251 	appendNodeSequence(getOrigin(arc), output, 0);
1252 	appendNodeSequence(getDestination(arc), output,
1253 			   getNodeLength(getOrigin(arc)));
1254 	str = readTightString(output);
1255 	fprintf(outfile, "> ARC from NODE %d", getNodeID(getOrigin(arc)));
1256 	fprintf(outfile, "%s\n", str);
1257 	destroyTightString(output);
1258 	free(str);
1259 }
1260 
1261 // Produce sequences necessary to recreate graph elsewhere...
projectGraphToFile(Graph * graph,char * filename,int WORDLENGTH,TightString ** sequences)1262 void projectGraphToFile(Graph * graph, char *filename, int WORDLENGTH,
1263 			TightString ** sequences)
1264 {
1265 	FILE *outfile = fopen(filename, "w");
1266 	IDnum index;
1267 	Node *currentNode;
1268 	Arc *arc;
1269 
1270 	if (outfile == NULL) {
1271 		printf("Could not open %s, sorry\n", filename);
1272 		return;
1273 	}
1274 
1275 	for (index = 1; index < nodeCount(graph); index++) {
1276 		currentNode = getNodeInGraph(graph, index);
1277 		for (arc = getArc(currentNode); arc != NULL;
1278 		     arc = getNextArc(arc))
1279 			exportArcSequence(arc, outfile, WORDLENGTH,
1280 					  sequences);
1281 
1282 		for (arc = getArc(getTwinNode(currentNode)); arc != NULL;
1283 		     arc = getNextArc(arc))
1284 			exportArcSequence(arc, outfile, WORDLENGTH,
1285 					  sequences);
1286 	}
1287 
1288 	fclose(outfile);
1289 }
1290 
removeReferenceMarkers(Graph * graph,IDnum firstStrain)1291 void removeReferenceMarkers(Graph * graph, IDnum firstStrain)
1292 {
1293 	IDnum ID;
1294 	Node *node;
1295 	PassageMarker *marker, *oldMarker;
1296 
1297 	for (ID = 1; ID <= nodeCount(graph); ID++) {
1298 		node = getNodeInGraph(graph, ID);
1299 		marker = getMarker(node);
1300 		while (marker != NULL) {
1301 			if (getAbsolutePassMarkerSeqID(marker) <
1302 			    firstStrain) {
1303 				if (!isInitial(marker))
1304 					changeMultiplicity
1305 					    (getArcBetweenNodes
1306 					     (getNode
1307 					      (getPreviousInSequence
1308 					       (marker)), node, graph),
1309 					     -1);
1310 				if (!isTerminal(marker))
1311 					changeMultiplicity
1312 					    (getArcBetweenNodes
1313 					     (node,
1314 					      getNode(getNextInSequence
1315 						      (marker)), graph),
1316 					     -1);
1317 				oldMarker = marker;
1318 				marker = getNextInNode(marker);
1319 				destroyPassageMarker(oldMarker);
1320 			} else
1321 				marker = getNextInNode(marker);
1322 		}
1323 
1324 		if (getMarker(node) == NULL)
1325 			destroyNode(node, graph);
1326 	}
1327 
1328 	concatenateGraph(graph);
1329 }
1330 
exportLongNodeSequences(char * filename,Graph * graph,Coordinate minLength)1331 void exportLongNodeSequences(char *filename, Graph * graph,
1332 			     Coordinate minLength)
1333 {
1334 	FILE *outfile = fopen(filename, "w");
1335 	IDnum nodeIndex;
1336 	TightString *tString;
1337 	Coordinate position;
1338 	char nucleotide;
1339 	Node *node;
1340 	int WORDLENGTH = getWordLength(graph);
1341 	GapMarker *gap;
1342 	//double sensitivity, specificity;
1343 
1344 	if (outfile == NULL) {
1345 		printf("Could not write into %s, sorry\n", filename);
1346 		return;
1347 	} else {
1348 		printf("Writing contigs into %s...\n", filename);
1349 	}
1350 
1351 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
1352 		node = getNodeInGraph(graph, nodeIndex);
1353 
1354 		if (node == NULL || getNodeLength(node) < minLength)
1355 			continue;
1356 
1357 		tString = expandNode(node, WORDLENGTH);
1358 		fprintf(outfile, ">NODE_%ld_length_%lld_cov_%f\n",
1359 			(long) nodeIndex, (long long) getNodeLength(node),
1360 			(getVirtualCoverage(node, 0)
1361 			 + getVirtualCoverage(node, 1)
1362 			 + readCoverage(node)) /
1363 			(float) getNodeLength(node));
1364 
1365 		gap = getGap(node, graph);
1366 		for (position = 0; position < WORDLENGTH; position++) {
1367 			if (gap && position >= getGapFinish(gap))
1368 				gap = getNextGap(gap);
1369 
1370 			if (gap == NULL || position < getGapStart(gap)) {
1371 				nucleotide =
1372 				    getNucleotideChar(position, tString);
1373 				fprintf(outfile, "%c", nucleotide);
1374 			} else
1375 				fprintf(outfile, "N");
1376 		}
1377 
1378 		gap = getGap(node, graph);
1379 		for (; position < getLength(tString); position++) {
1380 			if (position % 60 == 0)
1381 				fprintf(outfile, "\n");
1382 
1383 			if (gap
1384 			    && position - WORDLENGTH + 1 >=
1385 			    getGapFinish(gap))
1386 				gap = getNextGap(gap);
1387 
1388 			if (gap == NULL
1389 			    || position - WORDLENGTH + 1 <
1390 			    getGapStart(gap)) {
1391 				nucleotide =
1392 				    getNucleotideChar(position, tString);
1393 				fprintf(outfile, "%c", nucleotide);
1394 			} else
1395 				fprintf(outfile, "N");
1396 		}
1397 		fprintf(outfile, "\n");
1398 
1399 		destroyTightString(tString);
1400 	}
1401 
1402 	fclose(outfile);
1403 }
1404 
1405 /*
1406 void exportMediumNodeSequences(char* filename, Graph * graph, Coordinate minLength)
1407 {
1408 	IDnum dummy;
1409 	ReadSet *readSet = readFastAFile(sequenceFile);
1410 	char **reads = readSet->sequences;
1411 	TightString **sequences =
1412 	    newTightStringArrayFromStringArray(reads, dummy);
1413 	FILE *outfile = fopen(filename, "w");
1414 	char str[100];
1415 	IDnum nodeIndex;
1416 	TightString *tString;
1417 	Coordinate start, finish;
1418 	Node *node;
1419 	double sensitivity, specificity;
1420 
1421 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
1422 		node = getNodeInGraph(graph, nodeIndex);
1423 		if (getNodeLength(node) < minLength
1424 		    || getNodeLength(node) >= maxLength)
1425 			continue;
1426 
1427 		tString = expandNode(node);
1428 		compareSequences(tString, sequences[0], &sensitivity,
1429 				 &specificity, WORDLENGTH);
1430 
1431 		fprintf(outfile,
1432 			"> MEDIUM NODE %d, Sensitivity = %f, Specificity = %f\n",
1433 			nodeIndex, sensitivity, specificity);
1434 		printf
1435 		    ("> MEDIUM NODE %d, Sensitivity = %f, Specificity = %f\n",
1436 		     nodeIndex, sensitivity, specificity);
1437 
1438 		start = 0;
1439 		while (start <= getLength(tString)) {
1440 			finish = start + 60;
1441 			readTightStringFragment(tString, start,
1442 						finish, str);
1443 			fprintf(outfile, "%s\n", str);
1444 			start = finish;
1445 		}
1446 
1447 		destroyTightString(tString);
1448 	}
1449 }
1450 */
1451 
maxLength(Graph * graph)1452 Coordinate maxLength(Graph * graph)
1453 {
1454 	IDnum index;
1455 	Node *node;
1456 	Coordinate max = 0;
1457 
1458 	for (index = 1; index <= nodeCount(graph); index++) {
1459 		node = getNodeInGraph(graph, index);
1460 		if (node != NULL && getNodeLength(node) > max)
1461 			max = getNodeLength(node);
1462 	}
1463 
1464 	return max;
1465 }
1466 
n50(Graph * graph)1467 Coordinate n50(Graph * graph)
1468 {
1469 	FibHeap *heap = newFibHeap();
1470 	IDnum index;
1471 	Coordinate totalLength = 0;
1472 	Coordinate sumLength = 0;
1473 	Node *node;
1474 
1475 	if (nodeCount(graph) == 0) {
1476 		puts("EMPTY GRAPH");
1477 		return 0;
1478 	}
1479 
1480 	for (index = 1; index <= nodeCount(graph); index++) {
1481 		node = getNodeInGraph(graph, index);
1482 		if (node == NULL)
1483 			continue;
1484 		insertNodeIntoHeap(heap, getNodeLength(node), node);
1485 		totalLength += getNodeLength(node);
1486 	}
1487 	totalLength /= 2;
1488 
1489 	node = removeNextNodeFromHeap(heap);
1490 	while (node != NULL) {
1491 		sumLength += getNodeLength(node);
1492 		if (sumLength >= totalLength)
1493 			break;
1494 		node = removeNextNodeFromHeap(heap);
1495 	}
1496 
1497 	destroyHeap(heap);
1498 	return getNodeLength(node);
1499 }
1500 
getTotalCoverage(Node * node)1501 static Coordinate getTotalCoverage(Node * node)
1502 {
1503 	Category cat;
1504 	Coordinate coverage = 0;
1505 
1506 	for (cat = 0; cat < CATEGORIES; cat++)
1507 		coverage += getVirtualCoverage(node, cat);
1508 
1509 	return coverage;
1510 }
1511 
compareNodeCovs(const void * A,const void * B)1512 int compareNodeCovs(const void * A, const void * B) {
1513 	Node * nodeA = *((Node **) A);
1514 	Node * nodeB = *((Node **) B);
1515 	double covA;
1516 	double covB;
1517 
1518 	if (getNodeLength(nodeA) == 0)
1519 		nodeA = NULL;
1520 
1521 	if (getNodeLength(nodeB) == 0)
1522 		nodeB = NULL;
1523 
1524 	// Null nodes considered to have infinite coverage
1525 	if (nodeA == NULL && nodeB == NULL)
1526 		return 0;
1527 	if (nodeA == NULL)
1528 		return 1;
1529 	if (nodeB == NULL)
1530 		return -1;
1531 
1532 	// Deal with real coverage numbers:
1533 	covA = getTotalCoverage(nodeA) / (double) getNodeLength(nodeA);
1534 	covB = getTotalCoverage(nodeB) / (double) getNodeLength(nodeB);
1535 
1536 	if (covA > covB)
1537 		return 1;
1538 	if (covA == covB)
1539 		return 0;
1540 	return -1;
1541 }
1542 
estimated_cov(Graph * graph,char * directory)1543 double estimated_cov(Graph * graph, char * directory)
1544 {
1545 	Node ** nodeArray = callocOrExit(nodeCount(graph), Node*);
1546 	IDnum index;
1547 	Coordinate halfTotalLength = 0;
1548 	Coordinate sumLength = 0;
1549 	Node *node;
1550 	char *logFilename =
1551 	    mallocOrExit(strlen(directory) + 100, char);
1552 	char *statsLine =
1553 	    mallocOrExit(5000, char);
1554 	FILE *logFile;
1555 
1556 	strcpy(logFilename, directory);
1557 	strcat(logFilename, "/Log");
1558 	logFile = fopen(logFilename, "a");
1559 
1560 	if (logFile == NULL)
1561 		exitErrorf(EXIT_FAILURE, true, "Could not write to %s",
1562 		       logFilename);
1563 
1564 	puts("Measuring median coverage depth...");
1565 
1566 	if (nodeCount(graph) == 0) {
1567 		puts("EMPTY GRAPH");
1568 		return 0;
1569 	}
1570 
1571 	// Write nodes into array and compute total assembly length
1572 	for (index = 1; index <= nodeCount(graph); index++) {
1573 		node = getNodeInGraph(graph, index);
1574 		nodeArray[index - 1] = node;
1575 		if (node == NULL)
1576 			continue;
1577 		halfTotalLength += getNodeLength(node);
1578 	}
1579 	halfTotalLength /= 2;
1580 
1581 	// Sort nodes
1582 	qsort(nodeArray, nodeCount(graph), sizeof(Node *), compareNodeCovs);
1583 
1584 	// Compute the length weighted median node coverage
1585 	for (index = 0; index < nodeCount(graph); index++) {
1586 		node = nodeArray[index];
1587 		sumLength += getNodeLength(node);
1588 		if (sumLength >= halfTotalLength) {
1589 			printf("Median coverage depth = %f\n", getTotalCoverage(node) / (double) getNodeLength(node));
1590 			fprintf(logFile, "Median coverage depth = %f\n", getTotalCoverage(node) / (double) getNodeLength(node));
1591 			free(nodeArray);
1592 			fclose(logFile);
1593 			free(logFilename);
1594 			free(statsLine);
1595 			return getTotalCoverage(node) / (double) getNodeLength(node);
1596 		}
1597 	}
1598 
1599 	// In case something went wrong...
1600 	free(nodeArray);
1601 	fclose(logFile);
1602 	free(logFilename);
1603 	free(statsLine);
1604 
1605 	return -1;
1606 }
1607 
destroyMixedNode(Node * node)1608 static void destroyMixedNode(Node * node)
1609 {
1610 	PassageMarker *marker = getMarker(node);
1611 	PassageMarker *current;
1612 
1613 	while (marker != NULL) {
1614 		while (!isInitial(marker))
1615 			marker = getPreviousInSequence(marker);
1616 
1617 		while (marker != NULL) {
1618 			current = marker;
1619 			marker = getNextInSequence(marker);
1620 			destroyPassageMarker(current);
1621 		}
1622 
1623 		marker = getMarker(node);
1624 	}
1625 }
1626 
destroyMixedReads(Graph * graph,IDnum minCoverage)1627 void destroyMixedReads(Graph * graph, IDnum minCoverage)
1628 {
1629 	IDnum index;
1630 	Node *node;
1631 
1632 	for (index = 1; index <= nodeCount(graph); index++) {
1633 		node = getNodeInGraph(graph, index);
1634 		if (node == NULL)
1635 			continue;
1636 
1637 		if (markerCount(node) < minCoverage)
1638 			destroyMixedNode(node);
1639 	}
1640 
1641 	for (index = 1; index <= nodeCount(graph); index++) {
1642 		node = getNodeInGraph(graph, index);
1643 		if (node == NULL)
1644 			continue;
1645 
1646 		if (getMarker(node) == NULL)
1647 			destroyNode(node, graph);
1648 	}
1649 
1650 	concatenateGraph(graph);
1651 }
1652 
isConnectedRead(PassageMarker * marker)1653 static boolean isConnectedRead(PassageMarker * marker)
1654 {
1655 	PassageMarker *current;
1656 
1657 	for (current = marker; getNodeStatus(getNode(current));
1658 	     current = getNextInSequence(current))
1659 		if (isTerminal(current))
1660 			return false;
1661 
1662 	for (current = getTwinMarker(marker);
1663 	     getNodeStatus(getNode(current));
1664 	     current = getNextInSequence(current))
1665 		if (isTerminal(current))
1666 			return false;
1667 
1668 	return true;
1669 }
1670 
destroyWholeRead(PassageMarker * marker)1671 static void destroyWholeRead(PassageMarker * marker)
1672 {
1673 	PassageMarker *current = marker;
1674 	PassageMarker *next;
1675 
1676 	while (!isInitial(current))
1677 		current = getPreviousInSequence(current);
1678 
1679 	for (; current != NULL; current = next) {
1680 		next = getNextInSequence(current);
1681 		destroyPassageMarker(current);
1682 	}
1683 }
1684 
cleanUpNode(Node * node,Graph * graph)1685 static void cleanUpNode(Node * node, Graph * graph)
1686 {
1687 	Category cat;
1688 	Node *twin = getTwinNode(node);
1689 	PassageMarker *marker, *twinMarker;
1690 
1691 	for (cat = 0; cat < CATEGORIES; cat++)
1692 		setVirtualCoverage(node, cat, 0);
1693 
1694 	while (getArc(node) != NULL)
1695 		destroyArc(getArc(node), graph);
1696 	while (getArc(twin) != NULL)
1697 		destroyArc(getArc(twin), graph);
1698 
1699 	for (marker = getMarker(node); marker != NULL;
1700 	     marker = getNextInNode(marker)) {
1701 		twinMarker = getTwinMarker(marker);
1702 
1703 		if (getNode(getNextInSequence(marker)) != twin
1704 		    || getPassageMarkerSequenceID(marker) > 0)
1705 			createArc(node,
1706 				  getNode(getNextInSequence(marker)),
1707 				  graph);
1708 		if (getNode(getNextInSequence(twinMarker)) != node
1709 		    || getPassageMarkerSequenceID(twinMarker) > 0)
1710 			createArc(twin,
1711 				  getNode(getNextInSequence(twinMarker)),
1712 				  graph);
1713 	}
1714 }
1715 
destroySinglePoolNodes(Graph * graph)1716 void destroySinglePoolNodes(Graph * graph)
1717 {
1718 	IDnum index;
1719 	Node *node;
1720 	PassageMarker *marker, *next;
1721 
1722 	puts("Destroying single pool nodes");
1723 	resetNodeStatus(graph);
1724 
1725 	// Remove empty, single pool nodes, mark other single pool nodes
1726 	for (index = 1; index <= nodeCount(graph); index++) {
1727 		node = getNodeInGraph(graph, index);
1728 		if (node == NULL)
1729 			continue;
1730 
1731 		if (getVirtualCoverage(node, 0) != 0
1732 		    && getVirtualCoverage(node, 1) != 0)
1733 			continue;
1734 
1735 		if (getMarker(node) == NULL)
1736 			destroyNode(node, graph);
1737 		else
1738 			setNodeStatus(node, true);
1739 	}
1740 
1741 	// Remove disconnected reads
1742 	for (index = 1; index <= nodeCount(graph); index++) {
1743 		node = getNodeInGraph(graph, index);
1744 		if (node == NULL || !getNodeStatus(node))
1745 			continue;
1746 
1747 		for (marker = getMarker(node); marker != NULL;
1748 		     marker = next) {
1749 			if (isConnectedRead(marker))
1750 				next = getNextInNode(marker);
1751 			else {
1752 				destroyWholeRead(marker);
1753 				next = getMarker(node);
1754 			}
1755 		}
1756 	}
1757 
1758 	// Remove empty, single pool nodes, review coverage of the others
1759 	for (index = 1; index <= nodeCount(graph); index++) {
1760 		node = getNodeInGraph(graph, index);
1761 		if (node == NULL || !getNodeStatus(node))
1762 			continue;
1763 
1764 		if (getMarker(node) == NULL)
1765 			destroyNode(node, graph);
1766 		else
1767 			cleanUpNode(node, graph);
1768 	}
1769 
1770 	puts("Done");
1771 
1772 	concatenateGraph(graph);
1773 }
1774 
destroyShortTips(Graph * graph)1775 void destroyShortTips(Graph * graph)
1776 {
1777 	IDnum index;
1778 	Node *node;
1779 	boolean modified = true;
1780 
1781 	puts("Removing short tips");
1782 
1783 	while (modified) {
1784 		modified = false;
1785 		for (index = 1; index <= nodeCount(graph); index++) {
1786 			node = getNodeInGraph(graph, index);
1787 			if (node == NULL)
1788 				continue;
1789 
1790 			if (getArc(node) == NULL
1791 			    || getArc(getTwinNode(node)) == NULL) {
1792 				if (getNodeLength(node) < 500) {
1793 					modified = true;
1794 					destroyNode(node, graph);
1795 				}
1796 			}
1797 		}
1798 	}
1799 
1800 	puts("Done");
1801 
1802 	concatenateGraph(graph);
1803 }
1804 
connectDomain(Node * node)1805 static Coordinate connectDomain(Node * node)
1806 {
1807 	Coordinate result = getNodeLength(node);
1808 	Arc *arc;
1809 
1810 	if (getNodeStatus(node))
1811 		return 0;
1812 	setNodeStatus(node, true);
1813 
1814 	for (arc = getArc(node); arc != NULL; arc = getNextArc(arc))
1815 		result += connectDomain(getDestination(arc));
1816 	for (arc = getArc(getTwinNode(node)); arc != NULL;
1817 	     arc = getNextArc(arc))
1818 		result += connectDomain(getDestination(arc));
1819 
1820 	return result;
1821 
1822 }
1823 
destroyConnectedDomain(Node * node,Graph * graph)1824 static void destroyConnectedDomain(Node * node, Graph * graph)
1825 {
1826 	Arc *arc;
1827 
1828 	if (getNodeStatus(node))
1829 		return;
1830 	setNodeStatus(node, true);
1831 
1832 	for (arc = getArc(node); arc != NULL; arc = getNextArc(arc))
1833 		destroyConnectedDomain(getDestination(arc), graph);
1834 	for (arc = getArc(getTwinNode(node)); arc != NULL;
1835 	     arc = getNextArc(arc))
1836 		destroyConnectedDomain(getDestination(arc), graph);
1837 
1838 	destroyNode(node, graph);
1839 
1840 }
1841 
destroyDisconnectedElements(Graph * graph)1842 void destroyDisconnectedElements(Graph * graph)
1843 {
1844 	Node *node;
1845 	IDnum index;
1846 	Coordinate domainSize;
1847 	FibHeap *heap = newFibHeap();
1848 	Coordinate *domainSizes =
1849 	    callocOrExit(1 + nodeCount(graph), Coordinate);
1850 
1851 	resetNodeStatus(graph);
1852 
1853 	puts("Destroying disconnected domains");
1854 
1855 	for (index = 1; index <= nodeCount(graph); index++) {
1856 		node = getNodeInGraph(graph, index);
1857 		if (node == NULL || getNodeStatus(node))
1858 			continue;
1859 		domainSize = connectDomain(node);
1860 		printf("CONNECT\t%lld\n", (long long) domainSize);
1861 		insertNodeIntoHeap(heap, domainSize, node);
1862 		domainSizes[index] = domainSize;
1863 	}
1864 
1865 	resetNodeStatus(graph);
1866 
1867 	while (true) {
1868 		node = removeNextNodeFromHeap(heap);
1869 		if (node == NULL || domainSizes[getNodeID(node)] > 1200)
1870 			break;
1871 
1872 		destroyConnectedDomain(node, graph);
1873 	}
1874 
1875 
1876 	destroyHeap(heap);
1877 	free(domainSizes);
1878 	puts("Done");
1879 
1880 	concatenateGraph(graph);
1881 }
1882 
connectDomainNodeCount(Node * node)1883 static Coordinate connectDomainNodeCount(Node * node)
1884 {
1885 	Coordinate result = 1;
1886 	Arc *arc;
1887 
1888 	if (getNodeStatus(node))
1889 		return 0;
1890 	setNodeStatus(node, true);
1891 
1892 	for (arc = getArc(node); arc != NULL; arc = getNextArc(arc))
1893 		result += connectDomain(getDestination(arc));
1894 	for (arc = getArc(getTwinNode(node)); arc != NULL;
1895 	     arc = getNextArc(arc))
1896 		result += connectDomain(getDestination(arc));
1897 
1898 	return result;
1899 
1900 }
1901 
measureTangleSizes(Graph * graph,Coordinate maxLength)1902 void measureTangleSizes(Graph * graph, Coordinate maxLength)
1903 {
1904 	Node *node;
1905 	IDnum index;
1906 	Coordinate domainSize;
1907 
1908 	for (index = 1; index <= nodeCount(graph); index++) {
1909 		node = getNodeInGraph(graph, index);
1910 		if (node == NULL)
1911 			continue;
1912 		if (getNodeLength(node) >= maxLength)
1913 			destroyNode(node, graph);
1914 	}
1915 
1916 	renumberNodes(graph);
1917 	resetNodeStatus(graph);
1918 
1919 	for (index = 1; index <= nodeCount(graph); index++) {
1920 		node = getNodeInGraph(graph, index);
1921 		if (node == NULL || getNodeStatus(node))
1922 			continue;
1923 		domainSize = connectDomainNodeCount(node);
1924 		printf("CONNECT\t%lld\n", (long long) domainSize);
1925 	}
1926 
1927 	puts("Done");
1928 }
1929 
destroyEmptyNodes(Graph * graph)1930 void destroyEmptyNodes(Graph * graph)
1931 {
1932 	IDnum index;
1933 	Node *node;
1934 
1935 	for (index = 1; index <= nodeCount(graph); index++) {
1936 		node = getNodeInGraph(graph, index);
1937 		if (node == NULL)
1938 			continue;
1939 
1940 		if (getMarker(node) == NULL)
1941 			destroyNode(node, graph);
1942 	}
1943 
1944 	concatenateGraph(graph);
1945 }
1946 
getReadLength(PassageMarker * marker)1947 static Coordinate getReadLength(PassageMarker * marker)
1948 {
1949 	PassageMarker *current;
1950 	Coordinate totalLength = 0;
1951 
1952 	for (current = marker; current != NULL;
1953 	     current = getNextInSequence(current))
1954 		totalLength += getPassageMarkerLength(current);
1955 
1956 	return totalLength;
1957 }
1958 
destroyRead(PassageMarker * marker)1959 static void destroyRead(PassageMarker * marker)
1960 {
1961 	PassageMarker *current, *next;
1962 
1963 	for (current = marker; current != NULL; current = next) {
1964 		next = getNextInSequence(current);
1965 		destroyPassageMarker(current);
1966 	}
1967 }
1968 
removeShortReads(Graph * graph)1969 void removeShortReads(Graph * graph)
1970 {
1971 	IDnum index;
1972 	Node *node;
1973 	PassageMarker *marker, *next;
1974 
1975 	for (index = 1; index <= nodeCount(graph); index++) {
1976 		node = getNodeInGraph(graph, index);
1977 		if (node == NULL)
1978 			continue;
1979 
1980 
1981 		for (marker = getMarker(node); marker != NULL;
1982 		     marker = next) {
1983 			if (getPassageMarkerSequenceID(marker) > 0
1984 			    && isInitial(marker)
1985 			    && getReadLength(marker) < 400) {
1986 				destroyRead(marker);
1987 				next = getMarker(node);
1988 			} else if (getPassageMarkerSequenceID(marker) < 0
1989 				   && isTerminal(marker)
1990 				   && getReadLength(getTwinMarker(marker))
1991 				   < 400) {
1992 				destroyRead(getTwinMarker(marker));
1993 				next = getMarker(node);
1994 			} else
1995 				next = getNextInNode(marker);
1996 
1997 		}
1998 	}
1999 }
2000 
totalGraphLength(Graph * graph)2001 Coordinate totalGraphLength(Graph * graph)
2002 {
2003 	IDnum index;
2004 	Coordinate totalLength = 0;
2005 
2006 	for (index = 1; index <= nodeCount(graph); index++)
2007 		totalLength += getNodeLength(getNodeInGraph(graph, index));
2008 
2009 	return totalLength;
2010 }
2011 
destroySinglePoolNodesStrict(Graph * graph)2012 void destroySinglePoolNodesStrict(Graph * graph)
2013 {
2014 	IDnum index;
2015 	Node *node;
2016 
2017 	puts("Destroying single pool nodes");
2018 	resetNodeStatus(graph);
2019 
2020 	// Remove empty, single pool nodes, mark other single pool nodes
2021 	for (index = 1; index <= nodeCount(graph); index++) {
2022 		node = getNodeInGraph(graph, index);
2023 		if (node == NULL)
2024 			continue;
2025 
2026 		if (getVirtualCoverage(node, 0) != 0
2027 		    && getVirtualCoverage(node, 1) != 0)
2028 			continue;
2029 
2030 		destroyNode(node, graph);
2031 	}
2032 
2033 	puts("Done");
2034 
2035 	concatenateGraph(graph);
2036 }
2037 
contigStats(Node ** contigs,IDnum readCount)2038 void contigStats(Node ** contigs, IDnum readCount)
2039 {
2040 	FibHeap *heap = newFibHeap();
2041 	IDnum index;
2042 	Coordinate totalLength = 0;
2043 	Coordinate sumLength = 0;
2044 	Node *node;
2045 	Coordinate halfLength;
2046 
2047 	for (index = 0; index <= readCount; index++) {
2048 		if (contigs[index] != NULL) {
2049 			node = contigs[index];
2050 			printf("CONTIG %lld\n", (long long) getNodeLength(node));
2051 			insertNodeIntoHeap(heap, getNodeLength(node),
2052 					   node);
2053 			totalLength += getNodeLength(node);
2054 		}
2055 	}
2056 	halfLength = totalLength / 2;
2057 
2058 	node = removeNextNodeFromHeap(heap);
2059 	while (node != NULL) {
2060 		sumLength += getNodeLength(node);
2061 		if (sumLength >= halfLength)
2062 			break;
2063 		node = removeNextNodeFromHeap(heap);
2064 	}
2065 
2066 	destroyHeap(heap);
2067 	printf("N50 %lld Total %lld\n", (long long) getNodeLength(node), (long long) totalLength);
2068 }
2069 
exportContigs(Node ** contigs,ReadSet * reads,char * filename,int WORDLENGTH,int pairedReadsCount)2070 void exportContigs(Node ** contigs, ReadSet * reads, char *filename,
2071 		   int WORDLENGTH, int pairedReadsCount)
2072 {
2073 	TightString **sequences =
2074 	    mallocOrExit(reads->readCount, TightString *);
2075 	IDnum i;
2076 
2077 	for (i = 0; i < pairedReadsCount; i++) {
2078 		if (contigs[i] == NULL)
2079 			sequences[i] = NULL;
2080 		else
2081 			sequences[i] = expandNode(contigs[i], WORDLENGTH);
2082 	}
2083 
2084 	exportSequenceArray(filename, sequences, reads->readCount);
2085 }
2086 
removeLowCoverageNodesAndDenounceDubiousReads(Graph * graph,double minCov)2087 boolean *removeLowCoverageNodesAndDenounceDubiousReads(Graph * graph,
2088 						       double minCov)
2089 {
2090 	IDnum index;
2091 	Node *node;
2092 	boolean denounceReads = readStartsAreActivated(graph);
2093 	boolean *res = NULL;
2094 	ShortReadMarker *nodeArray, *shortMarker;
2095 	PassageMarker *marker;
2096 	IDnum maxIndex;
2097 	IDnum readID;
2098 	IDnum index2;
2099 	// Original
2100 	double nodeDensity = 0.0;
2101 	int countNodeTotal = 0, countNodeUnderMinCov = 0, countNodeOnLongRead = 0;
2102 	int countNodeUMCbutOnLongRead = 0, countNodeSupportedByLongRead = 0;
2103 	int countNodeEscapedByLongRead = 0;
2104 	boolean escapeByLongRead = false;
2105 	// Original
2106 
2107 	printf("Removing contigs with coverage < %f...\n", minCov);
2108 
2109 	if (denounceReads)
2110 		res = callocOrExit(sequenceCount(graph), boolean);
2111 
2112 	// Original
2113 	countNodeTotal = nodeCount(graph);
2114 	// Original
2115 
2116 	for (index = 1; index <= nodeCount(graph); index++) {
2117 		node = getNodeInGraph(graph, index);
2118 
2119 		if (getNodeLength(node) == 0)
2120 			continue;
2121 
2122 		// Original
2123 		if ((marker = getMarker(node))){
2124 			countNodeOnLongRead++;
2125 		}
2126 		// Original
2127 
2128 		if ((double)getTotalCoverage(node) / getNodeLength(node) < minCov) {
2129 		        // Original
2130 		        countNodeUnderMinCov++;
2131 			if ((marker = getMarker(node))) {
2132 				nodeDensity = (double)getTotalCoverage(node) / getNodeLength(node);
2133 				// printf("[Under]\tnodeID : %d\tnodeLen : %d\tnodeCov : %f\n",
2134 				//       getNodeID(node), getNodeLength(node), nodeDensity);
2135 				countNodeUMCbutOnLongRead++;
2136 
2137 				if (markerCount(node) >= 1) {
2138 					countNodeSupportedByLongRead++;
2139 				}
2140 				if (markerCount(node) >= 1 && !(nodeDensity == 0.0)) {
2141 					escapeByLongRead = true;
2142 					countNodeEscapedByLongRead++;
2143 				} else {
2144 					escapeByLongRead = false;
2145 				}
2146 			}
2147 			// Original
2148 
2149 
2150 			// Original
2151 			//if (! escapeByLongRead) {
2152 			// Original
2153 
2154 			if (denounceReads) {
2155 				nodeArray = getNodeReads(node, graph);
2156 				maxIndex = getNodeReadCount(node, graph);
2157 				for (index2 = 0; index2 < maxIndex; index2++) {
2158 					shortMarker =
2159 					    getShortReadMarkerAtIndex(nodeArray,
2160 								      index2);
2161 					readID = getShortReadMarkerID(shortMarker);
2162 					//printf("Dubious %d\n", readID);
2163 					if (readID > 0)
2164 						res[readID - 1] = true;
2165 					else
2166 						res[-readID - 1] = true;
2167 				}
2168 
2169 				nodeArray = getNodeReads(getTwinNode(node), graph);
2170 				maxIndex =
2171 				    getNodeReadCount(getTwinNode(node), graph);
2172 				for (index2 = 0; index2 < maxIndex; index2++) {
2173 					shortMarker =
2174 					    getShortReadMarkerAtIndex(nodeArray,
2175 								      index2);
2176 					readID = getShortReadMarkerID(shortMarker);
2177 					//printf("Dubious %d\n", readID);
2178 					if (readID > 0)
2179 						res[readID - 1] = true;
2180 					else
2181 						res[-readID - 1] = true;
2182 				}
2183 			}
2184 
2185 			// Original
2186 			//}
2187 			// Original
2188 
2189 			while ((marker = getMarker(node))) {
2190 			        if (!isInitial(marker)
2191 				    && !isTerminal(marker))
2192 					disconnectNextPassageMarker
2193 					    (getPreviousInSequence(marker),
2194 					     graph);
2195 				destroyPassageMarker(marker);
2196 			}
2197 			destroyNode(node, graph);
2198 
2199 		}
2200 		// Original
2201 		// else {
2202 		//	nodeDensity = (double)getTotalCoverage(node) / getNodeLength(node);
2203 		//	printf("[Over]\tnodeID : %d\tnodeLen : %d\tnodeCov : %f\n",
2204 		//	       getNodeID(node), getNodeLength(node), nodeDensity);
2205 		//}
2206 		// Original
2207 	}
2208 
2209 	/*
2210 	// Original
2211 	printf("No. of Nodes : %d\n", countNodeTotal);
2212 	printf("No. of Nodes Under MinCov : %d\n", countNodeUnderMinCov);
2213 	printf("No. of Nodes On Long Reads : %d\n", countNodeOnLongRead);
2214 	printf("No. of Nodes Under MinCov but On Long Reads : %d\n", countNodeOnLongRead);
2215 	printf("No. of Nodes Under MinCov but Supported by Long Reads : %d\n",
2216 	       countNodeSupportedByLongRead);
2217 	printf("No. of Nodes 0 < Cov < minCov but Escaped (Supported) by Long Reads : %d\n",
2218 	       countNodeEscapedByLongRead);
2219 	// Original
2220 	*/
2221 
2222 	concatenateGraph(graph);
2223 	return res;
2224 }
2225 
removeLowCoverageNodes(Graph * graph,double minCov)2226 void removeLowCoverageNodes(Graph * graph, double minCov)
2227 {
2228 	IDnum index;
2229 	Node *node;
2230 	PassageMarker *marker;
2231 
2232 	if (minCov < 0)
2233 		return;
2234 
2235 	printf("Applying a coverage cutoff of %f...\n", minCov);
2236 
2237 	for (index = 1; index <= nodeCount(graph); index++) {
2238 		node = getNodeInGraph(graph, index);
2239 
2240 		if (getNodeLength(node) > 0
2241 		    && getTotalCoverage(node) / getNodeLength(node) <
2242 		    minCov) {
2243 			while ((marker = getMarker(node))) {
2244 				if (!isInitial(marker)
2245 				    && !isTerminal(marker))
2246 					disconnectNextPassageMarker
2247 					    (getPreviousInSequence(marker),
2248 					     graph);
2249 				destroyPassageMarker(marker);
2250 			}
2251 			destroyNode(node, graph);
2252 		}
2253 	}
2254 
2255 	concatenateGraph(graph);
2256 }
2257 
removeHighCoverageNodes(Graph * graph,double maxCov)2258 void removeHighCoverageNodes(Graph * graph, double maxCov)
2259 {
2260 	IDnum index;
2261 	Node *node;
2262 	PassageMarker *marker;
2263 
2264 	if (maxCov < 0)
2265 		return;
2266 
2267 	printf("Applying an upper coverage cutoff of %f...\n", maxCov);
2268 
2269 	for (index = 1; index <= nodeCount(graph); index++) {
2270 		node = getNodeInGraph(graph, index);
2271 
2272 		if (getNodeLength(node) > 0
2273 		    && getTotalCoverage(node) / getNodeLength(node) >
2274 		    maxCov) {
2275 			while ((marker = getMarker(node))) {
2276 				if (!isInitial(marker)
2277 				    && !isTerminal(marker))
2278 					disconnectNextPassageMarker
2279 					    (getPreviousInSequence(marker),
2280 					     graph);
2281 				destroyPassageMarker(marker);
2282 			}
2283 			destroyNode(node, graph);
2284 		}
2285 	}
2286 
2287 	concatenateGraph(graph);
2288 }
2289 
removeMissingStrain(Graph * graph,Category cat)2290 void removeMissingStrain(Graph * graph, Category cat)
2291 {
2292 	IDnum ID;
2293 	Node *node;
2294 
2295 	for (ID = 1; ID <= nodeCount(graph); ID++) {
2296 		node = getNodeInGraph(graph, ID);
2297 
2298 		if (node == NULL)
2299 			continue;
2300 
2301 		if (getVirtualCoverage(node, cat) == 0)
2302 			destroyNode(node, graph);
2303 	}
2304 
2305 	concatenateGraph(graph);
2306 }
2307 
exportAMOSLib(FILE * outfile,Graph * graph,Category cat)2308 static void exportAMOSLib(FILE * outfile, Graph * graph, Category cat)
2309 {
2310 	Coordinate distance = getInsertLength(graph, cat * 2);
2311 	double variance = getInsertLength_var(graph, cat * 2);
2312 
2313 	if (distance == -1)
2314 		return;
2315 
2316 	fprintf(outfile, "{LIB\n");
2317 	fprintf(outfile, "iid:%d\n", (int) (cat + 1));
2318 	fprintf(outfile, "{DST\n");
2319 	fprintf(outfile, "mea:%lld\n", (long long) distance);
2320 	fprintf(outfile, "std:%lld\n", (long long) sqrt(variance));
2321 	fprintf(outfile, "}\n");
2322 	fprintf(outfile, "}\n");
2323 }
2324 
exportAMOSMarker(FILE * outfile,PassageMarker * marker,Coordinate nodeLength,Coordinate start,Coordinate finish,int wordShift)2325 static void exportAMOSMarker(FILE * outfile, PassageMarker * marker,
2326 			     Coordinate nodeLength, Coordinate start,
2327 			     Coordinate finish, int wordShift)
2328 {
2329 	Coordinate sequenceStart, sequenceFinish;
2330 
2331 	if (getStartOffset(marker) >= finish
2332 	    || getFinishOffset(marker) > nodeLength - start)
2333 		return;
2334 
2335 	sequenceStart = getPassageMarkerStart(marker);
2336 	if (start > getStartOffset(marker)) {
2337 		if (getPassageMarkerSequenceID(marker) > 0)
2338 			sequenceStart += start - getStartOffset(marker);
2339 		else
2340 			sequenceStart -= start - getStartOffset(marker);
2341 	}
2342 
2343 	sequenceFinish = getPassageMarkerFinish(marker);
2344 	if (nodeLength - finish > getFinishOffset(marker)) {
2345 		if (getPassageMarkerSequenceID(marker) > 0)
2346 			sequenceFinish -=
2347 			    nodeLength - finish - getFinishOffset(marker);
2348 		else
2349 			sequenceFinish +=
2350 			    nodeLength - finish - getFinishOffset(marker);
2351 	}
2352 
2353 	if (getPassageMarkerSequenceID(marker) > 0)
2354 		sequenceFinish += wordShift;
2355 	else
2356 		sequenceStart += wordShift;
2357 
2358 	fprintf(outfile, "{TLE\n");
2359 	fprintf(outfile, "src:%d\n", getAbsolutePassMarkerSeqID(marker));
2360 	if (getStartOffset(marker) > start)
2361 		fprintf(outfile, "off:%lld\n",
2362 			(long long) (getStartOffset(marker) - start));
2363 	else
2364 		fprintf(outfile, "off:0\n");
2365 	fprintf(outfile, "clr:%lld,%lld\n", (long long) sequenceStart, (long long) sequenceFinish);
2366 	fprintf(outfile, "}\n");
2367 }
2368 
exportAMOSShortMarker(FILE * outfile,ShortReadMarker * marker,ReadSet * reads,Coordinate start,Coordinate finish)2369 static void exportAMOSShortMarker(FILE * outfile, ShortReadMarker * marker,
2370 				  ReadSet * reads, Coordinate start,
2371 				  Coordinate finish)
2372 {
2373 	Coordinate offset =
2374 	    getShortReadMarkerPosition(marker) -
2375 	    getShortReadMarkerOffset(marker);
2376 	TightString *sequence =
2377 	    reads->tSequences[getShortReadMarkerID(marker) - 1];
2378 
2379 	if (getShortReadMarkerPosition(marker) == -1)
2380 		return;
2381 
2382 	if (offset >= finish || offset + getLength(sequence) < start)
2383 		return;
2384 
2385 	fprintf(outfile, "{TLE\n");
2386 	fprintf(outfile, "src:%d\n", getShortReadMarkerID(marker));
2387 	fprintf(outfile, "off:%lld\n", (long long) (offset - start));
2388 	fprintf(outfile, "clr:0,%lld\n", (long long) getLength(sequence));
2389 	fprintf(outfile, "}\n");
2390 }
2391 
exportAMOSReverseShortMarker(FILE * outfile,ShortReadMarker * marker,Coordinate nodeLength,int wordShift,ReadSet * reads,Coordinate start,Coordinate finish)2392 static void exportAMOSReverseShortMarker(FILE * outfile,
2393 					 ShortReadMarker * marker,
2394 					 Coordinate nodeLength,
2395 					 int wordShift, ReadSet * reads,
2396 					 Coordinate start,
2397 					 Coordinate finish)
2398 {
2399 	TightString *sequence =
2400 	    reads->tSequences[getShortReadMarkerID(marker) - 1];
2401 
2402 	Coordinate offset =
2403 	    nodeLength - getShortReadMarkerPosition(marker) +
2404 	    getShortReadMarkerOffset(marker) - getLength(sequence) +
2405 	    wordShift;
2406 
2407 	if (getShortReadMarkerPosition(marker) == -1)
2408 		return;
2409 
2410 	if (offset >= finish || offset + getLength(sequence) < start)
2411 		return;
2412 
2413 	fprintf(outfile, "{TLE\n");
2414 	fprintf(outfile, "src:%d\n", getShortReadMarkerID(marker));
2415 	fprintf(outfile, "off:%lld\n", (long long) (offset - start));
2416 	fprintf(outfile, "clr:%lld,0\n", (long long) getLength(sequence));
2417 	fprintf(outfile, "}\n");
2418 }
2419 
exportAMOSContig(FILE * outfile,ReadSet * reads,Node * node,Graph * graph,Coordinate contigStart,Coordinate contigFinish,IDnum iid,IDnum internalIndex)2420 static void exportAMOSContig(FILE * outfile, ReadSet * reads, Node * node,
2421 			     Graph * graph, Coordinate contigStart,
2422 			     Coordinate contigFinish, IDnum iid,
2423 			     IDnum internalIndex)
2424 {
2425 	Coordinate start;
2426 	char str[100];
2427 	PassageMarker *marker;
2428 	ShortReadMarker *shortMarkerArray, *shortMarker;
2429 	Coordinate index, maxIndex;
2430 	int wordShift = getWordLength(graph) - 1;
2431 	char *string = expandNodeFragment(node, contigStart, contigFinish,
2432 					  getWordLength(graph));
2433 	Coordinate length = contigFinish - contigStart + wordShift;
2434 
2435 	fprintf(outfile, "{CTG\n");
2436 	fprintf(outfile, "iid:%d\n", iid);
2437 	fprintf(outfile, "eid:%d-%d\n", getNodeID(node), internalIndex);
2438 
2439 	fprintf(outfile, "seq:\n");
2440 	for (start = 0; start <= length; start += 60) {
2441 		strncpy(str, &(string[start]), 60);
2442 		str[60] = '\0';
2443 		fprintf(outfile, "%s\n", str);
2444 	}
2445 	fprintf(outfile, ".\n");
2446 
2447 	fprintf(outfile, "qlt:\n");
2448 	for (start = 0; start <= length; start += 60) {
2449 		strncpy(str, &(string[start]), 60);
2450 		str[60] = '\0';
2451 		fprintf(outfile, "%s\n", str);
2452 	}
2453 	fprintf(outfile, ".\n");
2454 
2455 	free(string);
2456 
2457 	for (marker = getMarker(node); marker != NULL;
2458 	     marker = getNextInNode(marker))
2459 		exportAMOSMarker(outfile, marker, getNodeLength(node),
2460 				 contigStart, contigFinish, wordShift);
2461 
2462 	if (readStartsAreActivated(graph)) {
2463 		shortMarkerArray = getNodeReads(node, graph);
2464 		maxIndex = getNodeReadCount(node, graph);
2465 		for (index = 0; index < maxIndex; index++) {
2466 			shortMarker =
2467 			    getShortReadMarkerAtIndex(shortMarkerArray,
2468 						      index);
2469 			exportAMOSShortMarker(outfile, shortMarker, reads,
2470 					      contigStart, contigFinish);
2471 		}
2472 
2473 		shortMarkerArray = getNodeReads(getTwinNode(node), graph);
2474 		maxIndex = getNodeReadCount(getTwinNode(node), graph);
2475 		for (index = 0; index < maxIndex; index++) {
2476 			shortMarker =
2477 			    getShortReadMarkerAtIndex(shortMarkerArray,
2478 						      index);
2479 			exportAMOSReverseShortMarker(outfile, shortMarker,
2480 						     getNodeLength(node),
2481 						     wordShift, reads,
2482 						     contigStart,
2483 						     contigFinish);
2484 		}
2485 	}
2486 
2487 	fprintf(outfile, "}\n");
2488 }
2489 
exportAMOSNode(FILE * outfile,ReadSet * reads,Node * node,Graph * graph)2490 static void exportAMOSNode(FILE * outfile, ReadSet * reads, Node * node,
2491 			   Graph * graph)
2492 {
2493 	Coordinate start = 0;
2494 	Coordinate finish;
2495 	GapMarker *gap;
2496 	IDnum smallIndex = 0;
2497 	static IDnum iid = 1;
2498 	IDnum contigIndex = iid;
2499 	int wordShift = getWordLength(graph) - 1;
2500 
2501 	for (gap = getGap(node, graph); gap; gap = getNextGap(gap)) {
2502 		finish = getGapStart(gap);
2503 		exportAMOSContig(outfile, reads, node, graph, start,
2504 				 finish, iid++, smallIndex++);
2505 		start = getGapFinish(gap);
2506 	}
2507 
2508 	finish = getNodeLength(node);
2509 	exportAMOSContig(outfile, reads, node, graph, start, finish, iid++,
2510 			 smallIndex);
2511 
2512 	if (!getGap(node, graph))
2513 		return;
2514 
2515 	start = 0;
2516 
2517 	fprintf(outfile, "{SCF\n");
2518 	fprintf(outfile, "eid:%d\n", getNodeID(node));
2519 	for (gap = getGap(node, graph); gap; gap = getNextGap(gap)) {
2520 		finish = getGapStart(gap);
2521 		fprintf(outfile, "{TLE\n");
2522 		fprintf(outfile, "off:%lld\n", (long long) start);
2523 		fprintf(outfile, "clr:0,%lld\n",
2524 			(long long) (finish - start + (long long) wordShift));
2525 		fprintf(outfile, "src:%d\n", contigIndex++);
2526 		fprintf(outfile, "}\n");
2527 		start = getGapFinish(gap);
2528 	}
2529 	finish = getNodeLength(node);
2530 	fprintf(outfile, "{TLE\n");
2531 	fprintf(outfile, "off:%lld\n", (long long) start);
2532 	fprintf(outfile, "clr:0,%lld\n", (long long) (finish - start));
2533 	fprintf(outfile, "src:%d\n", contigIndex++);
2534 	fprintf(outfile, "}\n");
2535 
2536 	fprintf(outfile, "}\n");
2537 }
2538 
exportAMOSRead(FILE * outfile,TightString * tString,IDnum index,IDnum frg_index)2539 static void exportAMOSRead(FILE * outfile, TightString * tString,
2540 			   IDnum index, IDnum frg_index)
2541 {
2542 	Coordinate start, finish;
2543 	char str[100];
2544 
2545 	fprintf(outfile, "{RED\n");
2546 	fprintf(outfile, "iid:%d\n", index);
2547 	fprintf(outfile, "eid:%d\n", index);
2548 	if (frg_index > 0)
2549 		fprintf(outfile, "frg:%d\n", frg_index);
2550 
2551 	fprintf(outfile, "seq:\n");
2552 	start = 0;
2553 	while (start <= getLength(tString)) {
2554 		finish = start + 60;
2555 		readTightStringFragment(tString, start, finish, str);
2556 		fprintf(outfile, "%s\n", str);
2557 		start = finish;
2558 	}
2559 	fprintf(outfile, ".\n");
2560 
2561 	fprintf(outfile, "qlt:\n");
2562 	start = 0;
2563 	while (start <= getLength(tString)) {
2564 		finish = start + 60;
2565 		readTightStringFragment(tString, start, finish, str);
2566 		fprintf(outfile, "%s\n", str);
2567 		start = finish;
2568 	}
2569 	fprintf(outfile, ".\n");
2570 
2571 	fprintf(outfile, "}\n");
2572 }
2573 
exportAMOSContigs(char * filename,Graph * graph,Coordinate cutoff_length,ReadSet * reads)2574 void exportAMOSContigs(char *filename, Graph * graph,
2575 		       Coordinate cutoff_length, ReadSet * reads)
2576 {
2577 	IDnum index;
2578 	Category cat;
2579 	Node *node;
2580 	FILE *outfile;
2581 
2582 	printf("Writing into AMOS file %s...\n", filename);
2583 	outfile = fopen(filename, "w");
2584 
2585 	if (outfile == NULL)
2586 		exitErrorf(EXIT_FAILURE, true, "Could not write to AMOS file %s",
2587 		       filename);
2588 
2589 	for (cat = 0; cat <= CATEGORIES; cat++)
2590 		exportAMOSLib(outfile, graph, cat);
2591 
2592 	for (index = 1; index <= reads->readCount; index++) {
2593 		if (reads->categories[index - 1] % 2 != 0 &&
2594 		    getInsertLength(graph,
2595 				    reads->categories[index - 1]) >= 0) {
2596 			fprintf(outfile, "{FRG\n");
2597 			fprintf(outfile, "lib:%d\n",
2598 				(int) ((reads->categories[index - 1] / 2) + 1));
2599 			fprintf(outfile, "rds:%d,%d\n", index,
2600 				index + 1);
2601 			fprintf(outfile, "eid:%d\n", index);
2602 			fprintf(outfile, "iid:%d\n", index);
2603 			fprintf(outfile, "typ:I\n");
2604 			fprintf(outfile, "}\n");
2605 			index++;
2606 		}
2607 	}
2608 
2609 	for (index = 1; index <= reads->readCount; index++) {
2610 		if (reads->categories[index - 1] % 2 != 0 &&
2611 		    getInsertLength(graph,
2612 				    reads->categories[index - 1]) >= 0) {
2613 			exportAMOSRead(outfile,
2614 				       reads->tSequences[index - 1], index,
2615 				       index);
2616 			index++;
2617 			exportAMOSRead(outfile,
2618 				       reads->tSequences[index - 1], index,
2619 				       index - 1);
2620 		} else {
2621 			exportAMOSRead(outfile,
2622 				       reads->tSequences[index - 1], index,
2623 				       -1);
2624 		}
2625 	}
2626 
2627 	for (index = 1; index <= nodeCount(graph); index++) {
2628 		node = getNodeInGraph(graph, index);
2629 
2630 		if (node == NULL)
2631 			continue;
2632 
2633 		if (getNodeLength(node) >= cutoff_length)
2634 			exportAMOSNode(outfile, reads, node, graph);
2635 	}
2636 
2637 	fclose(outfile);
2638 
2639 }
2640 
isNatural(Graph * graph)2641 boolean isNatural(Graph * graph)
2642 {
2643 	Node *node;
2644 	IDnum index;
2645 
2646 	for (index = 1; index < nodeCount(graph); index++) {
2647 		node = getNodeInGraph(graph, index);
2648 		if (node == NULL)
2649 			continue;
2650 
2651 		if (getNodeLength(node) == 0)
2652 			return false;
2653 
2654 		if (simpleArcCount(node) > 4)
2655 			return false;
2656 
2657 		if (simpleArcCount(getTwinNode(node)) > 4)
2658 			return false;
2659 	}
2660 
2661 	return true;
2662 }
2663 
followShortPath(Arc * arc)2664 static Node *followShortPath(Arc * arc)
2665 {
2666 	Node *node = getDestination(arc);
2667 	if (simpleArcCount(node) != 1)
2668 		return NULL;
2669 	node = getDestination(getArc(node));
2670 	return getTwinNode(node);
2671 
2672 }
2673 
checkNodeForHallidayJunction(Node * node,Graph * graph)2674 static void checkNodeForHallidayJunction(Node * node, Graph * graph)
2675 {
2676 	Node *nodeA, *nodeB, *nodeC, *nodeD;
2677 	Arc *arc1, *arc2;
2678 
2679 	if (simpleArcCount(node) != 2)
2680 		return;
2681 
2682 	arc1 = getArc(node);
2683 	arc2 = getNextArc(arc1);
2684 
2685 	nodeA = followShortPath(arc1);
2686 	if (nodeA == NULL || simpleArcCount(nodeA) != 2
2687 	    || !isUniqueBasic(nodeA))
2688 		return;
2689 
2690 	nodeB = followShortPath(arc2);
2691 	if (nodeB == NULL || simpleArcCount(nodeB) != 2
2692 	    || !isUniqueBasic(nodeB))
2693 		return;
2694 
2695 	if (nodeA == nodeB) {
2696 		return;
2697 	}
2698 
2699 	arc1 = getArc(nodeA);
2700 	arc2 = getNextArc(arc1);
2701 	nodeC = followShortPath(arc1);
2702 	if (nodeC == NULL)
2703 		return;
2704 	if (nodeC == node) {
2705 		nodeC = followShortPath(arc2);
2706 		if (nodeC == NULL || nodeC == node
2707 		    || simpleArcCount(nodeC) != 2
2708 		    || !isUniqueBasic(nodeC)) {
2709 			printf("NO %d %d %d %d\n", getNodeID(node),
2710 			       getNodeID(nodeA), getNodeID(nodeB),
2711 			       getNodeID(nodeC));
2712 			return;
2713 		}
2714 	} else {
2715 		if (simpleArcCount(nodeC) != 2 || !isUniqueBasic(nodeC)) {
2716 			puts("2");
2717 			return;
2718 		}
2719 		nodeD = followShortPath(arc2);
2720 		if (nodeD != node) {
2721 			puts("3");
2722 			return;
2723 		}
2724 	}
2725 
2726 	puts("A");
2727 
2728 	arc1 = getArc(nodeB);
2729 	arc2 = getNextArc(arc1);
2730 	nodeD = followShortPath(arc1);
2731 	if (nodeD != nodeC && nodeD != node)
2732 		return;
2733 	nodeD = followShortPath(arc2);
2734 	if (nodeD != nodeC && nodeD != node)
2735 		return;
2736 
2737 	arc1 = getArc(nodeB);
2738 	arc2 = getNextArc(arc1);
2739 	nodeD = followShortPath(arc1);
2740 	if (nodeD != nodeC && nodeD != node)
2741 		return;
2742 	nodeD = followShortPath(arc2);
2743 	if (nodeD != nodeC && nodeD != node)
2744 		return;
2745 
2746 	printf("JOHNNY HALLIDAY JUNCTION %d %d %d %d\n",
2747 	       getNodeID(node), getNodeID(nodeC), getNodeID(nodeA),
2748 	       getNodeID(nodeB));
2749 }
2750 
searchForHallidayJunction(Graph * graph)2751 void searchForHallidayJunction(Graph * graph)
2752 {
2753 	IDnum index;
2754 	Node *node;
2755 
2756 	setBaseCoverage(8);
2757 
2758 	for (index = 1; index <= nodeCount(graph); index++) {
2759 		node = getNodeInGraph(graph, index);
2760 
2761 		if (isUniqueBasic(node)) {
2762 			checkNodeForHallidayJunction(node, graph);
2763 			checkNodeForHallidayJunction(getTwinNode(node),
2764 						     graph);
2765 		}
2766 	}
2767 }
2768 
totalAssemblyLength(Graph * graph)2769 Coordinate totalAssemblyLength(Graph * graph)
2770 {
2771 	IDnum index;
2772 	Node *node;
2773 	Coordinate total = 0;
2774 
2775 	for (index = 1; index <= nodeCount(graph); index++) {
2776 		node = getNodeInGraph(graph, index);
2777 		if (node)
2778 			total += getNodeLength(node);
2779 	}
2780 
2781 	return total;
2782 }
2783 
usedReads(Graph * graph,Coordinate minContigLength)2784 IDnum usedReads(Graph * graph, Coordinate minContigLength)
2785 {
2786 	IDnum res = 0;
2787 	boolean * used = callocOrExit(sequenceCount(graph) + 1, boolean);
2788 	IDnum nodeID, readID;
2789 	Node * node;
2790 	PassageMarker * marker;
2791 	ShortReadMarker * shortReadArray, * shortReadMarker;
2792 	IDnum shortReadCount, shortReadIndex;
2793 
2794 	for(nodeID = 1; nodeID <= nodeCount(graph); nodeID++) {
2795 		node = getNodeInGraph(graph, nodeID);
2796 		if (node == NULL || getNodeLength(node) < minContigLength)
2797 			continue;
2798 
2799 		// Long reads
2800 		for(marker = getMarker(node); marker != NULL; marker = getNextInNode(marker)) {
2801 			readID = getPassageMarkerSequenceID(marker);
2802 			if (readID < 0)
2803 				readID = -readID;
2804 			used[readID] = true;
2805 		}
2806 
2807 		// Short reads
2808 		if (!readStartsAreActivated(graph))
2809 			continue;
2810 
2811 		shortReadArray = getNodeReads(node, graph);
2812 		shortReadCount = getNodeReadCount(node, graph);
2813 		for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
2814 			shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
2815 			readID = getShortReadMarkerID(shortReadMarker);
2816 			used[readID] = true;
2817 		}
2818 
2819 		shortReadArray = getNodeReads(getTwinNode(node), graph);
2820 		shortReadCount = getNodeReadCount(getTwinNode(node), graph);
2821 		for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
2822 			shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
2823 			readID = getShortReadMarkerID(shortReadMarker);
2824 			used[readID] = true;
2825 		}
2826 	}
2827 
2828 	for (readID = 1; readID <= sequenceCount(graph); readID++)
2829 		if (used[readID])
2830 			res++;
2831 
2832 	free(used);
2833 
2834 	return res;
2835 }
2836 
logFinalStats(Graph * graph,Coordinate minContigKmerLength,char * directory)2837 void logFinalStats(Graph * graph, Coordinate minContigKmerLength, char *directory)
2838 {
2839 	char *logFilename =
2840 	    mallocOrExit(strlen(directory) + 100, char);
2841 	char *statsLine =
2842 	    mallocOrExit(5000, char);
2843 	FILE *logFile;
2844 
2845 	strcpy(logFilename, directory);
2846 	strcat(logFilename, "/Log");
2847 	logFile = fopen(logFilename, "a");
2848 
2849 	if (logFile == NULL)
2850 		exitErrorf(EXIT_FAILURE, true, "Could not write to %s",
2851 		       logFilename);
2852 
2853 	sprintf
2854 	    (statsLine, "Final graph has %ld nodes and n50 of %lld, max %lld, total %lld, using %ld/%ld reads\n",
2855 	     (long) nodeCount(graph), (long long) n50(graph), (long long) maxLength(graph),
2856 	     (long long) totalAssemblyLength(graph), (long) usedReads(graph, minContigKmerLength),
2857 	     (long) sequenceCount(graph));
2858 
2859 	fprintf(logFile, "%s", statsLine);
2860 	fprintf(stdout, "%s", statsLine);
2861 
2862 	fclose(logFile);
2863 	free(logFilename);
2864 	free(statsLine);
2865 }
2866 
exportUnusedReads(Graph * graph,ReadSet * reads,Coordinate minContigKmerLength,char * directory)2867 void exportUnusedReads(Graph* graph, ReadSet * reads, Coordinate minContigKmerLength, char* directory) {
2868 	char *outFilename =
2869 	    mallocOrExit(strlen(directory) + 100, char);
2870 	FILE * outfile;
2871 	boolean * used = callocOrExit(sequenceCount(graph) + 1, boolean);
2872 	IDnum nodeID, readID;
2873 	Node * node;
2874 	PassageMarker * marker;
2875 	ShortReadMarker * shortReadArray, * shortReadMarker;
2876 	IDnum shortReadCount, shortReadIndex;
2877 	// Original
2878 	IDnum numUnusedReads;
2879 	// Original
2880 
2881 	strcpy(outFilename, directory);
2882 	strcat(outFilename, "/UnusedReads.fa");
2883 	outfile = fopen(outFilename, "w");
2884 
2885 	printf("Printing unused reads into %s\n", outFilename);
2886 
2887 	for(nodeID = 1; nodeID <= nodeCount(graph); nodeID++) {
2888 		node = getNodeInGraph(graph, nodeID);
2889 		if (node == NULL || getNodeLength(node) < minContigKmerLength)
2890 			continue;
2891 
2892 		// Long reads
2893 		for(marker = getMarker(node); marker != NULL; marker = getNextInNode(marker)) {
2894 			readID = getPassageMarkerSequenceID(marker);
2895 			if (readID < 0)
2896 				readID = -readID;
2897 			used[readID] = true;
2898 		}
2899 
2900 		// Short reads
2901 		if (!readStartsAreActivated(graph))
2902 			continue;
2903 
2904 		shortReadArray = getNodeReads(node, graph);
2905 		shortReadCount = getNodeReadCount(node, graph);
2906 		for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
2907 			shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
2908 			readID = getShortReadMarkerID(shortReadMarker);
2909 			used[readID] = true;
2910 		}
2911 
2912 		shortReadArray = getNodeReads(getTwinNode(node), graph);
2913 		shortReadCount = getNodeReadCount(getTwinNode(node), graph);
2914 		for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
2915 			shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
2916 			readID = getShortReadMarkerID(shortReadMarker);
2917 			used[readID] = true;
2918 		}
2919 	}
2920 
2921 	for (readID = 1; readID <= sequenceCount(graph); readID++)
2922 		if (!used[readID]) {
2923 			exportTightString(outfile, reads->tSequences[readID - 1], readID);
2924 			numUnusedReads++;
2925 		}
2926 
2927 	// Original
2928 	//printf("%d\n", numUnusedReads);
2929 	// Original
2930 
2931 	free(outFilename);
2932 	free(used);
2933 	fclose(outfile);
2934 }
2935 
2936 // Original
getNodeDensity(Node * node)2937 double getNodeDensity(Node * node)
2938 {
2939 	Coordinate nodeLength, nodeCoverage;
2940 
2941 	nodeLength = getNodeLength(node);
2942 	nodeCoverage = (getVirtualCoverage(node, 0)
2943 			+ getVirtualCoverage(node, 1));
2944 
2945 	return nodeCoverage /(double) nodeLength;
2946 }
2947 
makeDummySubgraphMask(Graph * graph)2948 int * makeDummySubgraphMask(Graph * graph)
2949 {
2950 	int lenSubgraphMask = 2 * nodeCount(graph) + 1;
2951 	int *subgraphMask = callocOrExit(lenSubgraphMask, int);
2952 	int index;
2953 
2954 	for (index = 0; index < lenSubgraphMask; index++)
2955 		subgraphMask[index] = 1;
2956 
2957 	return subgraphMask;
2958 }
2959 
estimated_cov_multi(Graph * graph,int * subgraphMask,double expCovMulti[100])2960 int estimated_cov_multi(Graph * graph, int * subgraphMask, double expCovMulti[100])
2961 {
2962 	double histo[LEN_HISTO_X];
2963 	Node *node;
2964 	int index, ecmIndex = 0;
2965 	double binWidth = 0.2;
2966 	int bin = 0;
2967 	double peakCov = 0.0, peakHeight = 0.0;
2968 	double lastPeakHeight = 0.0;
2969 	double SNratio = 10, thresNoiseHeight = 0.0;
2970 	int noiseCount = 0, thresNoiseCount = 5;
2971 	double thresMinPeak = 2.0;
2972 
2973 	puts("Starting peak detection...");
2974 
2975 	// Initialize expCovMulti[] and histo[]
2976 	for (index = 0; index < 100; index++)
2977 		expCovMulti[index] = -1;
2978 	for (index = 0; index < LEN_HISTO_X; index++)
2979 		histo[index] = 0.0;
2980 
2981 	// Make histogram
2982 	for (index = 1; index <= nodeCount(graph); index++) {
2983 		if (subgraphMask[index + nodeCount(graph)] == 1) {
2984 			node = getNodeInGraph(graph, index);
2985 			node = getNodeInGraph(graph, index);
2986 			if (node == NULL || getNodeLength(node) <= 0)
2987 				continue;
2988 			bin = (int) floor(getNodeDensity(node) / binWidth);
2989 			if (bin >= LEN_HISTO_X - 1)
2990 				bin = LEN_HISTO_X - 1;
2991 			histo[bin] += getNodeLength(node);
2992 		}
2993 	}
2994 
2995 	// Define length threshold of noise
2996 	 // Skip index = 0 to avoid the influence of long reads
2997 	for (index = LEN_HISTO_X - 2; index >= 1; index--) {
2998 		if (histo[index] > peakHeight)
2999 			peakHeight = histo[index];
3000 	}
3001 	thresNoiseHeight = peakHeight / (double) SNratio;
3002 	peakHeight = 0.0;
3003 
3004 	// Detect peaks
3005 	for (index = LEN_HISTO_X - 2; index >= 1; index--) {
3006 		if (histo[index] > thresNoiseHeight) {
3007 			if (histo[index] > peakHeight
3008 			    && histo[index] > lastPeakHeight) {
3009 				peakHeight = histo[index];
3010 				peakCov = (double) (index + 0.5) * binWidth;
3011 				noiseCount = 0;
3012 				continue;
3013 			}
3014 			else {
3015 				noiseCount++;
3016 			}
3017 		}
3018 		else {
3019 			lastPeakHeight = 0.0;
3020 			noiseCount++;
3021 		}
3022 
3023 		if (peakHeight > 0.0 && noiseCount >= thresNoiseCount) {
3024 			if (peakCov < thresMinPeak)
3025 				break;
3026 
3027 			expCovMulti[ecmIndex++] = peakCov;
3028 
3029 			peakCov = 0.0;
3030 			lastPeakHeight = peakHeight;
3031 			peakHeight = 0.0;
3032 			noiseCount = 0;
3033 		}
3034 	}
3035 
3036 	// Output detedted peaks
3037 	if (ecmIndex == 0) {
3038 		puts("Error!! Couldn't detect any peaks");
3039 		exit(1);
3040 	}
3041 	for (index = 0; index < ecmIndex; index++)
3042 		printf("Detected Peak Coverage : %f\n", expCovMulti[index]);
3043 
3044 	puts("Peak detection finished");
3045 
3046 	return ecmIndex;
3047 }
3048 
eliminateNullNodes(Graph * graph,int * subgraphMask)3049 static void eliminateNullNodes(Graph * graph, int * subgraphMask)
3050 {
3051 	Node *node;
3052 	int index;
3053 	int lenSubgraphMask = 2 * nodeCount(graph) + 1;
3054 
3055 	for (index = 0; index < lenSubgraphMask; index++) {
3056 		node = getNodeInGraph(graph, index - nodeCount(graph));
3057 		if (node == NULL || getNodeID(node) == 0)
3058 			subgraphMask[index] = -2;
3059 	}
3060 }
3061 
checkLongReadExistence(Graph * graph)3062 static boolean checkLongReadExistence(Graph * graph)
3063 {
3064 	int index;
3065 
3066 	for (index = 1; index <= nodeCount(graph); index++) {
3067 		if (getMarker(getNodeInGraph(graph, index)) != NULL)
3068 			return true;
3069 	}
3070 
3071 	return false;
3072 }
3073 
depthFirstSearchSubgraph(int currentIndex,Graph * graph,int * subgraphMask)3074 static void depthFirstSearchSubgraph(int currentIndex, Graph * graph, int * subgraphMask)
3075 {
3076 	Arc *activeArc = NULL;
3077 	int nextIndex = 0;
3078 
3079 	if (subgraphMask[currentIndex + nodeCount(graph)] == 0) {
3080 		// Mark "Visiting"
3081 		subgraphMask[currentIndex + nodeCount(graph)] = 1;
3082 
3083 		// Find "Unvisited" Node
3084 		for (activeArc = getArc(getNodeInGraph(graph, currentIndex));
3085 		     activeArc != NULL; activeArc = getNextArc(activeArc)) {
3086 			nextIndex = getNodeID(getDestination(activeArc));
3087 			if (subgraphMask[nextIndex] == 0) {
3088 				depthFirstSearchSubgraph(nextIndex, graph, subgraphMask);
3089 				depthFirstSearchSubgraph((nextIndex * -1), graph, subgraphMask);
3090 			}
3091 		}
3092 	}
3093 }
3094 
resetUniqueness(Graph * graph)3095 void resetUniqueness(Graph * graph)
3096 {
3097 	Node *node;
3098 	int index;
3099 
3100 	for (index = 1; index <= nodeCount(graph); index++) {
3101 		node = getNodeInGraph(graph, index);
3102 		if (node == NULL)
3103 			continue;
3104 		setUniqueness(node, false);
3105 	}
3106 }
3107 
setSubgraphMask(int * subgraphMask,int lenSubgraphMask,int before,int after)3108 static void setSubgraphMask(int * subgraphMask, int lenSubgraphMask,
3109 			    int before, int after)
3110 {
3111 	int index;
3112 
3113 	for (index = 0; index < lenSubgraphMask; index++) {
3114 		if (subgraphMask[index] == before)
3115 			subgraphMask[index] = after;
3116 	}
3117 }
3118 
getUnvisitedNodeID(int * subgraphMask,int lenSubgraphMask)3119 static int getUnvisitedNodeID(int * subgraphMask, int lenSubgraphMask)
3120 {
3121 	int index;
3122 
3123 	for (index = 0; index < lenSubgraphMask; index++) {
3124 		if (index == (lenSubgraphMask - 1) / 2)
3125 			continue;
3126 		if (subgraphMask[index] == 0)
3127 			return index - (lenSubgraphMask - 1) / 2;
3128 	}
3129 
3130 	// Visited all nodes
3131 	return 0;
3132 }
3133 
shelveSubgraphMask(int * subgraphMask,int lenSubgraphMask,int exception)3134 static void shelveSubgraphMask(int * subgraphMask, int lenSubgraphMask,
3135 			       int exception)
3136 {
3137 	int index;
3138 
3139 	for (index = 0; index < lenSubgraphMask; index++) {
3140 		if (subgraphMask[index] == exception)
3141 			subgraphMask[index] = 0;
3142 		else
3143 			subgraphMask[index] += 100;
3144 	}
3145 }
3146 
unshelveSubgraphMask(int * subgraphMask,int lenSubgraphMask)3147 static void unshelveSubgraphMask(int * subgraphMask, int lenSubgraphMask)
3148 {
3149 	int index;
3150 
3151 	for (index = 0; index < lenSubgraphMask; index++) {
3152 		if (subgraphMask[index] >= 50)
3153 			subgraphMask[index] -= 100;
3154 	}
3155 }
3156 
estimated_cov_subgraph(Graph * graph,int * subgraphMask,double expCovPandS[2],double rateChimericSubgraph)3157 static int estimated_cov_subgraph(Graph * graph, int * subgraphMask, double expCovPandS[2],
3158 				  double rateChimericSubgraph)
3159 {
3160 	int nodeIndex;
3161 	long int sumLenPrimary = 0, sumLenSecondary = 0, sumLenTotal;
3162 	double perPrimary, perSecondary;
3163 	Node *node;
3164 	double cov = 0.0;
3165 	double primary = expCovPandS[0], secondary = expCovPandS[1];
3166 
3167 	for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
3168 		if (subgraphMask[nodeIndex + nodeCount(graph)] == 1) {
3169 			node = getNodeInGraph(graph, nodeIndex);
3170 			if (node == NULL)
3171 				continue;
3172 			cov = getNodeDensity(node);
3173 			if (fabs(cov - primary) <= fabs(cov - secondary))
3174 				sumLenPrimary += getNodeLength(node);
3175 			else
3176 				sumLenSecondary += getNodeLength(node);
3177 		}
3178 	}
3179 
3180 	sumLenTotal = sumLenPrimary + sumLenSecondary;
3181 	perPrimary = (double) sumLenPrimary / sumLenTotal;
3182 	perSecondary = (double) sumLenSecondary / sumLenTotal;
3183 	if (perSecondary <= rateChimericSubgraph) {
3184 		// Non Chimeric Subgraph, belongs to Primary Species
3185 		return 1;
3186 	} else if (perPrimary <= rateChimericSubgraph) {
3187 		// Non Chimeric Subgraph, belongs to Secondary Species
3188 		return -1;
3189 	} else {
3190 		// Chimeric Subgraph
3191 		return 2;
3192 	}
3193 }
3194 
forceSeparateChimericSubgraph(Graph * graph,int * subgraphMask,double expCovPandS[2])3195 static void forceSeparateChimericSubgraph(Graph * graph, int * subgraphMask,
3196 					  double expCovPandS[2])
3197 {
3198 	int maskIndex;
3199 	int lenSubgraphMask = nodeCount(graph) * 2 + 1;
3200 	Node *node;
3201 	double cov, primary = expCovPandS[0], secondary = expCovPandS[1];
3202 
3203 	for (maskIndex = 0; maskIndex < lenSubgraphMask; maskIndex++) {
3204 		if (subgraphMask[maskIndex] == 1) {
3205 			node = getNodeInGraph(graph, maskIndex - nodeCount(graph));
3206 			if (node == NULL)
3207 				continue;
3208 			cov = getNodeDensity(node);
3209 			if (fabs(cov - primary) <= fabs(cov - secondary))
3210 				subgraphMask[maskIndex] = 2;
3211 			else
3212 				subgraphMask[maskIndex] = -1;
3213 		}
3214 	}
3215 }
3216 
judgeChimericSubgraph(Graph * graph,int * subgraphMask,double expCovPandS[2],double rateChimericSubgraph,boolean discardChimericSubgraph)3217 static void judgeChimericSubgraph(Graph * graph, int * subgraphMask, double expCovPandS[2],
3218 				  double rateChimericSubgraph, boolean discardChimericSubgraph)
3219 {
3220 	int nodeIndex;
3221 	int judgeResult = 0;
3222 	int lenSubgraphMask = nodeCount(graph) * 2 + 1;
3223 	boolean flagVisitedAllNodes = false;
3224 	int numSubgraph = 0, thresNumSubgraph = lenSubgraphMask;
3225 
3226 	// Shelve not "Visiting" subgraphs
3227 	shelveSubgraphMask(subgraphMask, lenSubgraphMask, 1);
3228 
3229 	while (!flagVisitedAllNodes) {
3230 		// Check Infinite Loop
3231 		numSubgraph++;
3232 		if (numSubgraph >= thresNumSubgraph) {
3233 			puts("Resolving Repeat Error!! Infinite Loop");
3234 			free(subgraphMask);
3235 			exit(1);
3236 		}
3237 
3238 		// Choice starting unvisited node
3239 		nodeIndex = getUnvisitedNodeID(subgraphMask, lenSubgraphMask);
3240 
3241 		printf("nodeIndex = %d\n", nodeIndex);
3242 
3243 		// Depth-first search (node & twin)
3244 		depthFirstSearchSubgraph(nodeIndex, graph, subgraphMask);
3245 		depthFirstSearchSubgraph((nodeIndex * -1), graph, subgraphMask);
3246 
3247 		// Estimate exp_cov in the Subgraph
3248 		judgeResult = estimated_cov_subgraph(graph, subgraphMask, expCovPandS,
3249 						     rateChimericSubgraph);
3250 
3251 		if (judgeResult == 1) {
3252 			printf("NonChimeric Subgraph, belongs to Primary Species\n");
3253 			setSubgraphMask(subgraphMask, lenSubgraphMask, 1, 2);
3254 		}
3255 		else if (judgeResult == -1) {
3256 			printf("NonChimeric Subgraph, belongs to Secondary Species\n");
3257 			setSubgraphMask(subgraphMask, lenSubgraphMask, 1, -1);
3258 		}
3259 		else {
3260 			printf("Chimeric Subgraph!\n");
3261 			if (discardChimericSubgraph)
3262 				setSubgraphMask(subgraphMask, lenSubgraphMask, 1, -2);
3263 			else
3264 				forceSeparateChimericSubgraph(graph, subgraphMask, expCovPandS);
3265 		}
3266 
3267 		// Judge whether all nodes in Subgraph have visited or not
3268 		if (getUnvisitedNodeID(subgraphMask, lenSubgraphMask) == 0)
3269 			flagVisitedAllNodes = true;
3270 	}
3271 
3272 	// Unshelve not "Visiting" subgraphs
3273 	unshelveSubgraphMask(subgraphMask, lenSubgraphMask);
3274 
3275 	printf("Separating Chimeric Subgraphs Finished!\n");
3276 }
3277 
checkPrimaryExpCovExistence(double expCovMulti[2],double expCovPandS[2])3278 static boolean checkPrimaryExpCovExistence(double expCovMulti[2], double expCovPandS[2])
3279 {
3280 	int index;
3281 	double expCov, primary = expCovPandS[0], secondary = expCovPandS[1];
3282 
3283 	if (secondary == -1)
3284 		secondary = primary / (double) 2;
3285 
3286 	for (index = 0; index < 2; index++) {
3287 		expCov = expCovMulti[index];
3288 		if (expCov == -1)
3289 			break;
3290 		if (fabs(expCov - primary) <= fabs(expCov - secondary)) {
3291 			printf("Primary exp_cov (%f <-> %f), Exist\n",
3292 			       primary, secondary);
3293 			return true;
3294 		}
3295 	}
3296 
3297 	printf("Primary exp_cov (%f <-> %f), NOT exist\n", primary, secondary);
3298 	return false;
3299 }
3300 
judgeSkip(Graph * graph,int * subgraphMask)3301 static boolean judgeSkip(Graph * graph, int * subgraphMask)
3302 {
3303 	int nodeIndex;
3304 	int lenSubgraphMask = nodeCount(graph) * 2 + 1;
3305 	int countSkip = 0;
3306 	double insertLen = getInsertLength(graph, 1);
3307 	double skipCandidateNodeLen = 0.0;
3308 	Node *node = NULL;
3309 
3310 	for (nodeIndex = 1; nodeIndex < nodeCount(graph); nodeIndex++) {
3311 		if (subgraphMask[nodeIndex + nodeCount(graph)] == 1) {
3312 			countSkip++;
3313 			node = getNodeInGraph(graph, nodeIndex);
3314 			skipCandidateNodeLen = getNodeLength(node);
3315 		}
3316 	}
3317 
3318 	if (countSkip <= 1 && skipCandidateNodeLen < insertLen) {
3319 		setSubgraphMask(subgraphMask, lenSubgraphMask, 1, -2);
3320 		printf("Skipped\n");
3321 		return true;
3322 	}
3323 	else
3324 		return false;
3325 }
3326 
printActiveNodes(int * subgraphMask,int lenSubgraphMask)3327 static void printActiveNodes(int * subgraphMask, int lenSubgraphMask)
3328 {
3329 	int index;
3330 
3331 	printf("Active Nodes : ");
3332 	for (index = 0; index < lenSubgraphMask; index++) {
3333 		if (subgraphMask[index] == 2)
3334 			printf("%d ", index - (lenSubgraphMask - 1) / 2);
3335 	}
3336 
3337 	printf("\n");
3338 }
3339 
resolveRepeatOfAllSubgraphs(Graph * graph,ReadSet * reads,double expCovMulti[100],boolean * dubious,boolean force_jumps,int argPebbleRounds,double rateChimericSubgraph,boolean discardChimericSubgraph,double repeatNodeCovSD)3340 void resolveRepeatOfAllSubgraphs(Graph * graph, ReadSet * reads, double expCovMulti[100],
3341 				 boolean * dubious, boolean force_jumps, int argPebbleRounds,
3342 				 double rateChimericSubgraph, boolean discardChimericSubgraph,
3343 				 double repeatNodeCovSD)
3344 {
3345 	int nodeIndex = 1, ecmIndex = 0;
3346 	int numSubgraph = 0;
3347 	int thresNumSubgraph = nodeCount(graph) * 2;
3348 	int lenSubgraphMask = 2 * nodeCount(graph) + 1;
3349 	int *subgraphMask = callocOrExit(lenSubgraphMask, int);
3350 	double expCovSubgraph = 0.0;
3351 	double expCovPandS[2];
3352 	int numPeaks = 0;
3353 	int countInterRepeatLoop = 0, thresInterRepeatLoop = 20;
3354 	int pebbleRounds = argPebbleRounds;
3355 	boolean flagLongRead = false, flagVisitedAllNodes = false;
3356 
3357 	puts("\nResolving Repeats for each subgraph\n");
3358 
3359 	// Eliminate NULL nodes
3360 	eliminateNullNodes(graph, subgraphMask);
3361 
3362 	// Check whether long reads are in the input sequences
3363 	flagLongRead = checkLongReadExistence(graph);
3364 
3365 	// Print Expected Coverages
3366 	for (ecmIndex = 0; expCovMulti[ecmIndex] > 0.001; ecmIndex++)
3367 		printf("Expected Coverage %d : %f\n", ecmIndex+1, expCovMulti[ecmIndex]);
3368 	ecmIndex = 0;
3369 
3370 	/*
3371 	// Detect peaks from whole Graph
3372 	setSubgraphMask(subgraphMask, lenSubgraphMask, 0, 1);
3373 	estimated_cov_multi(graph, subgraphMask, expCovMulti);
3374 	setSubgraphMask(subgraphMask, lenSubgraphMask, 1, 0);
3375 	*/
3376 
3377 	while (!flagVisitedAllNodes) {
3378 		// Set expCovPandS
3379 		if (expCovMulti[ecmIndex] != -1) {
3380 			expCovPandS[0] = expCovMulti[ecmIndex++];
3381 			expCovPandS[1] = expCovMulti[ecmIndex];
3382 		}
3383 		printf("\nPrimary exp_cov : %f\n", expCovPandS[0]);
3384 
3385 		// Resolve repeats for each Subgraph
3386 		while (true) {
3387 			// Check Infinite Loop
3388 			numSubgraph++;
3389 			if (numSubgraph >= thresNumSubgraph) {
3390 				puts("Resolving Repeat Error!! Infinite Loop");
3391 				free(subgraphMask);
3392 				exit(1);
3393 			}
3394 
3395 			// Choice starting unvisited node
3396 			nodeIndex = getUnvisitedNodeID(subgraphMask, lenSubgraphMask);
3397 
3398 			printf("nodeIndex = %d\n", nodeIndex);
3399 
3400 			// Depth-first search (node & twin)
3401 			depthFirstSearchSubgraph(nodeIndex, graph, subgraphMask);
3402 			depthFirstSearchSubgraph((nodeIndex * -1), graph, subgraphMask);
3403 
3404 			// Estimate the number of peaks
3405 			numPeaks = estimated_cov_subgraph(graph, subgraphMask, expCovPandS,
3406 							  rateChimericSubgraph);
3407 
3408 			// Judge whether the Subgraph is chimeric or not
3409 			if (numPeaks >= 2) {
3410 				puts("Multiple Peaks Detected!");
3411 				// Identify and Separate InterRepeats
3412 				while (identifyAndSeparateInterRepeats(graph, expCovPandS,
3413 								       repeatNodeCovSD)) {
3414 					// Check Infinite Loop
3415 					if (countInterRepeatLoop++ >= thresInterRepeatLoop) {
3416 						puts("Force-quitted to Identify InterRepeats");
3417 						eliminateNullNodes(graph, subgraphMask);
3418 						break;
3419 						//puts("Identifying InterRepeat Error! Infinite Loop");
3420 						//free(subgraphMask);
3421 						//exit(1);
3422 					}
3423 					// Eliminate NULL nodes
3424 					eliminateNullNodes(graph, subgraphMask);
3425 				}
3426 				// Judge whether each Subgraph is chimeric or not
3427 				judgeChimericSubgraph(graph, subgraphMask, expCovPandS,
3428 						      rateChimericSubgraph, discardChimericSubgraph);
3429 			}
3430 			else if (numPeaks == 1)
3431 				setSubgraphMask(subgraphMask, lenSubgraphMask, 1, 2);
3432 
3433 			// Judge whether all nodes in Subgraphs have visited or not
3434 			if (getUnvisitedNodeID(subgraphMask, lenSubgraphMask) != 0) {
3435 				printf("Unvisited Node : %d\n",
3436 				       getUnvisitedNodeID(subgraphMask, lenSubgraphMask));
3437 				continue;
3438 			}
3439 			else {
3440 				printf("\nGo to Assembly!\n");
3441 				//printActiveNodes(subgraphMask, lenSubgraphMask);
3442 				expCovSubgraph = expCovPandS[0];
3443 				printf("exp_cov = %f\n", expCovSubgraph);
3444 			}
3445 
3446 			// -------------------- Assemble in the Subgraph --------------------
3447 			// Judge unique or repeat
3448 			identifyUniqueNodesSubgraph(graph, subgraphMask,
3449 						    isUniqueSolexaSubgraph, expCovSubgraph);
3450 			// Rock Band in the Subgraph
3451 			if (flagLongRead)
3452 				readCoherentSubgraph(graph, expCovSubgraph, reads, subgraphMask);
3453 			// Pebble in the Subgraph
3454 			for (pebbleRounds = argPebbleRounds; pebbleRounds > 0; pebbleRounds--)
3455 				exploitShortReadPairs(graph, reads, dubious, force_jumps);
3456 			// Print "Finished"
3457 			printf("Subgraph Assembly Finished!\n\n");
3458 			// ------------------------------------------------------------------
3459 
3460 			// Eliminate NULL Nodes
3461 			eliminateNullNodes(graph, subgraphMask);
3462 
3463 			// Reset uniqueness
3464 			resetUniqueness(graph);
3465 
3466 			// Set "2" -> "-2", "-1" -> "0"
3467 			setSubgraphMask(subgraphMask, lenSubgraphMask, 2, -2);
3468 			setSubgraphMask(subgraphMask, lenSubgraphMask, -1, 0);
3469 
3470 			// Judge whether all nodes in Graph have visited or not
3471 			if (getUnvisitedNodeID(subgraphMask, lenSubgraphMask) == 0)
3472 				flagVisitedAllNodes = true;
3473 			break;
3474 		}
3475 	}
3476 
3477 	// Resolved Successfully
3478 	puts("Resolved Successfully!\n");
3479 	free(subgraphMask);
3480 }
3481 // Original
3482