1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
4 This file is part of Velvet.
5
6 Velvet is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 Velvet is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Velvet; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <math.h>
24 #include <string.h>
25
26 #include "globals.h"
27 #include "graph.h"
28 #include "readSet.h"
29 #include "tightString.h"
30 #include "passageMarker.h"
31 #include "concatenatedGraph.h"
32 #include "readCoherentGraph.h"
33 #include "fibHeap.h"
34 #include "utility.h"
35 // Original
36 #include "shortReadPairs.h"
37 // Original
38
39 // Original
40 #define LEN_HISTO_X 10000
41 // Original
42
surveyPath(PassageMarker * marker)43 static void surveyPath(PassageMarker * marker)
44 {
45 Coordinate length = 0;
46 Coordinate realLength = 0;
47 PassageMarker *current = marker;
48
49 if (passageMarkerDirection(current) < 0)
50 current = getTwinMarker(current);
51
52 for (; current != NULL; current = getNextInSequence(current)) {
53 length +=
54 getNodeLength(getNode(current)) -
55 getStartOffset(current) - getFinishOffset(current);
56 if (getPassageMarkerFinish(current) > 0)
57 realLength = getPassageMarkerFinish(current);
58 }
59
60 printf("SURVEY %ld %lld %lld\n", (long) getAbsolutePassMarkerSeqID(marker),
61 (long long) realLength, (long long) length);
62 }
63
surveyPaths(Graph * graph)64 void surveyPaths(Graph * graph)
65 {
66 IDnum ID;
67 PassageMarker *marker;
68
69 for (ID = 1; ID <= nodeCount(graph); ID++)
70 for (marker = getMarker(getNodeInGraph(graph, ID));
71 marker != NULL; marker = getNextInNode(marker))
72 if ((passageMarkerDirection(marker) > 0
73 && isInitial(marker))
74 || (passageMarkerDirection(marker) < 0
75 && isTerminal(marker)))
76 surveyPath(marker);
77 }
78
copyMarkers(Node * node)79 static PassageMarkerList *copyMarkers(Node * node)
80 {
81 PassageMarkerList *list = NULL;
82 PassageMarkerList *new;
83 PassageMarker *currentMarker;
84
85 for (currentMarker = getMarker(node); currentMarker != NULL;
86 currentMarker = getNextInNode(currentMarker)) {
87 new = newPassageMarkerList(currentMarker, list);
88 list = new;
89 }
90
91 return list;
92 }
93
removeDead(PassageMarkerList ** list)94 static boolean removeDead(PassageMarkerList ** list)
95 {
96 PassageMarkerList *current, *next;
97 boolean removed = false;
98
99 if (*list == NULL)
100 return false;
101
102 current = *list;
103
104 while (current->next != NULL) {
105 next = current->next;
106
107 if (isTerminal(next->marker)) {
108 removed = true;
109 current->next = next->next;
110 deallocatePassageMarkerList(next);
111 } else
112 current = current->next;
113 }
114
115 current = *list;
116 if (isTerminal(current->marker)) {
117 removed = true;
118 *list = current->next;
119 deallocatePassageMarkerList(current);
120 }
121
122 return removed;
123 }
124
chooseDestination(PassageMarkerList * list)125 static Node *chooseDestination(PassageMarkerList * list)
126 {
127 PassageMarkerList *current = list;
128 Node *destination;
129
130 destination = getNode(getNextInSequence(current->marker));
131 while (current != NULL) {
132 if (getNode(getNextInSequence(current->marker)) !=
133 destination)
134 return NULL;
135 current = current->next;
136 }
137
138 return destination;
139 }
140
destroyPassageMarkerList(PassageMarkerList ** list)141 static void destroyPassageMarkerList(PassageMarkerList ** list)
142 {
143 PassageMarkerList *ptr;
144
145 while (*list != NULL) {
146 ptr = *list;
147 *list = ptr->next;
148 deallocatePassageMarkerList(ptr);
149 }
150 }
151
updateMarkers(PassageMarkerList * list)152 static void updateMarkers(PassageMarkerList * list)
153 {
154 PassageMarkerList *current;
155
156 for (current = list; current != NULL; current = current->next)
157 current->marker = getNextInSequence(current->marker);
158 }
159
computeSubsequentNodesLength(Node * node)160 Coordinate computeSubsequentNodesLength(Node * node)
161 {
162 PassageMarkerList *list;
163 Node *nextNode;
164 Coordinate totalLength = 0;
165 boolean uncertain = false;
166
167 list = copyMarkers(node);
168
169 while (true) {
170 if (removeDead(&list))
171 uncertain = true;
172
173 if (uncertain && simpleArcCount(node) > 1) {
174 destroyPassageMarkerList(&list);
175 return totalLength;
176 }
177
178 if (list == NULL)
179 return totalLength;
180
181 nextNode = chooseDestination(list);
182 if (nextNode == NULL) {
183 destroyPassageMarkerList(&list);
184 return totalLength;
185 }
186
187 totalLength += getNodeLength(nextNode);
188
189 updateMarkers(list);
190 }
191
192 // Impossible instruction
193 return -1;
194 }
195
computeVirtualNodeLength(Node * node)196 Coordinate computeVirtualNodeLength(Node * node)
197 {
198 Coordinate virtualLength;
199
200 if (node == NULL)
201 return 0;
202
203 virtualLength = getNodeLength(node);
204
205 virtualLength += computeSubsequentNodesLength(node);
206 virtualLength += computeSubsequentNodesLength(getTwinNode(node));
207
208 return virtualLength;
209 }
210
testForBizarreMarkers(Graph * graph)211 void testForBizarreMarkers(Graph * graph)
212 {
213 IDnum index;
214 Node *node;
215 PassageMarker *marker;
216
217 for (index = 1; index < nodeCount(graph); index++) {
218 node = getNodeInGraph(graph, index);
219 if (node == NULL)
220 continue;
221
222 for (marker = getMarker(node); marker != NULL;
223 marker = getNextInNode(marker)) {
224 if (getTwinMarker(marker) == NULL)
225 exitErrorf(EXIT_FAILURE, false, "Bizarre marker %s",
226 readPassageMarker(marker));
227 }
228 }
229 }
230
231 // COunts how many nodes are dead-ends
countSinksAndSources(Graph * graph)232 IDnum countSinksAndSources(Graph * graph)
233 {
234 IDnum nodeIndex;
235 IDnum result = 0;
236 Node *node;
237
238 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
239 node = getNodeInGraph(graph, nodeIndex);
240 if (arcCount(node) == 0
241 || arcCount(getTwinNode(node)) == 0)
242 result++;
243 }
244
245 return result;
246 }
247
248 // Counts how many nodes have several arcs either going in or coming out
countTangles(Graph * graph)249 IDnum countTangles(Graph * graph)
250 {
251 IDnum nodeIndex;
252 IDnum result = 0;
253 Node *node;
254
255 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
256 node = getNodeInGraph(graph, nodeIndex);
257 if (arcCount(node) > 1 || arcCount(getTwinNode(node)) > 1)
258 result++;
259 }
260 return result;
261 }
262
263 // Counts nodes with exactly one incoming and one outgoing arc
countRepeats(Graph * graph)264 IDnum countRepeats(Graph * graph)
265 {
266 IDnum nodeIndex;
267 IDnum result = 0;
268 Node *node;
269
270 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
271 node = getNodeInGraph(graph, nodeIndex);
272 if (arcCount(node) == 1
273 && arcCount(getTwinNode(getDestination(getArc(node))))
274 == 1)
275 if (getNodeID
276 (getTwinNode(getDestination(getArc(node)))) < 0
277 ||
278 getNodeID(getTwinNode
279 (getDestination(getArc(node)))) >
280 getNodeID(node))
281 result++;
282 }
283 return result;
284
285 }
286
287 // Counts the number of markers for one node
nodeGenomicMultiplicity(Node * node,IDnum firstStrain)288 int nodeGenomicMultiplicity(Node * node, IDnum firstStrain)
289 {
290 int counter = 0;
291 PassageMarker *marker;
292
293 if (node == NULL)
294 return 0;
295
296 for (marker = getMarker(node); marker != NULL;
297 marker = getNextInNode(marker))
298 if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
299 counter++;
300
301 return counter;
302 }
303
304 // Counts the number of markers for one node
nodeMultiplicity(Node * node)305 IDnum nodeMultiplicity(Node * node)
306 {
307 int counter = 0;
308 PassageMarker *marker;
309
310 if (node == NULL)
311 return 0;
312
313 marker = getMarker(node);
314 while (marker != NULL) {
315 counter++;
316 marker = getNextInNode(marker);
317 }
318 return counter;
319 }
320
321 // Prints out a set of predefined statistics for one node
nodeStatistics(Node * node)322 char *nodeStatistics(Node * node)
323 {
324 char *s = mallocOrExit(100, char);
325 sprintf(s, "NODE %ld\t%lld\t%i\t%i\t%ld", (long) getNodeID(node),
326 (long long) getNodeLength(node), simpleArcCount(node),
327 simpleArcCount(getTwinNode(node)), (long) nodeMultiplicity(node));
328 return s;
329 }
330
331 // Prints out a table of statistics for all the nodes of the graph
displayGraphStatistics(Graph * graph)332 void displayGraphStatistics(Graph * graph)
333 {
334 IDnum nodeIndex;
335 printf("NODE ID\tlgth\tFwd\tBck\tMult\n");
336 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++)
337 printf("%s\n",
338 nodeStatistics(getNodeInGraph(graph, nodeIndex)));
339 }
340
displayNodeStatisticsSelective(Node * node,IDnum first)341 void displayNodeStatisticsSelective(Node * node, IDnum first)
342 {
343 PassageMarker *marker;
344 boolean originalGenome;
345 boolean strain;
346
347 if (node == NULL)
348 return;
349
350 marker = getMarker(node);
351
352 originalGenome = false;
353 strain = false;
354 while (marker != NULL) {
355 if (getAbsolutePassMarkerSeqID(marker) < first)
356 originalGenome = true;
357
358 if (getAbsolutePassMarkerSeqID(marker) >= first)
359 strain = true;
360
361 marker = getNextInNode(marker);
362 }
363
364 printf("%s", nodeStatistics(node));
365 if (originalGenome && !strain)
366 printf("\tTRUE");
367 else
368 printf("\tFALSE");
369
370 if (originalGenome && strain)
371 printf("\tTRUE");
372 else
373 printf("\tFALSE");
374
375
376 if (strain && !originalGenome)
377 puts("\tTRUE");
378 else
379 puts("\tFALSE");
380
381 }
382
displayGraphStatisticsSelective(Graph * graph,IDnum first)383 void displayGraphStatisticsSelective(Graph * graph, IDnum first)
384 {
385 IDnum index;
386
387 for (index = 1; index <= nodeCount(graph); index++)
388 displayNodeStatisticsSelective(getNodeInGraph
389 (graph, index), first);
390
391 }
392
isOnlyGenome(Node * node,IDnum firstStrain)393 boolean isOnlyGenome(Node * node, IDnum firstStrain)
394 {
395 PassageMarker *marker;
396
397 for (marker = getMarker(node); marker != NULL;
398 marker = getNextInNode(marker))
399 if (getAbsolutePassMarkerSeqID(marker) >= firstStrain)
400 return false;
401
402 return true;
403 }
404
isOnlyStrain(Node * node,IDnum firstStrain)405 boolean isOnlyStrain(Node * node, IDnum firstStrain)
406 {
407 PassageMarker *marker;
408
409 for (marker = getMarker(node); marker != NULL;
410 marker = getNextInNode(marker))
411 if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
412 return false;
413
414 return true;
415 }
416
isSNP(Node * node,IDnum firstStrain,int WORDLENGTH)417 boolean isSNP(Node * node, IDnum firstStrain, int WORDLENGTH)
418 {
419 IDnum sequence;
420 Coordinate position;
421
422 if (getNodeLength(node) != WORDLENGTH)
423 return false;
424
425 if (getMarker(node) == NULL)
426 return false;
427
428 if (getAbsolutePassMarkerSeqID(getMarker(node)) >= firstStrain)
429 return false;
430
431 if (getNextInNode(getMarker(node)) != NULL)
432 return false;
433
434 if (arcCount(node) != 1)
435 return false;
436
437 if (arcCount(getTwinNode(node)) != 1)
438 return false;
439
440 if (isOnlyGenome(getDestination(getArc(node)), firstStrain))
441 return false;
442
443 if (isOnlyGenome
444 (getDestination(getArc(getTwinNode(node))), firstStrain))
445 return false;
446
447 sequence = getPassageMarkerSequenceID(getMarker(node));
448
449 if (sequence >= 0)
450 position = getPassageMarkerStart(getMarker(node));
451 else {
452 sequence = -sequence;
453 position = getPassageMarkerFinish(getMarker(node));
454 }
455
456 printf("SNP\t%lld\t%ld\n", (long long) position, (long) sequence);
457
458 return true;
459 }
460
strainMarkerCount(Node * node,IDnum firstStrain)461 IDnum strainMarkerCount(Node * node, IDnum firstStrain)
462 {
463 PassageMarker *marker;
464 IDnum counter = 0;
465
466 for (marker = getMarker(node); marker != NULL;
467 marker = getNextInNode(marker))
468 if (getAbsolutePassMarkerSeqID(marker) >= firstStrain)
469 counter++;
470
471 return counter;
472 }
473
isError(Node * node,IDnum firstStrain)474 boolean isError(Node * node, IDnum firstStrain)
475 {
476 return (strainMarkerCount(node, firstStrain) < 5);
477 }
478
removeStrainMarkers(Node * node,IDnum firstStrain)479 void removeStrainMarkers(Node * node, IDnum firstStrain)
480 {
481 PassageMarker *marker;
482 PassageMarker *tmp = NULL;
483
484 marker = getMarker(node);
485 while (marker != NULL) {
486 tmp = getNextInNode(marker);
487
488 if (getAbsolutePassMarkerSeqID(marker) >= firstStrain)
489 destroyPassageMarker(marker);
490 marker = tmp;
491 }
492
493 }
494
chainSawCorrection(Graph * graph,int minMult)495 void chainSawCorrection(Graph * graph, int minMult)
496 {
497 IDnum nodeIndex;
498 IDnum removed = 0;
499
500 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
501 if (markerCount(getNodeInGraph(graph, nodeIndex)) <
502 minMult) {
503 destroyNode(getNodeInGraph(graph, nodeIndex),
504 graph);
505 removed++;
506 }
507 }
508
509 printf("%d dubious nodes removed\n", removed);
510 concatenateGraph(graph);
511 printf("%d node in the end\n", nodeCount(graph));
512 }
513
grossErrorRemoval(Graph * graph,IDnum firstStrain)514 void grossErrorRemoval(Graph * graph, IDnum firstStrain)
515 {
516 IDnum nodeIndex;
517 IDnum removed = 0;
518
519 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
520 if (isError(getNodeInGraph(graph, nodeIndex), firstStrain)) {
521 if (isOnlyStrain
522 (getNodeInGraph(graph, nodeIndex),
523 firstStrain)) {
524 destroyNode(getNodeInGraph
525 (graph, nodeIndex), graph);
526 removed++;
527 } else
528 removeStrainMarkers(getNodeInGraph
529 (graph, nodeIndex),
530 firstStrain);
531 }
532 }
533
534 printf("%d dubious nodes removed\n", removed);
535 concatenateGraph(graph);
536 printf("%d node in the end\n", nodeCount(graph));
537 }
538
countSNPs(Graph * graph,IDnum firstStrain,int WORDLENGTH)539 IDnum countSNPs(Graph * graph, IDnum firstStrain, int WORDLENGTH)
540 {
541 IDnum index;
542 IDnum counter = 0;
543
544 for (index = 1; index < nodeCount(graph); index++)
545 if (isSNP
546 (getNodeInGraph(graph, index), firstStrain,
547 WORDLENGTH))
548 counter++;
549
550 return counter;
551 }
552
commonLength(Node * node,IDnum firstStrain)553 Coordinate commonLength(Node * node, IDnum firstStrain)
554 {
555 PassageMarker *marker = getMarker(node);
556 int orig = 0;
557 int strain = 0;
558
559 while (marker != NULL) {
560 if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
561 orig++;
562 else
563 strain++;
564 marker = getNextInNode(marker);
565 }
566
567 if (orig == 0 || strain == 0)
568 return 0;
569
570 return (Coordinate) orig *getNodeLength(node);
571 }
572
countCommonLength(Graph * graph,IDnum firstStrain)573 Coordinate countCommonLength(Graph * graph, IDnum firstStrain)
574 {
575 IDnum index;
576 Coordinate res = 0;
577
578 for (index = 1; index <= nodeCount(graph); index++)
579 res +=
580 commonLength(getNodeInGraph(graph, index),
581 firstStrain);
582
583 return res;
584 }
585
isMixed(Node * node,IDnum firstStrain)586 boolean isMixed(Node * node, IDnum firstStrain)
587 {
588 return !isOnlyStrain(node, firstStrain)
589 && !isOnlyGenome(node, firstStrain);
590 }
591
countLocalBreakpoints(PassageMarker * marker,IDnum firstStrain)592 int countLocalBreakpoints(PassageMarker * marker, IDnum firstStrain)
593 {
594 PassageMarker *localMarker;
595 IDnum sequenceID = getAbsolutePassMarkerSeqID(marker);
596 IDnum localSeqID;
597 Coordinate start = getPassageMarkerStart(marker);
598 Node *localNode = getNode(marker);
599 Node *destination;
600 Arc *arc;
601 int arcCount = 0;
602 int arcIndex;
603 boolean *arcStatus;
604 int counter = 0;
605
606 if (!isMixed(localNode, firstStrain))
607 return 0;
608
609 // Count arcs
610 for (arc = getArc(localNode); arc != NULL; arc = getNextArc(arc))
611 arcCount++;
612 arcStatus = callocOrExit(arcCount, boolean);
613 // Check for other genomic markers in node
614 for (localMarker = getMarker(localNode); localMarker != NULL;
615 localMarker = getNextInNode(localMarker)) {
616 localSeqID = getAbsolutePassMarkerSeqID(localMarker);
617 if (localSeqID >= firstStrain)
618 continue;
619
620 if (localSeqID < sequenceID)
621 return 0;
622
623 if (localSeqID == sequenceID
624 && getPassageMarkerStart(localMarker) < start)
625 return 0;
626
627 destination = getNode(getNextInSequence(localMarker));
628
629 // Enter into table:
630 arcIndex = 0;
631 for (arc = getArc(localNode);
632 getDestination(arc) != destination;
633 arc = getNextArc(arc))
634 arcIndex++;
635 arcStatus[arcIndex] = true;
636 }
637
638 // Check other nodes
639 arcIndex = 0;
640 for (arc = getArc(localNode); arc != NULL; arc = getNextArc(arc)) {
641 if (!arcStatus[arcIndex]
642 && isMixed(getDestination(arc), firstStrain))
643 counter++;
644 arcIndex++;
645 }
646
647 free(arcStatus);
648 return counter;
649 }
650
countBreakpoints(Graph * graph,IDnum firstStrain)651 IDnum countBreakpoints(Graph * graph, IDnum firstStrain)
652 {
653 PassageMarker *marker;
654 IDnum seqIndex;
655 IDnum total = 0;
656
657 for (seqIndex = 1; seqIndex < firstStrain; seqIndex++) {
658 marker = getMarker(getNodeInGraph(graph, seqIndex));
659 while (marker != NULL) {
660 total +=
661 countLocalBreakpoints(marker, firstStrain);
662 marker = getNextInSequence(marker);
663 }
664 }
665
666 return total;
667 }
668
countStrainOnlyNodes(Graph * graph,IDnum firstStrain)669 IDnum countStrainOnlyNodes(Graph * graph, IDnum firstStrain)
670 {
671 IDnum index;
672 IDnum total = 0;
673
674 for (index = 1; index <= nodeCount(graph); index++)
675 if (isOnlyStrain
676 (getNodeInGraph(graph, index), firstStrain))
677 total++;
678
679 return total;
680 }
681
countStrainOnlyBp(Graph * graph,IDnum firstStrain)682 Coordinate countStrainOnlyBp(Graph * graph, IDnum firstStrain)
683 {
684 IDnum index;
685 Coordinate total = 0;
686 Node *node;
687 Arc *arc;
688 Coordinate local;
689
690 for (index = 1; index <= nodeCount(graph); index++) {
691 if (isOnlyStrain
692 (getNodeInGraph(graph, index), firstStrain)) {
693 node = getNodeInGraph(graph, index);
694 local = getNodeLength(node);
695
696 for (arc = getArc(node); arc != NULL;
697 arc = getNextArc(arc)) {
698 if (!isOnlyStrain
699 (getDestination(arc), firstStrain)) {
700 local -= 24;
701 break;
702 }
703 }
704
705 for (arc = getArc(getTwinNode(node)); arc != NULL;
706 arc = getNextArc(arc)) {
707 if (!isOnlyStrain
708 (getDestination(arc), firstStrain)) {
709 local -= 24;
710 break;
711 }
712 }
713
714 if (local < 0)
715 local = 1;
716
717 total += local;
718 }
719 }
720
721 return total;
722 }
723
genomeMarkerCount(Node * node,IDnum firstStrain)724 IDnum genomeMarkerCount(Node * node, IDnum firstStrain)
725 {
726 PassageMarker *marker;
727 IDnum counter = 0;
728
729 for (marker = getMarker(node); marker != NULL;
730 marker = getNextInNode(marker))
731 if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
732 counter++;
733
734 return counter;
735 }
736
readCoverage(Node * node)737 Coordinate readCoverage(Node * node)
738 {
739 PassageMarker *marker;
740 Coordinate sum = 0;
741
742 for (marker = getMarker(node); marker != NULL;
743 marker = getNextInNode(marker)) {
744 if (getTwinMarker(marker) == NULL) {
745 printf("Node %d screwed up\n", getNodeID(node));
746 printf("Sequence %d\n",
747 getPassageMarkerSequenceID(marker));
748 abort();
749 }
750 sum += getPassageMarkerLength(marker);
751 }
752
753 return sum;
754 }
755
refReadCoverage(Node * node,IDnum firstStrain)756 Coordinate refReadCoverage(Node * node, IDnum firstStrain)
757 {
758 PassageMarker *marker;
759 Coordinate sum = 0;
760
761 for (marker = getMarker(node); marker != NULL;
762 marker = getNextInNode(marker))
763 if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
764 sum += getPassageMarkerLength(marker);
765
766 return sum;
767 }
768
newReadCoverage(Node * node,IDnum firstStrain)769 Coordinate newReadCoverage(Node * node, IDnum firstStrain)
770 {
771 PassageMarker *marker;
772 Coordinate sum = 0;
773
774 for (marker = getMarker(node); marker != NULL;
775 marker = getNextInNode(marker))
776 if (getAbsolutePassMarkerSeqID(marker) >= firstStrain) {
777 sum += getPassageMarkerLength(marker);
778 if (getPassageMarkerLength(marker) < 0)
779 printf("Bizarre marker %d at node %d\n",
780 getPassageMarkerSequenceID(marker),
781 getNodeID(node));
782 }
783
784 return sum;
785 }
786
readStarts(Node * node)787 IDnum readStarts(Node * node)
788 {
789 PassageMarker *marker;
790 IDnum sum = 0;
791
792 if (node == NULL)
793 return 0;
794
795 for (marker = getMarker(node); marker != NULL;
796 marker = getNextInNode(marker)) {
797 if (getPassageMarkerSequenceID(marker) > 0
798 && isInitial(marker))
799 sum++;
800 else if (getPassageMarkerSequenceID(marker) < 0
801 && isTerminal(marker))
802 sum++;
803 }
804
805 return sum;
806 }
807
printShortCounts(FILE * outfile,Node * node,Graph * graph,ReadSet * reads)808 static void printShortCounts(FILE * outfile, Node * node, Graph * graph, ReadSet * reads) {
809 IDnum counts[CATEGORIES];
810 Category cat;
811 IDnum shortReadIndex;
812 IDnum readID;
813 IDnum shortReadCount;
814 ShortReadMarker *array;
815 ShortReadMarker *marker;
816
817 if (!readStartsAreActivated(graph)) {
818 for (cat = 0; cat < CATEGORIES; cat++)
819 fprintf(outfile, "\tN/A");
820 return;
821 }
822
823 shortReadCount = getNodeReadCount(node, graph);
824 array = getNodeReads(node, graph);
825
826 for (cat = 0; cat < CATEGORIES; cat++)
827 counts[cat] = 0;
828
829 for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
830 marker = getShortReadMarkerAtIndex(array, shortReadIndex);
831 readID = getShortReadMarkerID(marker);
832 cat = reads->categories[readID - 1] / 2;
833 counts[cat]++;
834 }
835
836 for (cat = 0; cat < CATEGORIES; cat++)
837 fprintf(outfile, "\t%li", (long) counts[cat]);
838 }
839
displayGeneralStatistics(Graph * graph,char * filename,ReadSet * reads)840 void displayGeneralStatistics(Graph * graph, char *filename, ReadSet * reads)
841 {
842 IDnum nodeIndex;
843 Node *node;
844 Category cat;
845 FILE *outfile;
846
847 outfile = fopen(filename, "w");
848 if (outfile == NULL) {
849 printf("Couldn't open file %s, sorry\n", filename);
850 return;
851 } else
852 printf("Writing into stats file %s...\n", filename);
853
854 fprintf(outfile, "ID\tlgth\tout\tin\tlong_cov");
855
856 for (cat = 0; cat < CATEGORIES; cat++) {
857 fprintf(outfile, "\tshort%i_cov", (int) (cat + 1));
858 fprintf(outfile, "\tshort%i_Ocov", (int) (cat + 1));
859 }
860
861 fprintf(outfile, "\tlong_nb");
862 for (cat = 0; cat < CATEGORIES; cat++) {
863 fprintf(outfile, "\tshort%i_nb", (int) (cat + 1));
864 }
865
866 fprintf(outfile, "\n");
867
868 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
869 node = getNodeInGraph(graph, nodeIndex);
870 if (node == NULL)
871 continue;
872 fprintf
873 (outfile, "%ld\t%lld\t%i\t%i",
874 (long) getNodeID(node), (long long) getNodeLength(node), arcCount(node),
875 arcCount(getTwinNode(node)));
876
877 if (getNodeLength(node) > 0) {
878 fprintf(outfile, "\t%f",
879 readCoverage(node) /
880 (double) getNodeLength(node));
881 for (cat = 0; cat < CATEGORIES; cat++) {
882 fprintf(outfile, "\t%f",
883 getVirtualCoverage(node,
884 cat) /
885 (double) getNodeLength(node));
886 fprintf(outfile, "\t%f",
887 getOriginalVirtualCoverage(node,
888 cat) /
889 (double) getNodeLength(node));
890 }
891 } else {
892 fprintf(outfile, "\tInf");
893 for (cat = 0; cat < CATEGORIES; cat++)
894 fprintf(outfile, "\tInf\tInf");
895 }
896
897 fprintf(outfile, "\t%li", (long) markerCount(node));
898 printShortCounts(outfile, node, graph, reads);
899
900 fprintf(outfile, "\n");
901 }
902
903 fclose(outfile);
904 }
905
destroyStrainSpecificIslands(Graph * graph,IDnum firstStrain)906 void destroyStrainSpecificIslands(Graph * graph, IDnum firstStrain)
907 {
908 IDnum index;
909 Arc *arc;
910 boolean isModified = true;
911 Node *node;
912 IDnum counter = 0;
913
914 resetNodeStatus(graph);
915
916 puts("Destroying disconnected strain specific sub-graphs");
917
918 // Mark all genomic nodes
919 for (index = 1; index <= nodeCount(graph); index++) {
920 node = getNodeInGraph(graph, index);
921 if (!isOnlyStrain(node, firstStrain))
922 setNodeStatus(node, true);
923 }
924
925 // Mark nodes connected to genomic nodes
926 while (isModified) {
927 isModified = false;
928
929 for (index = 1; index <= nodeCount(graph); index++) {
930 node = getNodeInGraph(graph, index);
931
932 if (getNodeStatus(node))
933 continue;
934
935 for (arc = getArc(node); arc != NULL;
936 arc = getNextArc(arc)) {
937 if (getNodeStatus(getDestination(arc))) {
938 isModified = true;
939 setNodeStatus(node, true);
940 }
941 }
942
943 for (arc = getArc(getTwinNode(node)); arc != NULL;
944 arc = getNextArc(arc)) {
945 if (getNodeStatus(getDestination(arc))) {
946 isModified = true;
947 setNodeStatus(node, true);
948 }
949 }
950 }
951 }
952
953 // Remove all unmarked nodes
954 for (index = 1; index <= nodeCount(graph); index++) {
955 node = getNodeInGraph(graph, index);
956 if (!getNodeStatus(node)) {
957 destroyNode(node, graph);
958 counter++;
959 }
960 }
961
962 // Renumber graph nodes
963 printf("Removed %d nodes \n", counter);
964 renumberNodes(graph);
965 }
966
displayStrainOnlySequences(Graph * graph,IDnum firstStrain,char * inputFilename,char * filename,int WORDLENGTH)967 void displayStrainOnlySequences(Graph * graph, IDnum firstStrain,
968 char *inputFilename, char *filename,
969 int WORDLENGTH)
970 {
971 IDnum nodeIndex;
972 Node *node;
973 FILE *outfile = fopen(filename, "w");
974 Coordinate start, finish;
975 char str[100];
976 TightString *tString;
977 IDnum readID;
978 Coordinate readCoord;
979
980 if (outfile == NULL) {
981 printf("Could not write into %s, sorry\n", filename);
982 return;
983 }
984
985 destroyStrainSpecificIslands(graph, firstStrain);
986
987 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
988 node = getNodeInGraph(graph, nodeIndex);
989 if (isOnlyStrain(node, firstStrain)
990 && getNodeLength(node) > 500) {
991 tString = expandNode(node, WORDLENGTH);
992 readID =
993 getPassageMarkerSequenceID(getMarker(node));
994 readCoord = getPassageMarkerStart(getMarker(node));
995 fprintf(outfile, "> UNIQUE SEQUENCE %ld; %lld\n",
996 (long) readID, (long long) readCoord);
997
998 start = 0;
999 while (start <= getLength(tString)) {
1000 finish = start + 60;
1001 readTightStringFragment(tString, start,
1002 finish, str);
1003 fprintf(outfile, "%s\n", str);
1004 start = finish;
1005 }
1006 }
1007 }
1008
1009 fclose(outfile);
1010 }
1011
displayStrainOnlyDescriptors(Graph * graph,IDnum firstStrain)1012 void displayStrainOnlyDescriptors(Graph * graph, IDnum firstStrain)
1013 {
1014 IDnum nodeIndex;
1015 Node *node;
1016 char *str;
1017
1018 destroyStrainSpecificIslands(graph, firstStrain);
1019
1020 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
1021 printf("node %d from %d\n", nodeIndex, nodeCount(graph));
1022 node = getNodeInGraph(graph, nodeIndex);
1023
1024 if (isOnlyStrain(node, firstStrain)) {
1025 str = readNode(node);
1026 printf("> UNIQUE SEQUENCE %s\n", str);
1027 free(str);
1028 }
1029 }
1030 }
1031
displayLocalBreakpoint(PassageMarker * strainMarker,IDnum firstStrain,PassageMarker * genomeMarker,Node ** genomeDestination,Node ** strainDestination,IDnum * counter,IDnum nodeCount)1032 void displayLocalBreakpoint(PassageMarker * strainMarker,
1033 IDnum firstStrain,
1034 PassageMarker * genomeMarker,
1035 Node ** genomeDestination,
1036 Node ** strainDestination, IDnum * counter,
1037 IDnum nodeCount)
1038 {
1039 boolean isTranslocation;
1040 PassageMarker *marker;
1041 Node *destination, *destinationA;
1042 Node *destination2, *destination2A;
1043 Node *node1, *node2;
1044 IDnum localID = getNodeID(getNode(strainMarker));
1045
1046 // Eliminate genomic markers
1047 if (strainMarker == genomeMarker)
1048 return;
1049
1050 destinationA = getNode(getNextInSequence(strainMarker));
1051
1052 if (destinationA == NULL)
1053 return;
1054
1055 // Eliminate those that follow some local strain
1056 if (isDestinationToMarker(genomeMarker, destinationA)) {
1057 // puts("Parallel paths");
1058 return;
1059 }
1060
1061 destination2A = getNode(getNextInSequence(genomeMarker));
1062
1063 if (destination2A == NULL)
1064 return;
1065
1066 printf("Lengths %lld %lld\n", (long long) getNodeLength(destinationA),
1067 (long long) getNodeLength(destination2A));
1068
1069 // Hop to another genomic node
1070 // if (getNodeLength(destinationA) > 24) {
1071 //printf("wrong length %d %d\n", getNodeLength(destination) , getNodeID(destination));
1072 // return;
1073 // }
1074
1075 destination =
1076 getNode(getNextInSequence(getNextInSequence(strainMarker)));
1077
1078 if (destination == NULL)
1079 return;
1080
1081 // Eliminate those that point to uniquely strain sequences
1082 if (nodeGenomicMultiplicity(destination, firstStrain) != 1) {
1083 // puts("Multiple genome reads");
1084 return;
1085 }
1086 // Hop to another genomic node
1087 // if (getNodeLength(destination2A) != 24) {
1088 //puts("wrong length 2");
1089 // return;
1090 // }
1091
1092 destination2 =
1093 getNode(getNextInSequence(getNextInSequence(genomeMarker)));
1094
1095 if (destination2 == NULL)
1096 return;
1097
1098
1099 if (destination == destination2)
1100 return;
1101
1102 // Eliminate those that point to uniquely strain sequences
1103 if (isOnlyGenome(destination2, firstStrain))
1104 return;
1105
1106 setSingleNodeStatus(getNode(strainMarker), true);
1107 strainDestination[localID + nodeCount] = destination;
1108 genomeDestination[localID + nodeCount] = destination2;
1109
1110 // printf("Assigning %p and %p to %d\n", destination, destination2, localID);
1111 printf("lengths %lld\t%lld\n", (long long) getNodeLength(destinationA),
1112 (long long) getNodeLength(destination2A));
1113
1114 // Detect translocation
1115 isTranslocation = true;
1116 for (marker = getMarker(destination); marker != NULL;
1117 marker = getNextInNode(marker))
1118 if (getAbsolutePassMarkerSeqID(marker) ==
1119 getAbsolutePassMarkerSeqID(genomeMarker)) {
1120 isTranslocation = false;
1121 break;
1122 }
1123
1124 if (isTranslocation) {
1125 printf("BREAK TRANS\t%ld\t%lld\t%lld\t%lld\n",
1126 (long) getAbsolutePassMarkerSeqID(genomeMarker),
1127 (long long) getPassageMarkerStart(genomeMarker),
1128 (long long) getNodeLength(destinationA),
1129 (long long) getNodeLength(destination2A));
1130 counter[2]++;
1131 return;
1132 }
1133 // Detect breakpoint
1134 printf("BREAK INTRA\t%ld\t%lld\t%lld\t%lld\n",
1135 (long) getAbsolutePassMarkerSeqID(genomeMarker),
1136 (long long) getPassageMarkerStart(genomeMarker),
1137 (long long) getNodeLength(destinationA), (long long) getNodeLength(destination2A));
1138 counter[1]++;
1139
1140 // Check for inversion
1141 if (getPassageMarkerSequenceID(marker) !=
1142 -getPassageMarkerSequenceID(genomeMarker))
1143 return;
1144
1145 // puts("potential!!");
1146
1147 node1 = getTwinNode(destination);
1148
1149 if (getNodeStatus(node1)) {
1150 node2 =
1151 getTwinNode(genomeDestination
1152 [getNodeID(node1) + nodeCount]);
1153 if (getNodeStatus(node2))
1154 if (strainDestination[getNodeID(node2) + nodeCount]
1155 == destination2) {
1156 // puts("Safe");
1157 counter[1] -= 4;
1158 counter[0]++;
1159 } else;
1160 // puts("stopped 3");
1161 else;
1162 // puts("stopped 2");
1163 } else;
1164 // puts("stopped 1");
1165 }
1166
displayBreakpoints(Graph * graph,IDnum firstStrain)1167 void displayBreakpoints(Graph * graph, IDnum firstStrain)
1168 {
1169 IDnum nodeIndex;
1170 Node *node;
1171 PassageMarker *strainMarker, *genomeMarker;
1172 Node **genomeDestination =
1173 callocOrExit(2 * nodeCount(graph) + 1, Node *);
1174 Node **strainDestination =
1175 callocOrExit(2 * nodeCount(graph) + 1, Node *);
1176 IDnum counters[3];
1177
1178 counters[0] = 0;
1179 counters[1] = 0;
1180 counters[2] = 0;
1181
1182 resetNodeStatus(graph);
1183
1184 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
1185 node = getNodeInGraph(graph, nodeIndex);
1186
1187 if (arcCount(node) <= 1
1188 && arcCount(getTwinNode(node)) <= 1) {
1189 continue;
1190 }
1191
1192 if (nodeGenomicMultiplicity(node, firstStrain) != 1) {
1193 continue;
1194 }
1195
1196 if (isOnlyGenome(node, firstStrain)) {
1197 continue;
1198 }
1199
1200 for (genomeMarker = getMarker(node); genomeMarker != NULL;
1201 genomeMarker = getNextInNode(genomeMarker))
1202 if (getAbsolutePassMarkerSeqID(genomeMarker) <
1203 firstStrain)
1204 break;
1205
1206 // Go through all strain passage marker
1207 for (strainMarker = getMarker(node); strainMarker != NULL;
1208 strainMarker = getNextInNode(strainMarker)) {
1209 displayLocalBreakpoint(strainMarker, firstStrain,
1210 genomeMarker,
1211 genomeDestination,
1212 strainDestination, counters,
1213 nodeCount(graph));
1214 displayLocalBreakpoint(getTwinMarker(strainMarker),
1215 firstStrain,
1216 getTwinMarker(genomeMarker),
1217 genomeDestination,
1218 strainDestination, counters,
1219 nodeCount(graph));
1220 }
1221 }
1222
1223
1224 printf("%d\t%d\t%d\n", counters[0], counters[1], counters[2]);
1225 free(strainDestination);
1226 free(genomeDestination);
1227 }
1228
genomeMarker(Node * node,IDnum firstStrain)1229 PassageMarker *genomeMarker(Node * node, IDnum firstStrain)
1230 {
1231 PassageMarker *marker;
1232
1233 if (genomeMarkerCount(node, firstStrain) != 1)
1234 return NULL;
1235
1236 for (marker = getMarker(node); marker != NULL;
1237 marker = getNextInNode(marker))
1238 if (getAbsolutePassMarkerSeqID(marker) < firstStrain)
1239 return marker;
1240
1241 return NULL;
1242 }
1243
exportArcSequence(Arc * arc,FILE * outfile,int WORDLENGTH,TightString ** sequences)1244 void exportArcSequence(Arc * arc, FILE * outfile, int WORDLENGTH,
1245 TightString ** sequences)
1246 {
1247 char *str;
1248 TightString *output =
1249 newTightString(getNodeLength(getOrigin(arc)) +
1250 getNodeLength(getDestination(arc)));
1251 appendNodeSequence(getOrigin(arc), output, 0);
1252 appendNodeSequence(getDestination(arc), output,
1253 getNodeLength(getOrigin(arc)));
1254 str = readTightString(output);
1255 fprintf(outfile, "> ARC from NODE %d", getNodeID(getOrigin(arc)));
1256 fprintf(outfile, "%s\n", str);
1257 destroyTightString(output);
1258 free(str);
1259 }
1260
1261 // Produce sequences necessary to recreate graph elsewhere...
projectGraphToFile(Graph * graph,char * filename,int WORDLENGTH,TightString ** sequences)1262 void projectGraphToFile(Graph * graph, char *filename, int WORDLENGTH,
1263 TightString ** sequences)
1264 {
1265 FILE *outfile = fopen(filename, "w");
1266 IDnum index;
1267 Node *currentNode;
1268 Arc *arc;
1269
1270 if (outfile == NULL) {
1271 printf("Could not open %s, sorry\n", filename);
1272 return;
1273 }
1274
1275 for (index = 1; index < nodeCount(graph); index++) {
1276 currentNode = getNodeInGraph(graph, index);
1277 for (arc = getArc(currentNode); arc != NULL;
1278 arc = getNextArc(arc))
1279 exportArcSequence(arc, outfile, WORDLENGTH,
1280 sequences);
1281
1282 for (arc = getArc(getTwinNode(currentNode)); arc != NULL;
1283 arc = getNextArc(arc))
1284 exportArcSequence(arc, outfile, WORDLENGTH,
1285 sequences);
1286 }
1287
1288 fclose(outfile);
1289 }
1290
removeReferenceMarkers(Graph * graph,IDnum firstStrain)1291 void removeReferenceMarkers(Graph * graph, IDnum firstStrain)
1292 {
1293 IDnum ID;
1294 Node *node;
1295 PassageMarker *marker, *oldMarker;
1296
1297 for (ID = 1; ID <= nodeCount(graph); ID++) {
1298 node = getNodeInGraph(graph, ID);
1299 marker = getMarker(node);
1300 while (marker != NULL) {
1301 if (getAbsolutePassMarkerSeqID(marker) <
1302 firstStrain) {
1303 if (!isInitial(marker))
1304 changeMultiplicity
1305 (getArcBetweenNodes
1306 (getNode
1307 (getPreviousInSequence
1308 (marker)), node, graph),
1309 -1);
1310 if (!isTerminal(marker))
1311 changeMultiplicity
1312 (getArcBetweenNodes
1313 (node,
1314 getNode(getNextInSequence
1315 (marker)), graph),
1316 -1);
1317 oldMarker = marker;
1318 marker = getNextInNode(marker);
1319 destroyPassageMarker(oldMarker);
1320 } else
1321 marker = getNextInNode(marker);
1322 }
1323
1324 if (getMarker(node) == NULL)
1325 destroyNode(node, graph);
1326 }
1327
1328 concatenateGraph(graph);
1329 }
1330
exportLongNodeSequences(char * filename,Graph * graph,Coordinate minLength)1331 void exportLongNodeSequences(char *filename, Graph * graph,
1332 Coordinate minLength)
1333 {
1334 FILE *outfile = fopen(filename, "w");
1335 IDnum nodeIndex;
1336 TightString *tString;
1337 Coordinate position;
1338 char nucleotide;
1339 Node *node;
1340 int WORDLENGTH = getWordLength(graph);
1341 GapMarker *gap;
1342 //double sensitivity, specificity;
1343
1344 if (outfile == NULL) {
1345 printf("Could not write into %s, sorry\n", filename);
1346 return;
1347 } else {
1348 printf("Writing contigs into %s...\n", filename);
1349 }
1350
1351 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
1352 node = getNodeInGraph(graph, nodeIndex);
1353
1354 if (node == NULL || getNodeLength(node) < minLength)
1355 continue;
1356
1357 tString = expandNode(node, WORDLENGTH);
1358 fprintf(outfile, ">NODE_%ld_length_%lld_cov_%f\n",
1359 (long) nodeIndex, (long long) getNodeLength(node),
1360 (getVirtualCoverage(node, 0)
1361 + getVirtualCoverage(node, 1)
1362 + readCoverage(node)) /
1363 (float) getNodeLength(node));
1364
1365 gap = getGap(node, graph);
1366 for (position = 0; position < WORDLENGTH; position++) {
1367 if (gap && position >= getGapFinish(gap))
1368 gap = getNextGap(gap);
1369
1370 if (gap == NULL || position < getGapStart(gap)) {
1371 nucleotide =
1372 getNucleotideChar(position, tString);
1373 fprintf(outfile, "%c", nucleotide);
1374 } else
1375 fprintf(outfile, "N");
1376 }
1377
1378 gap = getGap(node, graph);
1379 for (; position < getLength(tString); position++) {
1380 if (position % 60 == 0)
1381 fprintf(outfile, "\n");
1382
1383 if (gap
1384 && position - WORDLENGTH + 1 >=
1385 getGapFinish(gap))
1386 gap = getNextGap(gap);
1387
1388 if (gap == NULL
1389 || position - WORDLENGTH + 1 <
1390 getGapStart(gap)) {
1391 nucleotide =
1392 getNucleotideChar(position, tString);
1393 fprintf(outfile, "%c", nucleotide);
1394 } else
1395 fprintf(outfile, "N");
1396 }
1397 fprintf(outfile, "\n");
1398
1399 destroyTightString(tString);
1400 }
1401
1402 fclose(outfile);
1403 }
1404
1405 /*
1406 void exportMediumNodeSequences(char* filename, Graph * graph, Coordinate minLength)
1407 {
1408 IDnum dummy;
1409 ReadSet *readSet = readFastAFile(sequenceFile);
1410 char **reads = readSet->sequences;
1411 TightString **sequences =
1412 newTightStringArrayFromStringArray(reads, dummy);
1413 FILE *outfile = fopen(filename, "w");
1414 char str[100];
1415 IDnum nodeIndex;
1416 TightString *tString;
1417 Coordinate start, finish;
1418 Node *node;
1419 double sensitivity, specificity;
1420
1421 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
1422 node = getNodeInGraph(graph, nodeIndex);
1423 if (getNodeLength(node) < minLength
1424 || getNodeLength(node) >= maxLength)
1425 continue;
1426
1427 tString = expandNode(node);
1428 compareSequences(tString, sequences[0], &sensitivity,
1429 &specificity, WORDLENGTH);
1430
1431 fprintf(outfile,
1432 "> MEDIUM NODE %d, Sensitivity = %f, Specificity = %f\n",
1433 nodeIndex, sensitivity, specificity);
1434 printf
1435 ("> MEDIUM NODE %d, Sensitivity = %f, Specificity = %f\n",
1436 nodeIndex, sensitivity, specificity);
1437
1438 start = 0;
1439 while (start <= getLength(tString)) {
1440 finish = start + 60;
1441 readTightStringFragment(tString, start,
1442 finish, str);
1443 fprintf(outfile, "%s\n", str);
1444 start = finish;
1445 }
1446
1447 destroyTightString(tString);
1448 }
1449 }
1450 */
1451
maxLength(Graph * graph)1452 Coordinate maxLength(Graph * graph)
1453 {
1454 IDnum index;
1455 Node *node;
1456 Coordinate max = 0;
1457
1458 for (index = 1; index <= nodeCount(graph); index++) {
1459 node = getNodeInGraph(graph, index);
1460 if (node != NULL && getNodeLength(node) > max)
1461 max = getNodeLength(node);
1462 }
1463
1464 return max;
1465 }
1466
n50(Graph * graph)1467 Coordinate n50(Graph * graph)
1468 {
1469 FibHeap *heap = newFibHeap();
1470 IDnum index;
1471 Coordinate totalLength = 0;
1472 Coordinate sumLength = 0;
1473 Node *node;
1474
1475 if (nodeCount(graph) == 0) {
1476 puts("EMPTY GRAPH");
1477 return 0;
1478 }
1479
1480 for (index = 1; index <= nodeCount(graph); index++) {
1481 node = getNodeInGraph(graph, index);
1482 if (node == NULL)
1483 continue;
1484 insertNodeIntoHeap(heap, getNodeLength(node), node);
1485 totalLength += getNodeLength(node);
1486 }
1487 totalLength /= 2;
1488
1489 node = removeNextNodeFromHeap(heap);
1490 while (node != NULL) {
1491 sumLength += getNodeLength(node);
1492 if (sumLength >= totalLength)
1493 break;
1494 node = removeNextNodeFromHeap(heap);
1495 }
1496
1497 destroyHeap(heap);
1498 return getNodeLength(node);
1499 }
1500
getTotalCoverage(Node * node)1501 static Coordinate getTotalCoverage(Node * node)
1502 {
1503 Category cat;
1504 Coordinate coverage = 0;
1505
1506 for (cat = 0; cat < CATEGORIES; cat++)
1507 coverage += getVirtualCoverage(node, cat);
1508
1509 return coverage;
1510 }
1511
compareNodeCovs(const void * A,const void * B)1512 int compareNodeCovs(const void * A, const void * B) {
1513 Node * nodeA = *((Node **) A);
1514 Node * nodeB = *((Node **) B);
1515 double covA;
1516 double covB;
1517
1518 if (getNodeLength(nodeA) == 0)
1519 nodeA = NULL;
1520
1521 if (getNodeLength(nodeB) == 0)
1522 nodeB = NULL;
1523
1524 // Null nodes considered to have infinite coverage
1525 if (nodeA == NULL && nodeB == NULL)
1526 return 0;
1527 if (nodeA == NULL)
1528 return 1;
1529 if (nodeB == NULL)
1530 return -1;
1531
1532 // Deal with real coverage numbers:
1533 covA = getTotalCoverage(nodeA) / (double) getNodeLength(nodeA);
1534 covB = getTotalCoverage(nodeB) / (double) getNodeLength(nodeB);
1535
1536 if (covA > covB)
1537 return 1;
1538 if (covA == covB)
1539 return 0;
1540 return -1;
1541 }
1542
estimated_cov(Graph * graph,char * directory)1543 double estimated_cov(Graph * graph, char * directory)
1544 {
1545 Node ** nodeArray = callocOrExit(nodeCount(graph), Node*);
1546 IDnum index;
1547 Coordinate halfTotalLength = 0;
1548 Coordinate sumLength = 0;
1549 Node *node;
1550 char *logFilename =
1551 mallocOrExit(strlen(directory) + 100, char);
1552 char *statsLine =
1553 mallocOrExit(5000, char);
1554 FILE *logFile;
1555
1556 strcpy(logFilename, directory);
1557 strcat(logFilename, "/Log");
1558 logFile = fopen(logFilename, "a");
1559
1560 if (logFile == NULL)
1561 exitErrorf(EXIT_FAILURE, true, "Could not write to %s",
1562 logFilename);
1563
1564 puts("Measuring median coverage depth...");
1565
1566 if (nodeCount(graph) == 0) {
1567 puts("EMPTY GRAPH");
1568 return 0;
1569 }
1570
1571 // Write nodes into array and compute total assembly length
1572 for (index = 1; index <= nodeCount(graph); index++) {
1573 node = getNodeInGraph(graph, index);
1574 nodeArray[index - 1] = node;
1575 if (node == NULL)
1576 continue;
1577 halfTotalLength += getNodeLength(node);
1578 }
1579 halfTotalLength /= 2;
1580
1581 // Sort nodes
1582 qsort(nodeArray, nodeCount(graph), sizeof(Node *), compareNodeCovs);
1583
1584 // Compute the length weighted median node coverage
1585 for (index = 0; index < nodeCount(graph); index++) {
1586 node = nodeArray[index];
1587 sumLength += getNodeLength(node);
1588 if (sumLength >= halfTotalLength) {
1589 printf("Median coverage depth = %f\n", getTotalCoverage(node) / (double) getNodeLength(node));
1590 fprintf(logFile, "Median coverage depth = %f\n", getTotalCoverage(node) / (double) getNodeLength(node));
1591 free(nodeArray);
1592 fclose(logFile);
1593 free(logFilename);
1594 free(statsLine);
1595 return getTotalCoverage(node) / (double) getNodeLength(node);
1596 }
1597 }
1598
1599 // In case something went wrong...
1600 free(nodeArray);
1601 fclose(logFile);
1602 free(logFilename);
1603 free(statsLine);
1604
1605 return -1;
1606 }
1607
destroyMixedNode(Node * node)1608 static void destroyMixedNode(Node * node)
1609 {
1610 PassageMarker *marker = getMarker(node);
1611 PassageMarker *current;
1612
1613 while (marker != NULL) {
1614 while (!isInitial(marker))
1615 marker = getPreviousInSequence(marker);
1616
1617 while (marker != NULL) {
1618 current = marker;
1619 marker = getNextInSequence(marker);
1620 destroyPassageMarker(current);
1621 }
1622
1623 marker = getMarker(node);
1624 }
1625 }
1626
destroyMixedReads(Graph * graph,IDnum minCoverage)1627 void destroyMixedReads(Graph * graph, IDnum minCoverage)
1628 {
1629 IDnum index;
1630 Node *node;
1631
1632 for (index = 1; index <= nodeCount(graph); index++) {
1633 node = getNodeInGraph(graph, index);
1634 if (node == NULL)
1635 continue;
1636
1637 if (markerCount(node) < minCoverage)
1638 destroyMixedNode(node);
1639 }
1640
1641 for (index = 1; index <= nodeCount(graph); index++) {
1642 node = getNodeInGraph(graph, index);
1643 if (node == NULL)
1644 continue;
1645
1646 if (getMarker(node) == NULL)
1647 destroyNode(node, graph);
1648 }
1649
1650 concatenateGraph(graph);
1651 }
1652
isConnectedRead(PassageMarker * marker)1653 static boolean isConnectedRead(PassageMarker * marker)
1654 {
1655 PassageMarker *current;
1656
1657 for (current = marker; getNodeStatus(getNode(current));
1658 current = getNextInSequence(current))
1659 if (isTerminal(current))
1660 return false;
1661
1662 for (current = getTwinMarker(marker);
1663 getNodeStatus(getNode(current));
1664 current = getNextInSequence(current))
1665 if (isTerminal(current))
1666 return false;
1667
1668 return true;
1669 }
1670
destroyWholeRead(PassageMarker * marker)1671 static void destroyWholeRead(PassageMarker * marker)
1672 {
1673 PassageMarker *current = marker;
1674 PassageMarker *next;
1675
1676 while (!isInitial(current))
1677 current = getPreviousInSequence(current);
1678
1679 for (; current != NULL; current = next) {
1680 next = getNextInSequence(current);
1681 destroyPassageMarker(current);
1682 }
1683 }
1684
cleanUpNode(Node * node,Graph * graph)1685 static void cleanUpNode(Node * node, Graph * graph)
1686 {
1687 Category cat;
1688 Node *twin = getTwinNode(node);
1689 PassageMarker *marker, *twinMarker;
1690
1691 for (cat = 0; cat < CATEGORIES; cat++)
1692 setVirtualCoverage(node, cat, 0);
1693
1694 while (getArc(node) != NULL)
1695 destroyArc(getArc(node), graph);
1696 while (getArc(twin) != NULL)
1697 destroyArc(getArc(twin), graph);
1698
1699 for (marker = getMarker(node); marker != NULL;
1700 marker = getNextInNode(marker)) {
1701 twinMarker = getTwinMarker(marker);
1702
1703 if (getNode(getNextInSequence(marker)) != twin
1704 || getPassageMarkerSequenceID(marker) > 0)
1705 createArc(node,
1706 getNode(getNextInSequence(marker)),
1707 graph);
1708 if (getNode(getNextInSequence(twinMarker)) != node
1709 || getPassageMarkerSequenceID(twinMarker) > 0)
1710 createArc(twin,
1711 getNode(getNextInSequence(twinMarker)),
1712 graph);
1713 }
1714 }
1715
destroySinglePoolNodes(Graph * graph)1716 void destroySinglePoolNodes(Graph * graph)
1717 {
1718 IDnum index;
1719 Node *node;
1720 PassageMarker *marker, *next;
1721
1722 puts("Destroying single pool nodes");
1723 resetNodeStatus(graph);
1724
1725 // Remove empty, single pool nodes, mark other single pool nodes
1726 for (index = 1; index <= nodeCount(graph); index++) {
1727 node = getNodeInGraph(graph, index);
1728 if (node == NULL)
1729 continue;
1730
1731 if (getVirtualCoverage(node, 0) != 0
1732 && getVirtualCoverage(node, 1) != 0)
1733 continue;
1734
1735 if (getMarker(node) == NULL)
1736 destroyNode(node, graph);
1737 else
1738 setNodeStatus(node, true);
1739 }
1740
1741 // Remove disconnected reads
1742 for (index = 1; index <= nodeCount(graph); index++) {
1743 node = getNodeInGraph(graph, index);
1744 if (node == NULL || !getNodeStatus(node))
1745 continue;
1746
1747 for (marker = getMarker(node); marker != NULL;
1748 marker = next) {
1749 if (isConnectedRead(marker))
1750 next = getNextInNode(marker);
1751 else {
1752 destroyWholeRead(marker);
1753 next = getMarker(node);
1754 }
1755 }
1756 }
1757
1758 // Remove empty, single pool nodes, review coverage of the others
1759 for (index = 1; index <= nodeCount(graph); index++) {
1760 node = getNodeInGraph(graph, index);
1761 if (node == NULL || !getNodeStatus(node))
1762 continue;
1763
1764 if (getMarker(node) == NULL)
1765 destroyNode(node, graph);
1766 else
1767 cleanUpNode(node, graph);
1768 }
1769
1770 puts("Done");
1771
1772 concatenateGraph(graph);
1773 }
1774
destroyShortTips(Graph * graph)1775 void destroyShortTips(Graph * graph)
1776 {
1777 IDnum index;
1778 Node *node;
1779 boolean modified = true;
1780
1781 puts("Removing short tips");
1782
1783 while (modified) {
1784 modified = false;
1785 for (index = 1; index <= nodeCount(graph); index++) {
1786 node = getNodeInGraph(graph, index);
1787 if (node == NULL)
1788 continue;
1789
1790 if (getArc(node) == NULL
1791 || getArc(getTwinNode(node)) == NULL) {
1792 if (getNodeLength(node) < 500) {
1793 modified = true;
1794 destroyNode(node, graph);
1795 }
1796 }
1797 }
1798 }
1799
1800 puts("Done");
1801
1802 concatenateGraph(graph);
1803 }
1804
connectDomain(Node * node)1805 static Coordinate connectDomain(Node * node)
1806 {
1807 Coordinate result = getNodeLength(node);
1808 Arc *arc;
1809
1810 if (getNodeStatus(node))
1811 return 0;
1812 setNodeStatus(node, true);
1813
1814 for (arc = getArc(node); arc != NULL; arc = getNextArc(arc))
1815 result += connectDomain(getDestination(arc));
1816 for (arc = getArc(getTwinNode(node)); arc != NULL;
1817 arc = getNextArc(arc))
1818 result += connectDomain(getDestination(arc));
1819
1820 return result;
1821
1822 }
1823
destroyConnectedDomain(Node * node,Graph * graph)1824 static void destroyConnectedDomain(Node * node, Graph * graph)
1825 {
1826 Arc *arc;
1827
1828 if (getNodeStatus(node))
1829 return;
1830 setNodeStatus(node, true);
1831
1832 for (arc = getArc(node); arc != NULL; arc = getNextArc(arc))
1833 destroyConnectedDomain(getDestination(arc), graph);
1834 for (arc = getArc(getTwinNode(node)); arc != NULL;
1835 arc = getNextArc(arc))
1836 destroyConnectedDomain(getDestination(arc), graph);
1837
1838 destroyNode(node, graph);
1839
1840 }
1841
destroyDisconnectedElements(Graph * graph)1842 void destroyDisconnectedElements(Graph * graph)
1843 {
1844 Node *node;
1845 IDnum index;
1846 Coordinate domainSize;
1847 FibHeap *heap = newFibHeap();
1848 Coordinate *domainSizes =
1849 callocOrExit(1 + nodeCount(graph), Coordinate);
1850
1851 resetNodeStatus(graph);
1852
1853 puts("Destroying disconnected domains");
1854
1855 for (index = 1; index <= nodeCount(graph); index++) {
1856 node = getNodeInGraph(graph, index);
1857 if (node == NULL || getNodeStatus(node))
1858 continue;
1859 domainSize = connectDomain(node);
1860 printf("CONNECT\t%lld\n", (long long) domainSize);
1861 insertNodeIntoHeap(heap, domainSize, node);
1862 domainSizes[index] = domainSize;
1863 }
1864
1865 resetNodeStatus(graph);
1866
1867 while (true) {
1868 node = removeNextNodeFromHeap(heap);
1869 if (node == NULL || domainSizes[getNodeID(node)] > 1200)
1870 break;
1871
1872 destroyConnectedDomain(node, graph);
1873 }
1874
1875
1876 destroyHeap(heap);
1877 free(domainSizes);
1878 puts("Done");
1879
1880 concatenateGraph(graph);
1881 }
1882
connectDomainNodeCount(Node * node)1883 static Coordinate connectDomainNodeCount(Node * node)
1884 {
1885 Coordinate result = 1;
1886 Arc *arc;
1887
1888 if (getNodeStatus(node))
1889 return 0;
1890 setNodeStatus(node, true);
1891
1892 for (arc = getArc(node); arc != NULL; arc = getNextArc(arc))
1893 result += connectDomain(getDestination(arc));
1894 for (arc = getArc(getTwinNode(node)); arc != NULL;
1895 arc = getNextArc(arc))
1896 result += connectDomain(getDestination(arc));
1897
1898 return result;
1899
1900 }
1901
measureTangleSizes(Graph * graph,Coordinate maxLength)1902 void measureTangleSizes(Graph * graph, Coordinate maxLength)
1903 {
1904 Node *node;
1905 IDnum index;
1906 Coordinate domainSize;
1907
1908 for (index = 1; index <= nodeCount(graph); index++) {
1909 node = getNodeInGraph(graph, index);
1910 if (node == NULL)
1911 continue;
1912 if (getNodeLength(node) >= maxLength)
1913 destroyNode(node, graph);
1914 }
1915
1916 renumberNodes(graph);
1917 resetNodeStatus(graph);
1918
1919 for (index = 1; index <= nodeCount(graph); index++) {
1920 node = getNodeInGraph(graph, index);
1921 if (node == NULL || getNodeStatus(node))
1922 continue;
1923 domainSize = connectDomainNodeCount(node);
1924 printf("CONNECT\t%lld\n", (long long) domainSize);
1925 }
1926
1927 puts("Done");
1928 }
1929
destroyEmptyNodes(Graph * graph)1930 void destroyEmptyNodes(Graph * graph)
1931 {
1932 IDnum index;
1933 Node *node;
1934
1935 for (index = 1; index <= nodeCount(graph); index++) {
1936 node = getNodeInGraph(graph, index);
1937 if (node == NULL)
1938 continue;
1939
1940 if (getMarker(node) == NULL)
1941 destroyNode(node, graph);
1942 }
1943
1944 concatenateGraph(graph);
1945 }
1946
getReadLength(PassageMarker * marker)1947 static Coordinate getReadLength(PassageMarker * marker)
1948 {
1949 PassageMarker *current;
1950 Coordinate totalLength = 0;
1951
1952 for (current = marker; current != NULL;
1953 current = getNextInSequence(current))
1954 totalLength += getPassageMarkerLength(current);
1955
1956 return totalLength;
1957 }
1958
destroyRead(PassageMarker * marker)1959 static void destroyRead(PassageMarker * marker)
1960 {
1961 PassageMarker *current, *next;
1962
1963 for (current = marker; current != NULL; current = next) {
1964 next = getNextInSequence(current);
1965 destroyPassageMarker(current);
1966 }
1967 }
1968
removeShortReads(Graph * graph)1969 void removeShortReads(Graph * graph)
1970 {
1971 IDnum index;
1972 Node *node;
1973 PassageMarker *marker, *next;
1974
1975 for (index = 1; index <= nodeCount(graph); index++) {
1976 node = getNodeInGraph(graph, index);
1977 if (node == NULL)
1978 continue;
1979
1980
1981 for (marker = getMarker(node); marker != NULL;
1982 marker = next) {
1983 if (getPassageMarkerSequenceID(marker) > 0
1984 && isInitial(marker)
1985 && getReadLength(marker) < 400) {
1986 destroyRead(marker);
1987 next = getMarker(node);
1988 } else if (getPassageMarkerSequenceID(marker) < 0
1989 && isTerminal(marker)
1990 && getReadLength(getTwinMarker(marker))
1991 < 400) {
1992 destroyRead(getTwinMarker(marker));
1993 next = getMarker(node);
1994 } else
1995 next = getNextInNode(marker);
1996
1997 }
1998 }
1999 }
2000
totalGraphLength(Graph * graph)2001 Coordinate totalGraphLength(Graph * graph)
2002 {
2003 IDnum index;
2004 Coordinate totalLength = 0;
2005
2006 for (index = 1; index <= nodeCount(graph); index++)
2007 totalLength += getNodeLength(getNodeInGraph(graph, index));
2008
2009 return totalLength;
2010 }
2011
destroySinglePoolNodesStrict(Graph * graph)2012 void destroySinglePoolNodesStrict(Graph * graph)
2013 {
2014 IDnum index;
2015 Node *node;
2016
2017 puts("Destroying single pool nodes");
2018 resetNodeStatus(graph);
2019
2020 // Remove empty, single pool nodes, mark other single pool nodes
2021 for (index = 1; index <= nodeCount(graph); index++) {
2022 node = getNodeInGraph(graph, index);
2023 if (node == NULL)
2024 continue;
2025
2026 if (getVirtualCoverage(node, 0) != 0
2027 && getVirtualCoverage(node, 1) != 0)
2028 continue;
2029
2030 destroyNode(node, graph);
2031 }
2032
2033 puts("Done");
2034
2035 concatenateGraph(graph);
2036 }
2037
contigStats(Node ** contigs,IDnum readCount)2038 void contigStats(Node ** contigs, IDnum readCount)
2039 {
2040 FibHeap *heap = newFibHeap();
2041 IDnum index;
2042 Coordinate totalLength = 0;
2043 Coordinate sumLength = 0;
2044 Node *node;
2045 Coordinate halfLength;
2046
2047 for (index = 0; index <= readCount; index++) {
2048 if (contigs[index] != NULL) {
2049 node = contigs[index];
2050 printf("CONTIG %lld\n", (long long) getNodeLength(node));
2051 insertNodeIntoHeap(heap, getNodeLength(node),
2052 node);
2053 totalLength += getNodeLength(node);
2054 }
2055 }
2056 halfLength = totalLength / 2;
2057
2058 node = removeNextNodeFromHeap(heap);
2059 while (node != NULL) {
2060 sumLength += getNodeLength(node);
2061 if (sumLength >= halfLength)
2062 break;
2063 node = removeNextNodeFromHeap(heap);
2064 }
2065
2066 destroyHeap(heap);
2067 printf("N50 %lld Total %lld\n", (long long) getNodeLength(node), (long long) totalLength);
2068 }
2069
exportContigs(Node ** contigs,ReadSet * reads,char * filename,int WORDLENGTH,int pairedReadsCount)2070 void exportContigs(Node ** contigs, ReadSet * reads, char *filename,
2071 int WORDLENGTH, int pairedReadsCount)
2072 {
2073 TightString **sequences =
2074 mallocOrExit(reads->readCount, TightString *);
2075 IDnum i;
2076
2077 for (i = 0; i < pairedReadsCount; i++) {
2078 if (contigs[i] == NULL)
2079 sequences[i] = NULL;
2080 else
2081 sequences[i] = expandNode(contigs[i], WORDLENGTH);
2082 }
2083
2084 exportSequenceArray(filename, sequences, reads->readCount);
2085 }
2086
removeLowCoverageNodesAndDenounceDubiousReads(Graph * graph,double minCov)2087 boolean *removeLowCoverageNodesAndDenounceDubiousReads(Graph * graph,
2088 double minCov)
2089 {
2090 IDnum index;
2091 Node *node;
2092 boolean denounceReads = readStartsAreActivated(graph);
2093 boolean *res = NULL;
2094 ShortReadMarker *nodeArray, *shortMarker;
2095 PassageMarker *marker;
2096 IDnum maxIndex;
2097 IDnum readID;
2098 IDnum index2;
2099 // Original
2100 double nodeDensity = 0.0;
2101 int countNodeTotal = 0, countNodeUnderMinCov = 0, countNodeOnLongRead = 0;
2102 int countNodeUMCbutOnLongRead = 0, countNodeSupportedByLongRead = 0;
2103 int countNodeEscapedByLongRead = 0;
2104 boolean escapeByLongRead = false;
2105 // Original
2106
2107 printf("Removing contigs with coverage < %f...\n", minCov);
2108
2109 if (denounceReads)
2110 res = callocOrExit(sequenceCount(graph), boolean);
2111
2112 // Original
2113 countNodeTotal = nodeCount(graph);
2114 // Original
2115
2116 for (index = 1; index <= nodeCount(graph); index++) {
2117 node = getNodeInGraph(graph, index);
2118
2119 if (getNodeLength(node) == 0)
2120 continue;
2121
2122 // Original
2123 if ((marker = getMarker(node))){
2124 countNodeOnLongRead++;
2125 }
2126 // Original
2127
2128 if ((double)getTotalCoverage(node) / getNodeLength(node) < minCov) {
2129 // Original
2130 countNodeUnderMinCov++;
2131 if ((marker = getMarker(node))) {
2132 nodeDensity = (double)getTotalCoverage(node) / getNodeLength(node);
2133 // printf("[Under]\tnodeID : %d\tnodeLen : %d\tnodeCov : %f\n",
2134 // getNodeID(node), getNodeLength(node), nodeDensity);
2135 countNodeUMCbutOnLongRead++;
2136
2137 if (markerCount(node) >= 1) {
2138 countNodeSupportedByLongRead++;
2139 }
2140 if (markerCount(node) >= 1 && !(nodeDensity == 0.0)) {
2141 escapeByLongRead = true;
2142 countNodeEscapedByLongRead++;
2143 } else {
2144 escapeByLongRead = false;
2145 }
2146 }
2147 // Original
2148
2149
2150 // Original
2151 //if (! escapeByLongRead) {
2152 // Original
2153
2154 if (denounceReads) {
2155 nodeArray = getNodeReads(node, graph);
2156 maxIndex = getNodeReadCount(node, graph);
2157 for (index2 = 0; index2 < maxIndex; index2++) {
2158 shortMarker =
2159 getShortReadMarkerAtIndex(nodeArray,
2160 index2);
2161 readID = getShortReadMarkerID(shortMarker);
2162 //printf("Dubious %d\n", readID);
2163 if (readID > 0)
2164 res[readID - 1] = true;
2165 else
2166 res[-readID - 1] = true;
2167 }
2168
2169 nodeArray = getNodeReads(getTwinNode(node), graph);
2170 maxIndex =
2171 getNodeReadCount(getTwinNode(node), graph);
2172 for (index2 = 0; index2 < maxIndex; index2++) {
2173 shortMarker =
2174 getShortReadMarkerAtIndex(nodeArray,
2175 index2);
2176 readID = getShortReadMarkerID(shortMarker);
2177 //printf("Dubious %d\n", readID);
2178 if (readID > 0)
2179 res[readID - 1] = true;
2180 else
2181 res[-readID - 1] = true;
2182 }
2183 }
2184
2185 // Original
2186 //}
2187 // Original
2188
2189 while ((marker = getMarker(node))) {
2190 if (!isInitial(marker)
2191 && !isTerminal(marker))
2192 disconnectNextPassageMarker
2193 (getPreviousInSequence(marker),
2194 graph);
2195 destroyPassageMarker(marker);
2196 }
2197 destroyNode(node, graph);
2198
2199 }
2200 // Original
2201 // else {
2202 // nodeDensity = (double)getTotalCoverage(node) / getNodeLength(node);
2203 // printf("[Over]\tnodeID : %d\tnodeLen : %d\tnodeCov : %f\n",
2204 // getNodeID(node), getNodeLength(node), nodeDensity);
2205 //}
2206 // Original
2207 }
2208
2209 /*
2210 // Original
2211 printf("No. of Nodes : %d\n", countNodeTotal);
2212 printf("No. of Nodes Under MinCov : %d\n", countNodeUnderMinCov);
2213 printf("No. of Nodes On Long Reads : %d\n", countNodeOnLongRead);
2214 printf("No. of Nodes Under MinCov but On Long Reads : %d\n", countNodeOnLongRead);
2215 printf("No. of Nodes Under MinCov but Supported by Long Reads : %d\n",
2216 countNodeSupportedByLongRead);
2217 printf("No. of Nodes 0 < Cov < minCov but Escaped (Supported) by Long Reads : %d\n",
2218 countNodeEscapedByLongRead);
2219 // Original
2220 */
2221
2222 concatenateGraph(graph);
2223 return res;
2224 }
2225
removeLowCoverageNodes(Graph * graph,double minCov)2226 void removeLowCoverageNodes(Graph * graph, double minCov)
2227 {
2228 IDnum index;
2229 Node *node;
2230 PassageMarker *marker;
2231
2232 if (minCov < 0)
2233 return;
2234
2235 printf("Applying a coverage cutoff of %f...\n", minCov);
2236
2237 for (index = 1; index <= nodeCount(graph); index++) {
2238 node = getNodeInGraph(graph, index);
2239
2240 if (getNodeLength(node) > 0
2241 && getTotalCoverage(node) / getNodeLength(node) <
2242 minCov) {
2243 while ((marker = getMarker(node))) {
2244 if (!isInitial(marker)
2245 && !isTerminal(marker))
2246 disconnectNextPassageMarker
2247 (getPreviousInSequence(marker),
2248 graph);
2249 destroyPassageMarker(marker);
2250 }
2251 destroyNode(node, graph);
2252 }
2253 }
2254
2255 concatenateGraph(graph);
2256 }
2257
removeHighCoverageNodes(Graph * graph,double maxCov)2258 void removeHighCoverageNodes(Graph * graph, double maxCov)
2259 {
2260 IDnum index;
2261 Node *node;
2262 PassageMarker *marker;
2263
2264 if (maxCov < 0)
2265 return;
2266
2267 printf("Applying an upper coverage cutoff of %f...\n", maxCov);
2268
2269 for (index = 1; index <= nodeCount(graph); index++) {
2270 node = getNodeInGraph(graph, index);
2271
2272 if (getNodeLength(node) > 0
2273 && getTotalCoverage(node) / getNodeLength(node) >
2274 maxCov) {
2275 while ((marker = getMarker(node))) {
2276 if (!isInitial(marker)
2277 && !isTerminal(marker))
2278 disconnectNextPassageMarker
2279 (getPreviousInSequence(marker),
2280 graph);
2281 destroyPassageMarker(marker);
2282 }
2283 destroyNode(node, graph);
2284 }
2285 }
2286
2287 concatenateGraph(graph);
2288 }
2289
removeMissingStrain(Graph * graph,Category cat)2290 void removeMissingStrain(Graph * graph, Category cat)
2291 {
2292 IDnum ID;
2293 Node *node;
2294
2295 for (ID = 1; ID <= nodeCount(graph); ID++) {
2296 node = getNodeInGraph(graph, ID);
2297
2298 if (node == NULL)
2299 continue;
2300
2301 if (getVirtualCoverage(node, cat) == 0)
2302 destroyNode(node, graph);
2303 }
2304
2305 concatenateGraph(graph);
2306 }
2307
exportAMOSLib(FILE * outfile,Graph * graph,Category cat)2308 static void exportAMOSLib(FILE * outfile, Graph * graph, Category cat)
2309 {
2310 Coordinate distance = getInsertLength(graph, cat * 2);
2311 double variance = getInsertLength_var(graph, cat * 2);
2312
2313 if (distance == -1)
2314 return;
2315
2316 fprintf(outfile, "{LIB\n");
2317 fprintf(outfile, "iid:%d\n", (int) (cat + 1));
2318 fprintf(outfile, "{DST\n");
2319 fprintf(outfile, "mea:%lld\n", (long long) distance);
2320 fprintf(outfile, "std:%lld\n", (long long) sqrt(variance));
2321 fprintf(outfile, "}\n");
2322 fprintf(outfile, "}\n");
2323 }
2324
exportAMOSMarker(FILE * outfile,PassageMarker * marker,Coordinate nodeLength,Coordinate start,Coordinate finish,int wordShift)2325 static void exportAMOSMarker(FILE * outfile, PassageMarker * marker,
2326 Coordinate nodeLength, Coordinate start,
2327 Coordinate finish, int wordShift)
2328 {
2329 Coordinate sequenceStart, sequenceFinish;
2330
2331 if (getStartOffset(marker) >= finish
2332 || getFinishOffset(marker) > nodeLength - start)
2333 return;
2334
2335 sequenceStart = getPassageMarkerStart(marker);
2336 if (start > getStartOffset(marker)) {
2337 if (getPassageMarkerSequenceID(marker) > 0)
2338 sequenceStart += start - getStartOffset(marker);
2339 else
2340 sequenceStart -= start - getStartOffset(marker);
2341 }
2342
2343 sequenceFinish = getPassageMarkerFinish(marker);
2344 if (nodeLength - finish > getFinishOffset(marker)) {
2345 if (getPassageMarkerSequenceID(marker) > 0)
2346 sequenceFinish -=
2347 nodeLength - finish - getFinishOffset(marker);
2348 else
2349 sequenceFinish +=
2350 nodeLength - finish - getFinishOffset(marker);
2351 }
2352
2353 if (getPassageMarkerSequenceID(marker) > 0)
2354 sequenceFinish += wordShift;
2355 else
2356 sequenceStart += wordShift;
2357
2358 fprintf(outfile, "{TLE\n");
2359 fprintf(outfile, "src:%d\n", getAbsolutePassMarkerSeqID(marker));
2360 if (getStartOffset(marker) > start)
2361 fprintf(outfile, "off:%lld\n",
2362 (long long) (getStartOffset(marker) - start));
2363 else
2364 fprintf(outfile, "off:0\n");
2365 fprintf(outfile, "clr:%lld,%lld\n", (long long) sequenceStart, (long long) sequenceFinish);
2366 fprintf(outfile, "}\n");
2367 }
2368
exportAMOSShortMarker(FILE * outfile,ShortReadMarker * marker,ReadSet * reads,Coordinate start,Coordinate finish)2369 static void exportAMOSShortMarker(FILE * outfile, ShortReadMarker * marker,
2370 ReadSet * reads, Coordinate start,
2371 Coordinate finish)
2372 {
2373 Coordinate offset =
2374 getShortReadMarkerPosition(marker) -
2375 getShortReadMarkerOffset(marker);
2376 TightString *sequence =
2377 reads->tSequences[getShortReadMarkerID(marker) - 1];
2378
2379 if (getShortReadMarkerPosition(marker) == -1)
2380 return;
2381
2382 if (offset >= finish || offset + getLength(sequence) < start)
2383 return;
2384
2385 fprintf(outfile, "{TLE\n");
2386 fprintf(outfile, "src:%d\n", getShortReadMarkerID(marker));
2387 fprintf(outfile, "off:%lld\n", (long long) (offset - start));
2388 fprintf(outfile, "clr:0,%lld\n", (long long) getLength(sequence));
2389 fprintf(outfile, "}\n");
2390 }
2391
exportAMOSReverseShortMarker(FILE * outfile,ShortReadMarker * marker,Coordinate nodeLength,int wordShift,ReadSet * reads,Coordinate start,Coordinate finish)2392 static void exportAMOSReverseShortMarker(FILE * outfile,
2393 ShortReadMarker * marker,
2394 Coordinate nodeLength,
2395 int wordShift, ReadSet * reads,
2396 Coordinate start,
2397 Coordinate finish)
2398 {
2399 TightString *sequence =
2400 reads->tSequences[getShortReadMarkerID(marker) - 1];
2401
2402 Coordinate offset =
2403 nodeLength - getShortReadMarkerPosition(marker) +
2404 getShortReadMarkerOffset(marker) - getLength(sequence) +
2405 wordShift;
2406
2407 if (getShortReadMarkerPosition(marker) == -1)
2408 return;
2409
2410 if (offset >= finish || offset + getLength(sequence) < start)
2411 return;
2412
2413 fprintf(outfile, "{TLE\n");
2414 fprintf(outfile, "src:%d\n", getShortReadMarkerID(marker));
2415 fprintf(outfile, "off:%lld\n", (long long) (offset - start));
2416 fprintf(outfile, "clr:%lld,0\n", (long long) getLength(sequence));
2417 fprintf(outfile, "}\n");
2418 }
2419
exportAMOSContig(FILE * outfile,ReadSet * reads,Node * node,Graph * graph,Coordinate contigStart,Coordinate contigFinish,IDnum iid,IDnum internalIndex)2420 static void exportAMOSContig(FILE * outfile, ReadSet * reads, Node * node,
2421 Graph * graph, Coordinate contigStart,
2422 Coordinate contigFinish, IDnum iid,
2423 IDnum internalIndex)
2424 {
2425 Coordinate start;
2426 char str[100];
2427 PassageMarker *marker;
2428 ShortReadMarker *shortMarkerArray, *shortMarker;
2429 Coordinate index, maxIndex;
2430 int wordShift = getWordLength(graph) - 1;
2431 char *string = expandNodeFragment(node, contigStart, contigFinish,
2432 getWordLength(graph));
2433 Coordinate length = contigFinish - contigStart + wordShift;
2434
2435 fprintf(outfile, "{CTG\n");
2436 fprintf(outfile, "iid:%d\n", iid);
2437 fprintf(outfile, "eid:%d-%d\n", getNodeID(node), internalIndex);
2438
2439 fprintf(outfile, "seq:\n");
2440 for (start = 0; start <= length; start += 60) {
2441 strncpy(str, &(string[start]), 60);
2442 str[60] = '\0';
2443 fprintf(outfile, "%s\n", str);
2444 }
2445 fprintf(outfile, ".\n");
2446
2447 fprintf(outfile, "qlt:\n");
2448 for (start = 0; start <= length; start += 60) {
2449 strncpy(str, &(string[start]), 60);
2450 str[60] = '\0';
2451 fprintf(outfile, "%s\n", str);
2452 }
2453 fprintf(outfile, ".\n");
2454
2455 free(string);
2456
2457 for (marker = getMarker(node); marker != NULL;
2458 marker = getNextInNode(marker))
2459 exportAMOSMarker(outfile, marker, getNodeLength(node),
2460 contigStart, contigFinish, wordShift);
2461
2462 if (readStartsAreActivated(graph)) {
2463 shortMarkerArray = getNodeReads(node, graph);
2464 maxIndex = getNodeReadCount(node, graph);
2465 for (index = 0; index < maxIndex; index++) {
2466 shortMarker =
2467 getShortReadMarkerAtIndex(shortMarkerArray,
2468 index);
2469 exportAMOSShortMarker(outfile, shortMarker, reads,
2470 contigStart, contigFinish);
2471 }
2472
2473 shortMarkerArray = getNodeReads(getTwinNode(node), graph);
2474 maxIndex = getNodeReadCount(getTwinNode(node), graph);
2475 for (index = 0; index < maxIndex; index++) {
2476 shortMarker =
2477 getShortReadMarkerAtIndex(shortMarkerArray,
2478 index);
2479 exportAMOSReverseShortMarker(outfile, shortMarker,
2480 getNodeLength(node),
2481 wordShift, reads,
2482 contigStart,
2483 contigFinish);
2484 }
2485 }
2486
2487 fprintf(outfile, "}\n");
2488 }
2489
exportAMOSNode(FILE * outfile,ReadSet * reads,Node * node,Graph * graph)2490 static void exportAMOSNode(FILE * outfile, ReadSet * reads, Node * node,
2491 Graph * graph)
2492 {
2493 Coordinate start = 0;
2494 Coordinate finish;
2495 GapMarker *gap;
2496 IDnum smallIndex = 0;
2497 static IDnum iid = 1;
2498 IDnum contigIndex = iid;
2499 int wordShift = getWordLength(graph) - 1;
2500
2501 for (gap = getGap(node, graph); gap; gap = getNextGap(gap)) {
2502 finish = getGapStart(gap);
2503 exportAMOSContig(outfile, reads, node, graph, start,
2504 finish, iid++, smallIndex++);
2505 start = getGapFinish(gap);
2506 }
2507
2508 finish = getNodeLength(node);
2509 exportAMOSContig(outfile, reads, node, graph, start, finish, iid++,
2510 smallIndex);
2511
2512 if (!getGap(node, graph))
2513 return;
2514
2515 start = 0;
2516
2517 fprintf(outfile, "{SCF\n");
2518 fprintf(outfile, "eid:%d\n", getNodeID(node));
2519 for (gap = getGap(node, graph); gap; gap = getNextGap(gap)) {
2520 finish = getGapStart(gap);
2521 fprintf(outfile, "{TLE\n");
2522 fprintf(outfile, "off:%lld\n", (long long) start);
2523 fprintf(outfile, "clr:0,%lld\n",
2524 (long long) (finish - start + (long long) wordShift));
2525 fprintf(outfile, "src:%d\n", contigIndex++);
2526 fprintf(outfile, "}\n");
2527 start = getGapFinish(gap);
2528 }
2529 finish = getNodeLength(node);
2530 fprintf(outfile, "{TLE\n");
2531 fprintf(outfile, "off:%lld\n", (long long) start);
2532 fprintf(outfile, "clr:0,%lld\n", (long long) (finish - start));
2533 fprintf(outfile, "src:%d\n", contigIndex++);
2534 fprintf(outfile, "}\n");
2535
2536 fprintf(outfile, "}\n");
2537 }
2538
exportAMOSRead(FILE * outfile,TightString * tString,IDnum index,IDnum frg_index)2539 static void exportAMOSRead(FILE * outfile, TightString * tString,
2540 IDnum index, IDnum frg_index)
2541 {
2542 Coordinate start, finish;
2543 char str[100];
2544
2545 fprintf(outfile, "{RED\n");
2546 fprintf(outfile, "iid:%d\n", index);
2547 fprintf(outfile, "eid:%d\n", index);
2548 if (frg_index > 0)
2549 fprintf(outfile, "frg:%d\n", frg_index);
2550
2551 fprintf(outfile, "seq:\n");
2552 start = 0;
2553 while (start <= getLength(tString)) {
2554 finish = start + 60;
2555 readTightStringFragment(tString, start, finish, str);
2556 fprintf(outfile, "%s\n", str);
2557 start = finish;
2558 }
2559 fprintf(outfile, ".\n");
2560
2561 fprintf(outfile, "qlt:\n");
2562 start = 0;
2563 while (start <= getLength(tString)) {
2564 finish = start + 60;
2565 readTightStringFragment(tString, start, finish, str);
2566 fprintf(outfile, "%s\n", str);
2567 start = finish;
2568 }
2569 fprintf(outfile, ".\n");
2570
2571 fprintf(outfile, "}\n");
2572 }
2573
exportAMOSContigs(char * filename,Graph * graph,Coordinate cutoff_length,ReadSet * reads)2574 void exportAMOSContigs(char *filename, Graph * graph,
2575 Coordinate cutoff_length, ReadSet * reads)
2576 {
2577 IDnum index;
2578 Category cat;
2579 Node *node;
2580 FILE *outfile;
2581
2582 printf("Writing into AMOS file %s...\n", filename);
2583 outfile = fopen(filename, "w");
2584
2585 if (outfile == NULL)
2586 exitErrorf(EXIT_FAILURE, true, "Could not write to AMOS file %s",
2587 filename);
2588
2589 for (cat = 0; cat <= CATEGORIES; cat++)
2590 exportAMOSLib(outfile, graph, cat);
2591
2592 for (index = 1; index <= reads->readCount; index++) {
2593 if (reads->categories[index - 1] % 2 != 0 &&
2594 getInsertLength(graph,
2595 reads->categories[index - 1]) >= 0) {
2596 fprintf(outfile, "{FRG\n");
2597 fprintf(outfile, "lib:%d\n",
2598 (int) ((reads->categories[index - 1] / 2) + 1));
2599 fprintf(outfile, "rds:%d,%d\n", index,
2600 index + 1);
2601 fprintf(outfile, "eid:%d\n", index);
2602 fprintf(outfile, "iid:%d\n", index);
2603 fprintf(outfile, "typ:I\n");
2604 fprintf(outfile, "}\n");
2605 index++;
2606 }
2607 }
2608
2609 for (index = 1; index <= reads->readCount; index++) {
2610 if (reads->categories[index - 1] % 2 != 0 &&
2611 getInsertLength(graph,
2612 reads->categories[index - 1]) >= 0) {
2613 exportAMOSRead(outfile,
2614 reads->tSequences[index - 1], index,
2615 index);
2616 index++;
2617 exportAMOSRead(outfile,
2618 reads->tSequences[index - 1], index,
2619 index - 1);
2620 } else {
2621 exportAMOSRead(outfile,
2622 reads->tSequences[index - 1], index,
2623 -1);
2624 }
2625 }
2626
2627 for (index = 1; index <= nodeCount(graph); index++) {
2628 node = getNodeInGraph(graph, index);
2629
2630 if (node == NULL)
2631 continue;
2632
2633 if (getNodeLength(node) >= cutoff_length)
2634 exportAMOSNode(outfile, reads, node, graph);
2635 }
2636
2637 fclose(outfile);
2638
2639 }
2640
isNatural(Graph * graph)2641 boolean isNatural(Graph * graph)
2642 {
2643 Node *node;
2644 IDnum index;
2645
2646 for (index = 1; index < nodeCount(graph); index++) {
2647 node = getNodeInGraph(graph, index);
2648 if (node == NULL)
2649 continue;
2650
2651 if (getNodeLength(node) == 0)
2652 return false;
2653
2654 if (simpleArcCount(node) > 4)
2655 return false;
2656
2657 if (simpleArcCount(getTwinNode(node)) > 4)
2658 return false;
2659 }
2660
2661 return true;
2662 }
2663
followShortPath(Arc * arc)2664 static Node *followShortPath(Arc * arc)
2665 {
2666 Node *node = getDestination(arc);
2667 if (simpleArcCount(node) != 1)
2668 return NULL;
2669 node = getDestination(getArc(node));
2670 return getTwinNode(node);
2671
2672 }
2673
checkNodeForHallidayJunction(Node * node,Graph * graph)2674 static void checkNodeForHallidayJunction(Node * node, Graph * graph)
2675 {
2676 Node *nodeA, *nodeB, *nodeC, *nodeD;
2677 Arc *arc1, *arc2;
2678
2679 if (simpleArcCount(node) != 2)
2680 return;
2681
2682 arc1 = getArc(node);
2683 arc2 = getNextArc(arc1);
2684
2685 nodeA = followShortPath(arc1);
2686 if (nodeA == NULL || simpleArcCount(nodeA) != 2
2687 || !isUniqueBasic(nodeA))
2688 return;
2689
2690 nodeB = followShortPath(arc2);
2691 if (nodeB == NULL || simpleArcCount(nodeB) != 2
2692 || !isUniqueBasic(nodeB))
2693 return;
2694
2695 if (nodeA == nodeB) {
2696 return;
2697 }
2698
2699 arc1 = getArc(nodeA);
2700 arc2 = getNextArc(arc1);
2701 nodeC = followShortPath(arc1);
2702 if (nodeC == NULL)
2703 return;
2704 if (nodeC == node) {
2705 nodeC = followShortPath(arc2);
2706 if (nodeC == NULL || nodeC == node
2707 || simpleArcCount(nodeC) != 2
2708 || !isUniqueBasic(nodeC)) {
2709 printf("NO %d %d %d %d\n", getNodeID(node),
2710 getNodeID(nodeA), getNodeID(nodeB),
2711 getNodeID(nodeC));
2712 return;
2713 }
2714 } else {
2715 if (simpleArcCount(nodeC) != 2 || !isUniqueBasic(nodeC)) {
2716 puts("2");
2717 return;
2718 }
2719 nodeD = followShortPath(arc2);
2720 if (nodeD != node) {
2721 puts("3");
2722 return;
2723 }
2724 }
2725
2726 puts("A");
2727
2728 arc1 = getArc(nodeB);
2729 arc2 = getNextArc(arc1);
2730 nodeD = followShortPath(arc1);
2731 if (nodeD != nodeC && nodeD != node)
2732 return;
2733 nodeD = followShortPath(arc2);
2734 if (nodeD != nodeC && nodeD != node)
2735 return;
2736
2737 arc1 = getArc(nodeB);
2738 arc2 = getNextArc(arc1);
2739 nodeD = followShortPath(arc1);
2740 if (nodeD != nodeC && nodeD != node)
2741 return;
2742 nodeD = followShortPath(arc2);
2743 if (nodeD != nodeC && nodeD != node)
2744 return;
2745
2746 printf("JOHNNY HALLIDAY JUNCTION %d %d %d %d\n",
2747 getNodeID(node), getNodeID(nodeC), getNodeID(nodeA),
2748 getNodeID(nodeB));
2749 }
2750
searchForHallidayJunction(Graph * graph)2751 void searchForHallidayJunction(Graph * graph)
2752 {
2753 IDnum index;
2754 Node *node;
2755
2756 setBaseCoverage(8);
2757
2758 for (index = 1; index <= nodeCount(graph); index++) {
2759 node = getNodeInGraph(graph, index);
2760
2761 if (isUniqueBasic(node)) {
2762 checkNodeForHallidayJunction(node, graph);
2763 checkNodeForHallidayJunction(getTwinNode(node),
2764 graph);
2765 }
2766 }
2767 }
2768
totalAssemblyLength(Graph * graph)2769 Coordinate totalAssemblyLength(Graph * graph)
2770 {
2771 IDnum index;
2772 Node *node;
2773 Coordinate total = 0;
2774
2775 for (index = 1; index <= nodeCount(graph); index++) {
2776 node = getNodeInGraph(graph, index);
2777 if (node)
2778 total += getNodeLength(node);
2779 }
2780
2781 return total;
2782 }
2783
usedReads(Graph * graph,Coordinate minContigLength)2784 IDnum usedReads(Graph * graph, Coordinate minContigLength)
2785 {
2786 IDnum res = 0;
2787 boolean * used = callocOrExit(sequenceCount(graph) + 1, boolean);
2788 IDnum nodeID, readID;
2789 Node * node;
2790 PassageMarker * marker;
2791 ShortReadMarker * shortReadArray, * shortReadMarker;
2792 IDnum shortReadCount, shortReadIndex;
2793
2794 for(nodeID = 1; nodeID <= nodeCount(graph); nodeID++) {
2795 node = getNodeInGraph(graph, nodeID);
2796 if (node == NULL || getNodeLength(node) < minContigLength)
2797 continue;
2798
2799 // Long reads
2800 for(marker = getMarker(node); marker != NULL; marker = getNextInNode(marker)) {
2801 readID = getPassageMarkerSequenceID(marker);
2802 if (readID < 0)
2803 readID = -readID;
2804 used[readID] = true;
2805 }
2806
2807 // Short reads
2808 if (!readStartsAreActivated(graph))
2809 continue;
2810
2811 shortReadArray = getNodeReads(node, graph);
2812 shortReadCount = getNodeReadCount(node, graph);
2813 for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
2814 shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
2815 readID = getShortReadMarkerID(shortReadMarker);
2816 used[readID] = true;
2817 }
2818
2819 shortReadArray = getNodeReads(getTwinNode(node), graph);
2820 shortReadCount = getNodeReadCount(getTwinNode(node), graph);
2821 for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
2822 shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
2823 readID = getShortReadMarkerID(shortReadMarker);
2824 used[readID] = true;
2825 }
2826 }
2827
2828 for (readID = 1; readID <= sequenceCount(graph); readID++)
2829 if (used[readID])
2830 res++;
2831
2832 free(used);
2833
2834 return res;
2835 }
2836
logFinalStats(Graph * graph,Coordinate minContigKmerLength,char * directory)2837 void logFinalStats(Graph * graph, Coordinate minContigKmerLength, char *directory)
2838 {
2839 char *logFilename =
2840 mallocOrExit(strlen(directory) + 100, char);
2841 char *statsLine =
2842 mallocOrExit(5000, char);
2843 FILE *logFile;
2844
2845 strcpy(logFilename, directory);
2846 strcat(logFilename, "/Log");
2847 logFile = fopen(logFilename, "a");
2848
2849 if (logFile == NULL)
2850 exitErrorf(EXIT_FAILURE, true, "Could not write to %s",
2851 logFilename);
2852
2853 sprintf
2854 (statsLine, "Final graph has %ld nodes and n50 of %lld, max %lld, total %lld, using %ld/%ld reads\n",
2855 (long) nodeCount(graph), (long long) n50(graph), (long long) maxLength(graph),
2856 (long long) totalAssemblyLength(graph), (long) usedReads(graph, minContigKmerLength),
2857 (long) sequenceCount(graph));
2858
2859 fprintf(logFile, "%s", statsLine);
2860 fprintf(stdout, "%s", statsLine);
2861
2862 fclose(logFile);
2863 free(logFilename);
2864 free(statsLine);
2865 }
2866
exportUnusedReads(Graph * graph,ReadSet * reads,Coordinate minContigKmerLength,char * directory)2867 void exportUnusedReads(Graph* graph, ReadSet * reads, Coordinate minContigKmerLength, char* directory) {
2868 char *outFilename =
2869 mallocOrExit(strlen(directory) + 100, char);
2870 FILE * outfile;
2871 boolean * used = callocOrExit(sequenceCount(graph) + 1, boolean);
2872 IDnum nodeID, readID;
2873 Node * node;
2874 PassageMarker * marker;
2875 ShortReadMarker * shortReadArray, * shortReadMarker;
2876 IDnum shortReadCount, shortReadIndex;
2877 // Original
2878 IDnum numUnusedReads;
2879 // Original
2880
2881 strcpy(outFilename, directory);
2882 strcat(outFilename, "/UnusedReads.fa");
2883 outfile = fopen(outFilename, "w");
2884
2885 printf("Printing unused reads into %s\n", outFilename);
2886
2887 for(nodeID = 1; nodeID <= nodeCount(graph); nodeID++) {
2888 node = getNodeInGraph(graph, nodeID);
2889 if (node == NULL || getNodeLength(node) < minContigKmerLength)
2890 continue;
2891
2892 // Long reads
2893 for(marker = getMarker(node); marker != NULL; marker = getNextInNode(marker)) {
2894 readID = getPassageMarkerSequenceID(marker);
2895 if (readID < 0)
2896 readID = -readID;
2897 used[readID] = true;
2898 }
2899
2900 // Short reads
2901 if (!readStartsAreActivated(graph))
2902 continue;
2903
2904 shortReadArray = getNodeReads(node, graph);
2905 shortReadCount = getNodeReadCount(node, graph);
2906 for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
2907 shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
2908 readID = getShortReadMarkerID(shortReadMarker);
2909 used[readID] = true;
2910 }
2911
2912 shortReadArray = getNodeReads(getTwinNode(node), graph);
2913 shortReadCount = getNodeReadCount(getTwinNode(node), graph);
2914 for (shortReadIndex = 0; shortReadIndex < shortReadCount; shortReadIndex++) {
2915 shortReadMarker = getShortReadMarkerAtIndex(shortReadArray, shortReadIndex);
2916 readID = getShortReadMarkerID(shortReadMarker);
2917 used[readID] = true;
2918 }
2919 }
2920
2921 for (readID = 1; readID <= sequenceCount(graph); readID++)
2922 if (!used[readID]) {
2923 exportTightString(outfile, reads->tSequences[readID - 1], readID);
2924 numUnusedReads++;
2925 }
2926
2927 // Original
2928 //printf("%d\n", numUnusedReads);
2929 // Original
2930
2931 free(outFilename);
2932 free(used);
2933 fclose(outfile);
2934 }
2935
2936 // Original
getNodeDensity(Node * node)2937 double getNodeDensity(Node * node)
2938 {
2939 Coordinate nodeLength, nodeCoverage;
2940
2941 nodeLength = getNodeLength(node);
2942 nodeCoverage = (getVirtualCoverage(node, 0)
2943 + getVirtualCoverage(node, 1));
2944
2945 return nodeCoverage /(double) nodeLength;
2946 }
2947
makeDummySubgraphMask(Graph * graph)2948 int * makeDummySubgraphMask(Graph * graph)
2949 {
2950 int lenSubgraphMask = 2 * nodeCount(graph) + 1;
2951 int *subgraphMask = callocOrExit(lenSubgraphMask, int);
2952 int index;
2953
2954 for (index = 0; index < lenSubgraphMask; index++)
2955 subgraphMask[index] = 1;
2956
2957 return subgraphMask;
2958 }
2959
estimated_cov_multi(Graph * graph,int * subgraphMask,double expCovMulti[100])2960 int estimated_cov_multi(Graph * graph, int * subgraphMask, double expCovMulti[100])
2961 {
2962 double histo[LEN_HISTO_X];
2963 Node *node;
2964 int index, ecmIndex = 0;
2965 double binWidth = 0.2;
2966 int bin = 0;
2967 double peakCov = 0.0, peakHeight = 0.0;
2968 double lastPeakHeight = 0.0;
2969 double SNratio = 10, thresNoiseHeight = 0.0;
2970 int noiseCount = 0, thresNoiseCount = 5;
2971 double thresMinPeak = 2.0;
2972
2973 puts("Starting peak detection...");
2974
2975 // Initialize expCovMulti[] and histo[]
2976 for (index = 0; index < 100; index++)
2977 expCovMulti[index] = -1;
2978 for (index = 0; index < LEN_HISTO_X; index++)
2979 histo[index] = 0.0;
2980
2981 // Make histogram
2982 for (index = 1; index <= nodeCount(graph); index++) {
2983 if (subgraphMask[index + nodeCount(graph)] == 1) {
2984 node = getNodeInGraph(graph, index);
2985 node = getNodeInGraph(graph, index);
2986 if (node == NULL || getNodeLength(node) <= 0)
2987 continue;
2988 bin = (int) floor(getNodeDensity(node) / binWidth);
2989 if (bin >= LEN_HISTO_X - 1)
2990 bin = LEN_HISTO_X - 1;
2991 histo[bin] += getNodeLength(node);
2992 }
2993 }
2994
2995 // Define length threshold of noise
2996 // Skip index = 0 to avoid the influence of long reads
2997 for (index = LEN_HISTO_X - 2; index >= 1; index--) {
2998 if (histo[index] > peakHeight)
2999 peakHeight = histo[index];
3000 }
3001 thresNoiseHeight = peakHeight / (double) SNratio;
3002 peakHeight = 0.0;
3003
3004 // Detect peaks
3005 for (index = LEN_HISTO_X - 2; index >= 1; index--) {
3006 if (histo[index] > thresNoiseHeight) {
3007 if (histo[index] > peakHeight
3008 && histo[index] > lastPeakHeight) {
3009 peakHeight = histo[index];
3010 peakCov = (double) (index + 0.5) * binWidth;
3011 noiseCount = 0;
3012 continue;
3013 }
3014 else {
3015 noiseCount++;
3016 }
3017 }
3018 else {
3019 lastPeakHeight = 0.0;
3020 noiseCount++;
3021 }
3022
3023 if (peakHeight > 0.0 && noiseCount >= thresNoiseCount) {
3024 if (peakCov < thresMinPeak)
3025 break;
3026
3027 expCovMulti[ecmIndex++] = peakCov;
3028
3029 peakCov = 0.0;
3030 lastPeakHeight = peakHeight;
3031 peakHeight = 0.0;
3032 noiseCount = 0;
3033 }
3034 }
3035
3036 // Output detedted peaks
3037 if (ecmIndex == 0) {
3038 puts("Error!! Couldn't detect any peaks");
3039 exit(1);
3040 }
3041 for (index = 0; index < ecmIndex; index++)
3042 printf("Detected Peak Coverage : %f\n", expCovMulti[index]);
3043
3044 puts("Peak detection finished");
3045
3046 return ecmIndex;
3047 }
3048
eliminateNullNodes(Graph * graph,int * subgraphMask)3049 static void eliminateNullNodes(Graph * graph, int * subgraphMask)
3050 {
3051 Node *node;
3052 int index;
3053 int lenSubgraphMask = 2 * nodeCount(graph) + 1;
3054
3055 for (index = 0; index < lenSubgraphMask; index++) {
3056 node = getNodeInGraph(graph, index - nodeCount(graph));
3057 if (node == NULL || getNodeID(node) == 0)
3058 subgraphMask[index] = -2;
3059 }
3060 }
3061
checkLongReadExistence(Graph * graph)3062 static boolean checkLongReadExistence(Graph * graph)
3063 {
3064 int index;
3065
3066 for (index = 1; index <= nodeCount(graph); index++) {
3067 if (getMarker(getNodeInGraph(graph, index)) != NULL)
3068 return true;
3069 }
3070
3071 return false;
3072 }
3073
depthFirstSearchSubgraph(int currentIndex,Graph * graph,int * subgraphMask)3074 static void depthFirstSearchSubgraph(int currentIndex, Graph * graph, int * subgraphMask)
3075 {
3076 Arc *activeArc = NULL;
3077 int nextIndex = 0;
3078
3079 if (subgraphMask[currentIndex + nodeCount(graph)] == 0) {
3080 // Mark "Visiting"
3081 subgraphMask[currentIndex + nodeCount(graph)] = 1;
3082
3083 // Find "Unvisited" Node
3084 for (activeArc = getArc(getNodeInGraph(graph, currentIndex));
3085 activeArc != NULL; activeArc = getNextArc(activeArc)) {
3086 nextIndex = getNodeID(getDestination(activeArc));
3087 if (subgraphMask[nextIndex] == 0) {
3088 depthFirstSearchSubgraph(nextIndex, graph, subgraphMask);
3089 depthFirstSearchSubgraph((nextIndex * -1), graph, subgraphMask);
3090 }
3091 }
3092 }
3093 }
3094
resetUniqueness(Graph * graph)3095 void resetUniqueness(Graph * graph)
3096 {
3097 Node *node;
3098 int index;
3099
3100 for (index = 1; index <= nodeCount(graph); index++) {
3101 node = getNodeInGraph(graph, index);
3102 if (node == NULL)
3103 continue;
3104 setUniqueness(node, false);
3105 }
3106 }
3107
setSubgraphMask(int * subgraphMask,int lenSubgraphMask,int before,int after)3108 static void setSubgraphMask(int * subgraphMask, int lenSubgraphMask,
3109 int before, int after)
3110 {
3111 int index;
3112
3113 for (index = 0; index < lenSubgraphMask; index++) {
3114 if (subgraphMask[index] == before)
3115 subgraphMask[index] = after;
3116 }
3117 }
3118
getUnvisitedNodeID(int * subgraphMask,int lenSubgraphMask)3119 static int getUnvisitedNodeID(int * subgraphMask, int lenSubgraphMask)
3120 {
3121 int index;
3122
3123 for (index = 0; index < lenSubgraphMask; index++) {
3124 if (index == (lenSubgraphMask - 1) / 2)
3125 continue;
3126 if (subgraphMask[index] == 0)
3127 return index - (lenSubgraphMask - 1) / 2;
3128 }
3129
3130 // Visited all nodes
3131 return 0;
3132 }
3133
shelveSubgraphMask(int * subgraphMask,int lenSubgraphMask,int exception)3134 static void shelveSubgraphMask(int * subgraphMask, int lenSubgraphMask,
3135 int exception)
3136 {
3137 int index;
3138
3139 for (index = 0; index < lenSubgraphMask; index++) {
3140 if (subgraphMask[index] == exception)
3141 subgraphMask[index] = 0;
3142 else
3143 subgraphMask[index] += 100;
3144 }
3145 }
3146
unshelveSubgraphMask(int * subgraphMask,int lenSubgraphMask)3147 static void unshelveSubgraphMask(int * subgraphMask, int lenSubgraphMask)
3148 {
3149 int index;
3150
3151 for (index = 0; index < lenSubgraphMask; index++) {
3152 if (subgraphMask[index] >= 50)
3153 subgraphMask[index] -= 100;
3154 }
3155 }
3156
estimated_cov_subgraph(Graph * graph,int * subgraphMask,double expCovPandS[2],double rateChimericSubgraph)3157 static int estimated_cov_subgraph(Graph * graph, int * subgraphMask, double expCovPandS[2],
3158 double rateChimericSubgraph)
3159 {
3160 int nodeIndex;
3161 long int sumLenPrimary = 0, sumLenSecondary = 0, sumLenTotal;
3162 double perPrimary, perSecondary;
3163 Node *node;
3164 double cov = 0.0;
3165 double primary = expCovPandS[0], secondary = expCovPandS[1];
3166
3167 for (nodeIndex = 1; nodeIndex <= nodeCount(graph); nodeIndex++) {
3168 if (subgraphMask[nodeIndex + nodeCount(graph)] == 1) {
3169 node = getNodeInGraph(graph, nodeIndex);
3170 if (node == NULL)
3171 continue;
3172 cov = getNodeDensity(node);
3173 if (fabs(cov - primary) <= fabs(cov - secondary))
3174 sumLenPrimary += getNodeLength(node);
3175 else
3176 sumLenSecondary += getNodeLength(node);
3177 }
3178 }
3179
3180 sumLenTotal = sumLenPrimary + sumLenSecondary;
3181 perPrimary = (double) sumLenPrimary / sumLenTotal;
3182 perSecondary = (double) sumLenSecondary / sumLenTotal;
3183 if (perSecondary <= rateChimericSubgraph) {
3184 // Non Chimeric Subgraph, belongs to Primary Species
3185 return 1;
3186 } else if (perPrimary <= rateChimericSubgraph) {
3187 // Non Chimeric Subgraph, belongs to Secondary Species
3188 return -1;
3189 } else {
3190 // Chimeric Subgraph
3191 return 2;
3192 }
3193 }
3194
forceSeparateChimericSubgraph(Graph * graph,int * subgraphMask,double expCovPandS[2])3195 static void forceSeparateChimericSubgraph(Graph * graph, int * subgraphMask,
3196 double expCovPandS[2])
3197 {
3198 int maskIndex;
3199 int lenSubgraphMask = nodeCount(graph) * 2 + 1;
3200 Node *node;
3201 double cov, primary = expCovPandS[0], secondary = expCovPandS[1];
3202
3203 for (maskIndex = 0; maskIndex < lenSubgraphMask; maskIndex++) {
3204 if (subgraphMask[maskIndex] == 1) {
3205 node = getNodeInGraph(graph, maskIndex - nodeCount(graph));
3206 if (node == NULL)
3207 continue;
3208 cov = getNodeDensity(node);
3209 if (fabs(cov - primary) <= fabs(cov - secondary))
3210 subgraphMask[maskIndex] = 2;
3211 else
3212 subgraphMask[maskIndex] = -1;
3213 }
3214 }
3215 }
3216
judgeChimericSubgraph(Graph * graph,int * subgraphMask,double expCovPandS[2],double rateChimericSubgraph,boolean discardChimericSubgraph)3217 static void judgeChimericSubgraph(Graph * graph, int * subgraphMask, double expCovPandS[2],
3218 double rateChimericSubgraph, boolean discardChimericSubgraph)
3219 {
3220 int nodeIndex;
3221 int judgeResult = 0;
3222 int lenSubgraphMask = nodeCount(graph) * 2 + 1;
3223 boolean flagVisitedAllNodes = false;
3224 int numSubgraph = 0, thresNumSubgraph = lenSubgraphMask;
3225
3226 // Shelve not "Visiting" subgraphs
3227 shelveSubgraphMask(subgraphMask, lenSubgraphMask, 1);
3228
3229 while (!flagVisitedAllNodes) {
3230 // Check Infinite Loop
3231 numSubgraph++;
3232 if (numSubgraph >= thresNumSubgraph) {
3233 puts("Resolving Repeat Error!! Infinite Loop");
3234 free(subgraphMask);
3235 exit(1);
3236 }
3237
3238 // Choice starting unvisited node
3239 nodeIndex = getUnvisitedNodeID(subgraphMask, lenSubgraphMask);
3240
3241 printf("nodeIndex = %d\n", nodeIndex);
3242
3243 // Depth-first search (node & twin)
3244 depthFirstSearchSubgraph(nodeIndex, graph, subgraphMask);
3245 depthFirstSearchSubgraph((nodeIndex * -1), graph, subgraphMask);
3246
3247 // Estimate exp_cov in the Subgraph
3248 judgeResult = estimated_cov_subgraph(graph, subgraphMask, expCovPandS,
3249 rateChimericSubgraph);
3250
3251 if (judgeResult == 1) {
3252 printf("NonChimeric Subgraph, belongs to Primary Species\n");
3253 setSubgraphMask(subgraphMask, lenSubgraphMask, 1, 2);
3254 }
3255 else if (judgeResult == -1) {
3256 printf("NonChimeric Subgraph, belongs to Secondary Species\n");
3257 setSubgraphMask(subgraphMask, lenSubgraphMask, 1, -1);
3258 }
3259 else {
3260 printf("Chimeric Subgraph!\n");
3261 if (discardChimericSubgraph)
3262 setSubgraphMask(subgraphMask, lenSubgraphMask, 1, -2);
3263 else
3264 forceSeparateChimericSubgraph(graph, subgraphMask, expCovPandS);
3265 }
3266
3267 // Judge whether all nodes in Subgraph have visited or not
3268 if (getUnvisitedNodeID(subgraphMask, lenSubgraphMask) == 0)
3269 flagVisitedAllNodes = true;
3270 }
3271
3272 // Unshelve not "Visiting" subgraphs
3273 unshelveSubgraphMask(subgraphMask, lenSubgraphMask);
3274
3275 printf("Separating Chimeric Subgraphs Finished!\n");
3276 }
3277
checkPrimaryExpCovExistence(double expCovMulti[2],double expCovPandS[2])3278 static boolean checkPrimaryExpCovExistence(double expCovMulti[2], double expCovPandS[2])
3279 {
3280 int index;
3281 double expCov, primary = expCovPandS[0], secondary = expCovPandS[1];
3282
3283 if (secondary == -1)
3284 secondary = primary / (double) 2;
3285
3286 for (index = 0; index < 2; index++) {
3287 expCov = expCovMulti[index];
3288 if (expCov == -1)
3289 break;
3290 if (fabs(expCov - primary) <= fabs(expCov - secondary)) {
3291 printf("Primary exp_cov (%f <-> %f), Exist\n",
3292 primary, secondary);
3293 return true;
3294 }
3295 }
3296
3297 printf("Primary exp_cov (%f <-> %f), NOT exist\n", primary, secondary);
3298 return false;
3299 }
3300
judgeSkip(Graph * graph,int * subgraphMask)3301 static boolean judgeSkip(Graph * graph, int * subgraphMask)
3302 {
3303 int nodeIndex;
3304 int lenSubgraphMask = nodeCount(graph) * 2 + 1;
3305 int countSkip = 0;
3306 double insertLen = getInsertLength(graph, 1);
3307 double skipCandidateNodeLen = 0.0;
3308 Node *node = NULL;
3309
3310 for (nodeIndex = 1; nodeIndex < nodeCount(graph); nodeIndex++) {
3311 if (subgraphMask[nodeIndex + nodeCount(graph)] == 1) {
3312 countSkip++;
3313 node = getNodeInGraph(graph, nodeIndex);
3314 skipCandidateNodeLen = getNodeLength(node);
3315 }
3316 }
3317
3318 if (countSkip <= 1 && skipCandidateNodeLen < insertLen) {
3319 setSubgraphMask(subgraphMask, lenSubgraphMask, 1, -2);
3320 printf("Skipped\n");
3321 return true;
3322 }
3323 else
3324 return false;
3325 }
3326
printActiveNodes(int * subgraphMask,int lenSubgraphMask)3327 static void printActiveNodes(int * subgraphMask, int lenSubgraphMask)
3328 {
3329 int index;
3330
3331 printf("Active Nodes : ");
3332 for (index = 0; index < lenSubgraphMask; index++) {
3333 if (subgraphMask[index] == 2)
3334 printf("%d ", index - (lenSubgraphMask - 1) / 2);
3335 }
3336
3337 printf("\n");
3338 }
3339
resolveRepeatOfAllSubgraphs(Graph * graph,ReadSet * reads,double expCovMulti[100],boolean * dubious,boolean force_jumps,int argPebbleRounds,double rateChimericSubgraph,boolean discardChimericSubgraph,double repeatNodeCovSD)3340 void resolveRepeatOfAllSubgraphs(Graph * graph, ReadSet * reads, double expCovMulti[100],
3341 boolean * dubious, boolean force_jumps, int argPebbleRounds,
3342 double rateChimericSubgraph, boolean discardChimericSubgraph,
3343 double repeatNodeCovSD)
3344 {
3345 int nodeIndex = 1, ecmIndex = 0;
3346 int numSubgraph = 0;
3347 int thresNumSubgraph = nodeCount(graph) * 2;
3348 int lenSubgraphMask = 2 * nodeCount(graph) + 1;
3349 int *subgraphMask = callocOrExit(lenSubgraphMask, int);
3350 double expCovSubgraph = 0.0;
3351 double expCovPandS[2];
3352 int numPeaks = 0;
3353 int countInterRepeatLoop = 0, thresInterRepeatLoop = 20;
3354 int pebbleRounds = argPebbleRounds;
3355 boolean flagLongRead = false, flagVisitedAllNodes = false;
3356
3357 puts("\nResolving Repeats for each subgraph\n");
3358
3359 // Eliminate NULL nodes
3360 eliminateNullNodes(graph, subgraphMask);
3361
3362 // Check whether long reads are in the input sequences
3363 flagLongRead = checkLongReadExistence(graph);
3364
3365 // Print Expected Coverages
3366 for (ecmIndex = 0; expCovMulti[ecmIndex] > 0.001; ecmIndex++)
3367 printf("Expected Coverage %d : %f\n", ecmIndex+1, expCovMulti[ecmIndex]);
3368 ecmIndex = 0;
3369
3370 /*
3371 // Detect peaks from whole Graph
3372 setSubgraphMask(subgraphMask, lenSubgraphMask, 0, 1);
3373 estimated_cov_multi(graph, subgraphMask, expCovMulti);
3374 setSubgraphMask(subgraphMask, lenSubgraphMask, 1, 0);
3375 */
3376
3377 while (!flagVisitedAllNodes) {
3378 // Set expCovPandS
3379 if (expCovMulti[ecmIndex] != -1) {
3380 expCovPandS[0] = expCovMulti[ecmIndex++];
3381 expCovPandS[1] = expCovMulti[ecmIndex];
3382 }
3383 printf("\nPrimary exp_cov : %f\n", expCovPandS[0]);
3384
3385 // Resolve repeats for each Subgraph
3386 while (true) {
3387 // Check Infinite Loop
3388 numSubgraph++;
3389 if (numSubgraph >= thresNumSubgraph) {
3390 puts("Resolving Repeat Error!! Infinite Loop");
3391 free(subgraphMask);
3392 exit(1);
3393 }
3394
3395 // Choice starting unvisited node
3396 nodeIndex = getUnvisitedNodeID(subgraphMask, lenSubgraphMask);
3397
3398 printf("nodeIndex = %d\n", nodeIndex);
3399
3400 // Depth-first search (node & twin)
3401 depthFirstSearchSubgraph(nodeIndex, graph, subgraphMask);
3402 depthFirstSearchSubgraph((nodeIndex * -1), graph, subgraphMask);
3403
3404 // Estimate the number of peaks
3405 numPeaks = estimated_cov_subgraph(graph, subgraphMask, expCovPandS,
3406 rateChimericSubgraph);
3407
3408 // Judge whether the Subgraph is chimeric or not
3409 if (numPeaks >= 2) {
3410 puts("Multiple Peaks Detected!");
3411 // Identify and Separate InterRepeats
3412 while (identifyAndSeparateInterRepeats(graph, expCovPandS,
3413 repeatNodeCovSD)) {
3414 // Check Infinite Loop
3415 if (countInterRepeatLoop++ >= thresInterRepeatLoop) {
3416 puts("Force-quitted to Identify InterRepeats");
3417 eliminateNullNodes(graph, subgraphMask);
3418 break;
3419 //puts("Identifying InterRepeat Error! Infinite Loop");
3420 //free(subgraphMask);
3421 //exit(1);
3422 }
3423 // Eliminate NULL nodes
3424 eliminateNullNodes(graph, subgraphMask);
3425 }
3426 // Judge whether each Subgraph is chimeric or not
3427 judgeChimericSubgraph(graph, subgraphMask, expCovPandS,
3428 rateChimericSubgraph, discardChimericSubgraph);
3429 }
3430 else if (numPeaks == 1)
3431 setSubgraphMask(subgraphMask, lenSubgraphMask, 1, 2);
3432
3433 // Judge whether all nodes in Subgraphs have visited or not
3434 if (getUnvisitedNodeID(subgraphMask, lenSubgraphMask) != 0) {
3435 printf("Unvisited Node : %d\n",
3436 getUnvisitedNodeID(subgraphMask, lenSubgraphMask));
3437 continue;
3438 }
3439 else {
3440 printf("\nGo to Assembly!\n");
3441 //printActiveNodes(subgraphMask, lenSubgraphMask);
3442 expCovSubgraph = expCovPandS[0];
3443 printf("exp_cov = %f\n", expCovSubgraph);
3444 }
3445
3446 // -------------------- Assemble in the Subgraph --------------------
3447 // Judge unique or repeat
3448 identifyUniqueNodesSubgraph(graph, subgraphMask,
3449 isUniqueSolexaSubgraph, expCovSubgraph);
3450 // Rock Band in the Subgraph
3451 if (flagLongRead)
3452 readCoherentSubgraph(graph, expCovSubgraph, reads, subgraphMask);
3453 // Pebble in the Subgraph
3454 for (pebbleRounds = argPebbleRounds; pebbleRounds > 0; pebbleRounds--)
3455 exploitShortReadPairs(graph, reads, dubious, force_jumps);
3456 // Print "Finished"
3457 printf("Subgraph Assembly Finished!\n\n");
3458 // ------------------------------------------------------------------
3459
3460 // Eliminate NULL Nodes
3461 eliminateNullNodes(graph, subgraphMask);
3462
3463 // Reset uniqueness
3464 resetUniqueness(graph);
3465
3466 // Set "2" -> "-2", "-1" -> "0"
3467 setSubgraphMask(subgraphMask, lenSubgraphMask, 2, -2);
3468 setSubgraphMask(subgraphMask, lenSubgraphMask, -1, 0);
3469
3470 // Judge whether all nodes in Graph have visited or not
3471 if (getUnvisitedNodeID(subgraphMask, lenSubgraphMask) == 0)
3472 flagVisitedAllNodes = true;
3473 break;
3474 }
3475 }
3476
3477 // Resolved Successfully
3478 puts("Resolved Successfully!\n");
3479 free(subgraphMask);
3480 }
3481 // Original
3482