1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3
4 This file is part of Velvet.
5
6 Velvet is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 Velvet is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with Velvet; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <time.h>
24 #include <math.h>
25
26 #include "globals.h"
27 #include "graph.h"
28 #include "concatenatedGraph.h"
29 #include "recycleBin.h"
30 #include "locallyCorrectedGraph.h"
31 #include "passageMarker.h"
32 #include "readSet.h"
33 #include "utility.h"
34 #include "scaffold.h"
35
36 #define BLOCK_SIZE 100000
37 #define LN2 1.4
38
39 typedef struct readOccurence_st ReadOccurence;
40
41 struct connection_st {
42 Node *destination;
43 Connection *next;
44 Connection *previous;
45 Connection *twin;
46 double distance;
47 double variance;
48 IDnum direct_count;
49 IDnum paired_count;
50 };
51
52 struct readOccurence_st {
53 Coordinate position;
54 Coordinate offset;
55 IDnum nodeID;
56 };
57
58 // Global params
59 static IDnum UNRELIABLE_CONNECTION_CUTOFF = 5;
60
61 // Global pointers
62 static Graph *graph;
63 static Connection **scaffold = NULL;
64 static RecycleBin *connectionMemory = NULL;
65 static boolean estimated[CATEGORIES + 1];
66
allocateConnection()67 static Connection *allocateConnection()
68 {
69 if (connectionMemory == NULL)
70 connectionMemory =
71 newRecycleBin(sizeof(Connection), BLOCK_SIZE);
72
73 return allocatePointer(connectionMemory);
74 }
75
deallocateConnection(Connection * connect)76 static void deallocateConnection(Connection * connect)
77 {
78 deallocatePointer(connectionMemory, connect);
79 }
80
getConnectionDestination(Connection * connect)81 Node * getConnectionDestination(Connection * connect) {
82 return connect->destination;
83 }
84
getNextConnection(Connection * connect)85 Connection * getNextConnection(Connection * connect) {
86 return connect->next;
87 }
88
getTwinConnection(Connection * connect)89 Connection * getTwinConnection(Connection * connect) {
90 return connect->twin;
91 }
92
getConnectionDistance(Connection * connect)93 Coordinate getConnectionDistance(Connection * connect) {
94 return (Coordinate) connect->distance;
95 }
96
getConnectionVariance(Connection * connect)97 double getConnectionVariance(Connection * connect) {
98 return connect->variance;
99 }
100
getConnectionDirectCount(Connection * connect)101 IDnum getConnectionDirectCount(Connection * connect) {
102 return connect->direct_count;
103 }
104
getConnectionPairedCount(Connection * connect)105 IDnum getConnectionPairedCount(Connection * connect) {
106 return connect->paired_count;
107 }
108
getConnection(Node * node)109 Connection * getConnection(Node * node) {
110 return scaffold[getNodeID(node) + nodeCount(graph)];
111 }
112
incrementConnectionDistance(Connection * connect,Coordinate increment)113 void incrementConnectionDistance(Connection * connect, Coordinate increment) {
114 connect->distance += increment;
115 }
116
norm(double X)117 static double norm(double X)
118 {
119 return 0.4 * exp(-X * X / 2);
120 }
121
normInt(double X,double Y)122 static double normInt(double X, double Y)
123 {
124 return (erf(0.7 * Y) - erf(0.7 * X)) / 2;
125 }
126
expectedNumberOfConnections(IDnum IDA,Connection * connect,IDnum ** counts,Category cat)127 static IDnum expectedNumberOfConnections(IDnum IDA, Connection * connect,
128 IDnum ** counts, Category cat)
129 {
130 Node *A = getNodeInGraph(graph, IDA);
131 Node *B = connect->destination;
132 double left, middle, right;
133 Coordinate longLength, shortLength, D;
134 IDnum longCount;
135 double M, N, O, P;
136 Coordinate mu = getInsertLength(graph, cat);
137 double sigma = sqrt(getInsertLength_var(graph, cat));
138 double result;
139
140 if (mu <= 0)
141 return 0;
142
143 if (getNodeLength(A) < getNodeLength(B)) {
144 longLength = getNodeLength(B);
145 shortLength = getNodeLength(A);
146 longCount = counts[cat][getNodeID(B) + nodeCount(graph)];
147 } else {
148 longLength = getNodeLength(A);
149 shortLength = getNodeLength(B);
150 longCount = counts[cat][IDA + nodeCount(graph)];
151 }
152
153 D = getConnectionDistance(connect) - (longLength + shortLength) / 2;
154
155 M = (D - mu) / sigma;
156 N = (D + shortLength - mu) / sigma;
157 O = (D + longLength - mu) / sigma;
158 P = (D + shortLength + longLength - mu) / sigma;
159
160 left = ((norm(M) - norm(N)) - M * normInt(M, N)) * sigma;
161 middle = shortLength * normInt(N, O);
162 right = ((norm(O) - norm(P)) - P * normInt(O, P)) * (-sigma);
163
164 result = (longCount * (left + middle + right)) / longLength;
165
166 if (result > 0)
167 return (IDnum) result;
168 else
169 return 0;
170 }
171
destroyConnection(Connection * connect,IDnum nodeID)172 void destroyConnection(Connection * connect, IDnum nodeID)
173 {
174 Connection *previous, *next;
175
176 //printf("Destroying connection from %li to %li\n", nodeID, getNodeID(connect->destination));
177
178 if (connect == NULL)
179 return;
180
181 previous = connect->previous;
182 next = connect->next;
183
184 if (previous != NULL)
185 previous->next = next;
186 if (next != NULL)
187 next->previous = previous;
188
189 if (scaffold[nodeID + nodeCount(graph)] == connect)
190 scaffold[nodeID + nodeCount(graph)] = next;
191
192 if (connect->twin != NULL) {
193 connect->twin->twin = NULL;
194 destroyConnection(connect->twin,
195 getNodeID(connect->destination));
196 }
197
198 deallocateConnection(connect);
199 }
200
testConnection(IDnum IDA,Connection * connect,IDnum ** counts)201 static boolean testConnection(IDnum IDA, Connection * connect,
202 IDnum ** counts)
203 {
204 IDnum total = 0;
205 Category cat;
206
207 // Spare unique -> undetermined node connections
208 if (!getUniqueness(connect->destination))
209 return true;
210
211 // Destroy tenuous connections
212 if (connect->paired_count + connect->direct_count <
213 UNRELIABLE_CONNECTION_CUTOFF)
214 return false;
215
216 for (cat = 0; cat <= CATEGORIES; cat++)
217 total +=
218 expectedNumberOfConnections(IDA, connect, counts, cat);
219
220 // Remove inconsistent connections
221 return connect->paired_count >= total / 10;
222 }
223
computeReadToNodeCounts()224 static IDnum *computeReadToNodeCounts()
225 {
226 IDnum readIndex, nodeIndex;
227 IDnum maxNodeIndex = 2 * nodeCount(graph) + 1;
228 IDnum maxReadIndex = sequenceCount(graph) + 1;
229 IDnum *readNodeCounts = callocOrExit(maxReadIndex, IDnum);
230 boolean *readMarker = callocOrExit(maxReadIndex, boolean);
231 ShortReadMarker *nodeArray, *shortMarker;
232 PassageMarker *marker;
233 Node *node;
234 IDnum nodeReadCount;
235
236 // Original
237 /*
238 puts("Computing read to node mapping array sizes");
239 */
240 // Original
241
242 for (nodeIndex = 0; nodeIndex < maxNodeIndex; nodeIndex++) {
243 node = getNodeInGraph(graph, nodeIndex - nodeCount(graph));
244 if (node == NULL)
245 continue;
246 nodeArray = getNodeReads(node, graph);
247 nodeReadCount = getNodeReadCount(node, graph);
248
249 // Short reads
250 for (readIndex = 0; readIndex < nodeReadCount; readIndex++) {
251 shortMarker =
252 getShortReadMarkerAtIndex(nodeArray,
253 readIndex);
254 readNodeCounts[getShortReadMarkerID
255 (shortMarker)]++;
256 }
257
258 // Long reads
259 for (marker = getMarker(node); marker != NULL;
260 marker = getNextInNode(marker)) {
261 readIndex = getPassageMarkerSequenceID(marker);
262 if (readIndex < 0)
263 continue;
264
265 if (readMarker[readIndex])
266 continue;
267
268 readNodeCounts[readIndex]++;
269 readMarker[readIndex] = true;
270 }
271
272 // Clean up marker array
273 for (marker = getMarker(node); marker != NULL;
274 marker = getNextInNode(marker)) {
275 readIndex = getPassageMarkerSequenceID(marker);
276 if (readIndex > 0)
277 readMarker[readIndex] = false;
278 }
279 }
280
281 free(readMarker);
282 return readNodeCounts;
283 }
284
allocateReadToNodeTables(IDnum * readNodeCounts)285 static ReadOccurence **allocateReadToNodeTables(IDnum * readNodeCounts)
286 {
287 IDnum readIndex;
288 IDnum maxReadIndex = sequenceCount(graph) + 1;
289 ReadOccurence **readNodes =
290 callocOrExit(maxReadIndex, ReadOccurence *);
291
292 for (readIndex = 1; readIndex < maxReadIndex; readIndex++) {
293 if (readNodeCounts[readIndex] != 0) {
294 readNodes[readIndex] =
295 callocOrExit(readNodeCounts[readIndex],
296 ReadOccurence);
297 readNodeCounts[readIndex] = 0;
298 }
299 }
300
301 return readNodes;
302 }
303
computePartialReadToNodeMapping(IDnum nodeID,ReadOccurence ** readNodes,IDnum * readNodeCounts,boolean * readMarker)304 static void computePartialReadToNodeMapping(IDnum nodeID,
305 ReadOccurence ** readNodes,
306 IDnum * readNodeCounts,
307 boolean * readMarker)
308 {
309 ShortReadMarker *shortMarker;
310 IDnum index, readIndex;
311 ReadOccurence *readArray, *readOccurence;
312 Node *node = getNodeInGraph(graph, nodeID);
313 ShortReadMarker *nodeArray = getNodeReads(node, graph);
314 IDnum nodeReadCount = getNodeReadCount(node, graph);
315 PassageMarker *marker;
316
317 for (index = 0; index < nodeReadCount; index++) {
318 shortMarker = getShortReadMarkerAtIndex(nodeArray, index);
319 readIndex = getShortReadMarkerID(shortMarker);
320 readArray = readNodes[readIndex];
321 readOccurence = &readArray[readNodeCounts[readIndex]];
322 readOccurence->nodeID = nodeID;
323 readOccurence->position =
324 getShortReadMarkerPosition(shortMarker);
325 readOccurence->offset =
326 getShortReadMarkerOffset(shortMarker);
327 readNodeCounts[readIndex]++;
328 }
329
330 for (marker = getMarker(node); marker != NULL;
331 marker = getNextInNode(marker)) {
332 readIndex = getPassageMarkerSequenceID(marker);
333 if (readIndex < 0)
334 continue;
335
336 if (!readMarker[readIndex]) {
337 readArray = readNodes[readIndex];
338 readOccurence =
339 &readArray[readNodeCounts[readIndex]];
340 readOccurence->nodeID = nodeID;
341 readOccurence->position = getStartOffset(marker);
342 readOccurence->offset =
343 getPassageMarkerStart(marker);
344 readNodeCounts[readIndex]++;
345 readMarker[readIndex] = true;
346 } else {
347 readArray = readNodes[readIndex];
348 readOccurence =
349 &readArray[readNodeCounts[readIndex] - 1];
350 readOccurence->position = -1;
351 readOccurence->offset = -1;
352 }
353 }
354
355 for (marker = getMarker(node); marker != NULL;
356 marker = getNextInNode(marker)) {
357 readIndex = getPassageMarkerSequenceID(marker);
358 if (readIndex > 0)
359 readMarker[readIndex] = false;
360 }
361 }
362
computeReadToNodeMappings(IDnum * readNodeCounts)363 static ReadOccurence **computeReadToNodeMappings(IDnum * readNodeCounts)
364 {
365 IDnum nodeID;
366 IDnum nodes = nodeCount(graph);
367 ReadOccurence **readNodes =
368 allocateReadToNodeTables(readNodeCounts);
369 boolean *readMarker =
370 callocOrExit(sequenceCount(graph) + 1, boolean);
371
372 // Original
373 /*
374 puts("Computing read to node mappings");
375 */
376 // Original
377
378 for (nodeID = -nodes; nodeID <= nodes; nodeID++)
379 if (nodeID != 0 && getNodeInGraph(graph, nodeID))
380 computePartialReadToNodeMapping(nodeID, readNodes,
381 readNodeCounts,
382 readMarker);
383
384 free(readMarker);
385 return readNodes;
386 }
387
countCoOccurences(IDnum * coOccurencesCount,ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats)388 static boolean * countCoOccurences(IDnum * coOccurencesCount, ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) {
389 IDnum readIndex, readPairIndex;
390 IDnum readNodeCount;
391 IDnum readOccurenceIndex, readPairOccurenceIndex;
392 ReadOccurence * readOccurence, *readPairOccurence;
393 boolean * interestingReads = callocOrExit(sequenceCount(graph), boolean);
394 Category libID;
395
396 for (libID = 0; libID < CATEGORIES + 1; libID++)
397 coOccurencesCount[libID] = 0;
398
399 for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) {
400 // Eliminating dodgy, unpaired, already counted or user-specified reads
401 if ( readPairs[readIndex] < readIndex
402 || getInsertLength(graph, cats[readIndex]) > -1)
403 continue;
404
405 // Check for co-occurence
406 // We know that for each read the read occurences are ordered by increasing node ID
407 // Therefore one list is followed by increasing index, whereas the other is followed
408 // by decreasing index
409 libID = cats[readIndex]/2;
410 readPairIndex = readPairs[readIndex];
411
412 readOccurenceIndex = 0;
413 readOccurence = readNodes[readIndex + 1];
414 readNodeCount = readNodeCounts[readIndex + 1];
415
416 readPairOccurenceIndex = readNodeCounts[readPairIndex + 1] - 1;
417 readPairOccurence = &(readNodes[readPairIndex + 1][readPairOccurenceIndex]);
418
419 while (readOccurenceIndex < readNodeCount && readPairOccurenceIndex >= 0) {
420 if (readOccurence->nodeID == -readPairOccurence->nodeID) {
421 if (readOccurence->position > 0 && readPairOccurence->position > 0) {
422 coOccurencesCount[libID]++;
423 interestingReads[readIndex] = true;
424 break;
425 } else {
426 readOccurence++;
427 readOccurenceIndex++;
428 readPairOccurence--;
429 readPairOccurenceIndex--;
430 }
431 } else if (readOccurence->nodeID < -readPairOccurence->nodeID) {
432 readOccurence++;
433 readOccurenceIndex++;
434 } else {
435 readPairOccurence--;
436 readPairOccurenceIndex--;
437 }
438 }
439 }
440
441 return interestingReads;
442 }
443
measureCoOccurences(Coordinate ** coOccurences,boolean * interestingReads,ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats)444 static void measureCoOccurences(Coordinate ** coOccurences, boolean * interestingReads, ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) {
445 IDnum coOccurencesIndex[CATEGORIES + 1];
446 IDnum observationIndex;
447 IDnum readIndex, readPairIndex;
448 IDnum readNodeCount;
449 IDnum readOccurenceIndex, readPairOccurenceIndex;
450 ReadOccurence * readOccurence, *readPairOccurence;
451 Category libID;
452
453 for (libID = 0; libID < CATEGORIES + 1; libID++)
454 coOccurencesIndex[libID] = 0;
455
456 for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) {
457 // Eliminating dodgy, unpaired, already counted or user-specified reads
458 if (!interestingReads[readIndex])
459 continue;
460
461 // Find co-occurence
462 // We know that for each read the read occurences are ordered by increasing node ID
463 libID = cats[readIndex]/2;
464 readPairIndex = readPairs[readIndex];
465 observationIndex = coOccurencesIndex[libID];
466
467 readOccurence = readNodes[readIndex + 1];
468 readOccurenceIndex = 0;
469 readNodeCount = readNodeCounts[readIndex + 1];
470
471 readPairOccurenceIndex = readNodeCounts[readPairIndex + 1] - 1;
472 readPairOccurence = &(readNodes[readPairIndex + 1][readPairOccurenceIndex]);
473
474 while (readOccurenceIndex < readNodeCount && readPairOccurenceIndex >= 0) {
475 if (readOccurence->nodeID == -readPairOccurence->nodeID) {
476 if (readOccurence->position > 0 && readPairOccurence->position > 0) {
477 coOccurences[libID][observationIndex] =
478 getNodeLength(getNodeInGraph(graph, readOccurence->nodeID))
479 + getWordLength(graph) - 1
480 - (readOccurence->position - readOccurence->offset)
481 - (readPairOccurence->position - readPairOccurence->offset);
482 coOccurencesIndex[libID]++;
483 break;
484 } else {
485 readOccurence++;
486 readOccurenceIndex++;
487 readPairOccurence--;
488 readPairOccurenceIndex--;
489 }
490 } else if (readOccurence->nodeID < -readPairOccurence->nodeID) {
491 readOccurence++;
492 readOccurenceIndex++;
493 } else {
494 readPairOccurence--;
495 readPairOccurenceIndex--;
496 }
497 }
498 }
499 }
500
compareReadOccurences(const void * A,const void * B)501 int compareReadOccurences(const void *A, const void * B) {
502 Coordinate * cA = (Coordinate *) A;
503 Coordinate * cB = (Coordinate *) B;
504
505 if (*cA > *cB)
506 return 1;
507 if (*cA == *cB)
508 return 0;
509 return -1;
510 }
511
estimateLibraryInsertLength(Coordinate * coOccurences,IDnum coOccurencesCount,Category libID)512 static void estimateLibraryInsertLength(Coordinate * coOccurences, IDnum coOccurencesCount, Category libID) {
513 Coordinate median, variance;
514 IDnum index;
515 int counter = 0;
516 qsort(coOccurences, coOccurencesCount, sizeof(Coordinate), compareReadOccurences);
517
518 median = coOccurences[coOccurencesCount / 2];
519
520 // Modified variance around the median (proxy for expected value)
521 // interval censoring
522 variance = 0;
523 for (index = 0; index < coOccurencesCount; index++) {
524 if (coOccurences[index] > 0 && coOccurences[index] < 5 * median) {
525 variance += (coOccurences[index] - median) * (coOccurences[index] - median);
526 counter++;
527 }
528 }
529 if (counter)
530 variance /= counter;
531 else {
532 variance = 0;
533 for (index = 0; index < coOccurencesCount; index++)
534 variance += (coOccurences[index] - median) * (coOccurences[index] - median);
535 variance /= coOccurencesCount;
536 }
537
538 // To avoid subsequent divisions by zero
539 if (variance == 0)
540 variance = 1;
541
542 printf("Paired-end library %i has length: %lli, sample standard deviation: %lli\n", libID + 1, (long long) median, (long long) sqrt(variance));
543 setInsertLengths(graph, libID, median, sqrt(variance));
544 estimated[libID] = true;
545 }
546
estimateLibraryInsertLengths(Coordinate ** coOccurences,IDnum * coOccurencesCounts)547 static void estimateLibraryInsertLengths(Coordinate ** coOccurences, IDnum * coOccurencesCounts) {
548 Category libID;
549
550 for (libID = 0; libID < CATEGORIES + 1; libID++)
551 estimated[libID] = false;
552
553 for (libID = 0; libID < CATEGORIES + 1; libID++)
554 if (coOccurencesCounts[libID] > 0)
555 estimateLibraryInsertLength(coOccurences[libID], coOccurencesCounts[libID], libID);
556 }
557
estimateMissingInsertLengths(ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats)558 static void estimateMissingInsertLengths(ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) {
559 Coordinate * coOccurences[CATEGORIES + 1];
560 IDnum coOccurencesCounts[CATEGORIES + 1];
561 Category libID;
562
563 // Original
564 /*
565 puts("Estimating library insert lengths...");
566 */
567 // Original
568
569 boolean * interestingReads = countCoOccurences(coOccurencesCounts, readNodes, readNodeCounts, readPairs, cats);
570
571 for (libID = 0; libID < CATEGORIES + 1; libID++)
572 coOccurences[libID] = callocOrExit(coOccurencesCounts[libID], Coordinate);
573
574 measureCoOccurences(coOccurences, interestingReads, readNodes, readNodeCounts, readPairs, cats);
575 estimateLibraryInsertLengths(coOccurences, coOccurencesCounts);
576
577 for (libID = 0; libID < CATEGORIES + 1; libID++)
578 free(coOccurences[libID]);
579
580 free(interestingReads);
581
582 // Original
583 /*
584 puts("Done");
585 */
586 // Original
587 }
588
findConnection(IDnum nodeID,IDnum node2ID)589 static Connection *findConnection(IDnum nodeID, IDnum node2ID)
590 {
591 Node *node2 = getNodeInGraph(graph, node2ID);
592 Connection *connect;
593
594 if (node2 == NULL)
595 return NULL;
596
597 for (connect = scaffold[nodeID + nodeCount(graph)];
598 connect != NULL; connect = connect->next)
599 if (connect->destination == node2)
600 break;
601
602 return connect;
603 }
604
createTwinConnection(IDnum nodeID,IDnum node2ID,Connection * connect)605 static void createTwinConnection(IDnum nodeID, IDnum node2ID,
606 Connection * connect)
607 {
608 Connection *newConnection = allocateConnection();
609 IDnum nodeIndex = nodeID + nodeCount(graph);
610
611 // Fill in
612 newConnection->distance = connect->distance;
613 newConnection->variance = connect->variance;
614 newConnection->direct_count = connect->direct_count;
615 newConnection->paired_count = connect->paired_count;
616 newConnection->destination = getNodeInGraph(graph, node2ID);
617
618 // Batch to twin
619 newConnection->twin = connect;
620 connect->twin = newConnection;
621
622 // Insert in scaffold
623 newConnection->previous = NULL;
624 newConnection->next = scaffold[nodeIndex];
625 if (scaffold[nodeIndex] != NULL)
626 scaffold[nodeIndex]->previous = newConnection;
627 scaffold[nodeIndex] = newConnection;
628 }
629
createNewConnection(IDnum nodeID,IDnum node2ID,IDnum direct_count,IDnum paired_count,Coordinate distance,double variance)630 Connection *createNewConnection(IDnum nodeID, IDnum node2ID,
631 IDnum direct_count,
632 IDnum paired_count,
633 Coordinate distance,
634 double variance)
635 {
636 Node *destination = getNodeInGraph(graph, node2ID);
637 IDnum nodeIndex = nodeID + nodeCount(graph);
638 Connection *connect = allocateConnection();
639
640 // Fill in
641 connect->destination = destination;
642 connect->direct_count = direct_count;
643 connect->paired_count = paired_count;
644 connect->distance = (double) distance;
645 connect->variance = variance;
646
647 // Insert in scaffold
648 connect->previous = NULL;
649 connect->next = scaffold[nodeIndex];
650 if (scaffold[nodeIndex] != NULL)
651 scaffold[nodeIndex]->previous = connect;
652 scaffold[nodeIndex] = connect;
653
654 // Event. pair up to twin
655 if (getUniqueness(destination))
656 createTwinConnection(node2ID, nodeID, connect);
657 else
658 connect->twin = NULL;
659
660 return connect;
661 }
662
readjustConnection(Connection * connect,Coordinate distance,double variance,IDnum direct_count,IDnum paired_count)663 void readjustConnection(Connection * connect, Coordinate distance,
664 double variance, IDnum direct_count,
665 IDnum paired_count)
666 {
667 connect->direct_count += direct_count;
668 connect->paired_count += paired_count;
669
670 connect->distance =
671 (variance * connect->distance +
672 distance * connect->variance) / (variance +
673 connect->variance);
674 connect->variance =
675 (variance *
676 connect->variance) / (variance + connect->variance);
677
678 if (connect->twin != NULL) {
679 connect->twin->distance = connect->distance;
680 connect->twin->variance = connect->variance;
681 connect->twin->direct_count = connect->direct_count;
682 connect->twin->paired_count = connect->paired_count;
683 }
684 }
685
createConnection(IDnum nodeID,IDnum node2ID,IDnum direct_count,IDnum paired_count,Coordinate distance,double variance)686 static void createConnection(IDnum nodeID, IDnum node2ID,
687 IDnum direct_count,
688 IDnum paired_count,
689 Coordinate distance, double variance)
690 {
691 Connection *connect = findConnection(nodeID, node2ID);
692
693 if (connect != NULL)
694 readjustConnection(connect, distance, variance,
695 direct_count, paired_count);
696 else
697 createNewConnection(nodeID, node2ID, direct_count,
698 paired_count, distance, variance);
699 }
700
projectFromSingleRead(Node * node,ReadOccurence * readOccurence,Coordinate position,Coordinate offset,Coordinate length)701 static void projectFromSingleRead(Node * node,
702 ReadOccurence * readOccurence,
703 Coordinate position,
704 Coordinate offset, Coordinate length)
705 {
706 Coordinate distance = 0;
707 Node *target = getNodeInGraph(graph, -readOccurence->nodeID);
708 double variance = 1;
709
710 if (target == getTwinNode(node) || target == node)
711 return;
712
713 if (position < 0) {
714 variance += getNodeLength(node) * getNodeLength(node) / 16;
715 // distance += 0;
716 } else {
717 // variance += 0;
718 distance += position - getNodeLength(node) / 2;
719 }
720
721 if (readOccurence->position < 0) {
722 variance +=
723 getNodeLength(target) * getNodeLength(target) / 16;
724 //distance += 0;
725 } else {
726 // variance += 0;
727 distance +=
728 -readOccurence->position + getNodeLength(target) / 2;
729 }
730
731 if (readOccurence->offset < 0 || offset < 0) {
732 variance += length * length / 16;
733 //distance += 0;
734 } else {
735 // variance += 0;
736 distance += readOccurence->offset - offset;
737 }
738
739 // Relative ordering
740 if (offset > 0 && readOccurence->offset > 0) {
741 if (offset < readOccurence->offset) {
742 if (distance - getNodeLength(node)/2 - getNodeLength(target)/2 < -10)
743 ;
744 else if (distance < getNodeLength(node)/2 + getNodeLength(target)/2)
745 createConnection(getNodeID(node), getNodeID(target), 1, 0,
746 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
747 else
748 createConnection(getNodeID(node), getNodeID(target), 1, 0,
749 distance, variance);
750 } else if (offset > readOccurence->offset) {
751 if (-distance - getNodeLength(node)/2 - getNodeLength(target)/2 < -10)
752 ;
753 else if (-distance < getNodeLength(node)/2 + getNodeLength(target)/2)
754 createConnection(-getNodeID(node), -getNodeID(target), 1,
755 0, getNodeLength(node)/2 + getNodeLength(target)/2 , variance);
756 else
757 createConnection(-getNodeID(node), -getNodeID(target), 1,
758 0, -distance, variance);
759 }
760 } else if (offset > 0 && position > 0) {
761 if (distance - offset > -getNodeLength(node)/2 && distance - offset + length > getNodeLength(node)/2)
762 createConnection(getNodeID(node), getNodeID(target), 1, 0,
763 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
764 else if (distance - offset < -getNodeLength(node)/2 && distance - offset + length < getNodeLength(node)/2)
765 createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
766 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
767 else {
768 createConnection(getNodeID(node), getNodeID(target), 1, 0,
769 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
770 createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
771 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
772 }
773 } else if (readOccurence->offset > 0 && readOccurence->position > 0) {
774 if (-distance - readOccurence->offset > -getNodeLength(target)/2 && -distance - readOccurence->offset + length > getNodeLength(target)/2)
775 createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
776 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
777 if (-distance - readOccurence->offset < -getNodeLength(target)/2 && -distance - readOccurence->offset + length < getNodeLength(target)/2)
778 createConnection(getNodeID(node), getNodeID(target), 1, 0,
779 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
780 else {
781 createConnection(getNodeID(node), getNodeID(target), 1, 0,
782 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
783 createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
784 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
785 }
786 } else {
787 createConnection(getNodeID(node), getNodeID(target), 1, 0,
788 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
789 createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
790 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
791 }
792 }
793
projectFromReadPair(Node * node,ReadOccurence * readOccurence,Coordinate position,Coordinate offset,Coordinate insertLength,double insertVariance)794 static void projectFromReadPair(Node * node, ReadOccurence * readOccurence,
795 Coordinate position, Coordinate offset,
796 Coordinate insertLength,
797 double insertVariance)
798 {
799 Coordinate distance = insertLength;
800 Coordinate variance = insertVariance;
801 Node *target = getNodeInGraph(graph, readOccurence->nodeID);
802
803 if (target == getTwinNode(node) || target == node)
804 return;
805
806 if (getUniqueness(target) && getNodeID(target) < getNodeID(node))
807 return;
808
809 if (position < 0) {
810 variance += getNodeLength(node) * getNodeLength(node) / 16;
811 // distance += 0;
812 } else {
813 // variance += 0;
814 distance += position - offset - getNodeLength(node) / 2;
815 }
816
817 if (readOccurence->position < 0) {
818 variance +=
819 getNodeLength(target) * getNodeLength(target) / 16;
820 //distance += 0;
821 } else {
822 // variance += 0;
823 distance +=
824 readOccurence->position - readOccurence->offset -
825 getNodeLength(target) / 2;
826 }
827
828 if (distance - getNodeLength(node)/2 - getNodeLength(target)/2 < -6 * sqrt(insertVariance))
829 return;
830 else if (distance < getNodeLength(node)/2 + getNodeLength(target)/2)
831 distance = getNodeLength(node)/2 + getNodeLength(target)/2;
832
833 createConnection(getNodeID(node), getNodeID(target), 0, 1,
834 distance, variance);
835 }
836
projectFromShortRead(Node * node,ShortReadMarker * shortMarker,IDnum * readPairs,Category * cats,ReadOccurence ** readNodes,IDnum * readNodeCounts,Coordinate * lengths)837 static void projectFromShortRead(Node * node,
838 ShortReadMarker * shortMarker,
839 IDnum * readPairs, Category * cats,
840 ReadOccurence ** readNodes,
841 IDnum * readNodeCounts,
842 Coordinate * lengths)
843 {
844 IDnum index;
845 IDnum readIndex = getShortReadMarkerID(shortMarker);
846 ReadOccurence *readArray;
847 IDnum readPairIndex;
848 Category cat;
849 Coordinate position = getShortReadMarkerPosition(shortMarker);
850 Coordinate offset = getShortReadMarkerOffset(shortMarker);
851 Coordinate length = lengths[getShortReadMarkerID(shortMarker) - 1];
852 Coordinate insertLength;
853 double insertVariance;
854
855 // Going through single-read information
856 if (readNodeCounts[readIndex] > 1) {
857 readArray = readNodes[readIndex];
858 for (index = 0; index < readNodeCounts[readIndex]; index++)
859 projectFromSingleRead(node, &readArray[index],
860 position, offset, length);
861 }
862 // Going through paired read information
863 if (readPairs == NULL)
864 return;
865
866 readPairIndex = readPairs[readIndex - 1] + 1;
867
868 if (readPairIndex == 0)
869 return;
870
871 cat = cats[readIndex - 1];
872 insertLength = getInsertLength(graph, cat);
873 insertVariance = getInsertLength_var(graph, cat);
874
875 readArray = readNodes[readPairIndex];
876 for (index = 0; index < readNodeCounts[readPairIndex]; index++)
877 projectFromReadPair(node, &readArray[index], position,
878 offset, insertLength, insertVariance);
879
880 }
881
projectFromLongRead(Node * node,PassageMarker * marker,IDnum * readPairs,Category * cats,ReadOccurence ** readNodes,IDnum * readNodeCounts,Coordinate * lengths)882 static void projectFromLongRead(Node * node, PassageMarker * marker,
883 IDnum * readPairs, Category * cats,
884 ReadOccurence ** readNodes,
885 IDnum * readNodeCounts,
886 Coordinate * lengths)
887 {
888 IDnum index;
889 IDnum readIndex = getPassageMarkerSequenceID(marker);
890 ReadOccurence *readArray;
891 IDnum readPairIndex;
892 Category cat;
893 Coordinate position = getStartOffset(marker);
894 Coordinate offset = getPassageMarkerStart(marker);
895 Coordinate length =
896 lengths[getPassageMarkerSequenceID(marker) - 1];
897 Coordinate insertLength;
898 double insertVariance;
899
900 // Going through single-read information
901 if (readNodeCounts[readIndex] > 1 && position > 0) {
902 readArray = readNodes[readIndex];
903 for (index = 0; index < readNodeCounts[readIndex]; index++)
904 projectFromSingleRead(node, &readArray[index],
905 position, offset, length);
906 }
907 // Going through paired read information
908 if (readPairs == NULL)
909 return;
910
911 readPairIndex = readPairs[readIndex - 1] + 1;
912
913 if (readPairIndex == 0)
914 return;
915
916 cat = cats[readIndex - 1];
917 insertLength = getInsertLength(graph, cat);
918 insertVariance = getInsertLength_var(graph, cat);
919
920 readArray = readNodes[readPairIndex];
921 for (index = 0; index < readNodeCounts[readPairIndex]; index++)
922 projectFromReadPair(node, &readArray[index], position,
923 offset, insertLength, insertVariance);
924
925 }
926
projectFromNode(IDnum nodeID,ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats,boolean * dubious,Coordinate * lengths)927 static void projectFromNode(IDnum nodeID,
928 ReadOccurence ** readNodes,
929 IDnum * readNodeCounts,
930 IDnum * readPairs, Category * cats,
931 boolean * dubious, Coordinate * lengths)
932 {
933 IDnum index;
934 ShortReadMarker *nodeArray, *shortMarker;
935 PassageMarker *marker;
936 Node *node;
937 IDnum nodeReadCount;
938
939 node = getNodeInGraph(graph, nodeID);
940
941 if (node == NULL || !getUniqueness(node))
942 return;
943
944 nodeArray = getNodeReads(node, graph);
945 nodeReadCount = getNodeReadCount(node, graph);
946 for (index = 0; index < nodeReadCount; index++) {
947 shortMarker = getShortReadMarkerAtIndex(nodeArray, index);
948 if (dubious[getShortReadMarkerID(shortMarker) - 1])
949 continue;
950 projectFromShortRead(node, shortMarker, readPairs, cats,
951 readNodes, readNodeCounts, lengths);
952 }
953
954 for (marker = getMarker(node); marker != NULL;
955 marker = getNextInNode(marker)) {
956 if (getPassageMarkerSequenceID(marker) > 0)
957 projectFromLongRead(node, marker, readPairs, cats,
958 readNodes, readNodeCounts,
959 lengths);
960 }
961 }
962
computeNodeToNodeMappings(ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats,boolean * dubious,Coordinate * lengths)963 static Connection **computeNodeToNodeMappings(ReadOccurence ** readNodes,
964 IDnum * readNodeCounts,
965 IDnum * readPairs,
966 Category * cats,
967 boolean * dubious,
968 Coordinate * lengths)
969 {
970 IDnum nodeID;
971 IDnum nodes = nodeCount(graph);
972 scaffold = callocOrExit(2 * nodes + 1, Connection *);
973
974 // Original
975 /*
976 puts("Computing direct node to node mappings");
977 */
978 // Original
979
980 for (nodeID = -nodes; nodeID <= nodes; nodeID++) {
981 // Original
982 /*
983 if (nodeID % 10000 == 0)
984 printf("Scaffolding node %d\n", nodeID);
985 */
986 // Original
987
988 projectFromNode(nodeID, readNodes, readNodeCounts,
989 readPairs, cats, dubious, lengths);
990 }
991
992 return scaffold;
993 }
994
countShortReads(Graph * graph,ReadSet * reads)995 static IDnum **countShortReads(Graph * graph, ReadSet * reads)
996 {
997 IDnum **counts = callocOrExit(CATEGORIES + 1, IDnum *);
998 Category cat;
999 IDnum nodeIndex;
1000 IDnum nodes = nodeCount(graph);
1001 Node *node;
1002 ShortReadMarker *array, *marker;
1003 IDnum readCount, readIndex, readID;
1004
1005 // Allocate memory where needed
1006 for (cat = 0; cat <= CATEGORIES; cat++)
1007 if (getInsertLength(graph, cat) > 0)
1008 counts[cat] =
1009 callocOrExit(2 * nodeCount(graph) + 1,
1010 IDnum);
1011
1012 // Start fillin'
1013 for (nodeIndex = 0; nodeIndex < 2 * nodes + 1; nodeIndex++) {
1014 node = getNodeInGraph(graph, nodeIndex - nodes);
1015
1016 if (node == NULL || !getUniqueness(node))
1017 continue;
1018
1019 array = getNodeReads(node, graph);
1020 readCount = getNodeReadCount(node, graph);
1021 for (readIndex = 0; readIndex < readCount; readIndex++) {
1022 marker =
1023 getShortReadMarkerAtIndex(array, readIndex);
1024 readID = getShortReadMarkerID(marker);
1025 cat = reads->categories[readID - 1];
1026 if (cat % 2 == 1 && counts[cat / 2] != NULL)
1027 counts[cat / 2][nodeIndex]++;
1028 }
1029 }
1030
1031 return counts;
1032 }
1033
printConnections(ReadSet * reads)1034 void printConnections(ReadSet * reads)
1035 {
1036 IDnum maxNodeIndex = nodeCount(graph) * 2 + 1;
1037 IDnum index;
1038 Connection *connect, *next;
1039 Node *node;
1040 IDnum **counts = countShortReads(graph, reads);
1041 IDnum nodes = nodeCount(graph);
1042
1043 puts("CONNECT IDA IDB dcount pcount dist lengthA lengthB var countA countB coordA coordB real exp distance test");
1044
1045 for (index = 0; index < maxNodeIndex; index++) {
1046 node = getNodeInGraph(graph, index - nodeCount(graph));
1047 for (connect = scaffold[index]; connect != NULL;
1048 connect = next) {
1049 next = connect->next;
1050 if (getUniqueness(connect->destination)) {
1051 printf
1052 ("CONNECT %ld %ld %ld %ld %lld %lld %lld %f %ld %ld",
1053 (long) index - nodeCount(graph),
1054 (long) getNodeID(connect->destination),
1055 (long) connect->direct_count,
1056 (long) connect->paired_count,
1057 (long long) getConnectionDistance(connect),
1058 (long long) getNodeLength(node),
1059 (long long) getNodeLength(connect->destination),
1060 connect->variance,
1061 (long) getNodeReadCount(node, graph),
1062 (long) getNodeReadCount(connect->destination,
1063 graph));
1064 if (markerCount(node) == 1
1065 && markerCount(connect->destination) ==
1066 1)
1067 printf(" %lld %lld %lld",
1068 (long long) getPassageMarkerFinish
1069 (getMarker(node)),
1070 (long long) getPassageMarkerFinish
1071 (getMarker
1072 (connect->destination)),
1073 (long long) (getPassageMarkerFinish
1074 (getMarker(node)) -
1075 getPassageMarkerFinish
1076 (getMarker
1077 (connect->destination))));
1078 else
1079 printf(" ? ?");
1080 printf(" %ld", (long) expectedNumberOfConnections(index-nodeCount(graph), connect, counts, 0));
1081 printf(" %lld", (long long) (getConnectionDistance(connect) - (getNodeLength(node) + getNodeLength(connect->destination))/2));
1082 if (testConnection
1083 (index - nodes, connect, counts))
1084 puts(" OK");
1085 else
1086 puts(" NG");
1087 }
1088 }
1089 }
1090 }
1091
removeUnreliableConnections(ReadSet * reads)1092 static void removeUnreliableConnections(ReadSet * reads)
1093 {
1094 IDnum maxNodeIndex = nodeCount(graph) * 2 + 1;
1095 IDnum index;
1096 Connection *connect, *next;
1097 Category cat;
1098 IDnum **counts = countShortReads(graph, reads);
1099 IDnum nodes = nodeCount(graph);
1100
1101 for (index = 0; index < maxNodeIndex; index++) {
1102 for (connect = scaffold[index]; connect != NULL;
1103 connect = next) {
1104 next = connect->next;
1105 if (!testConnection
1106 (index - nodes, connect, counts))
1107 destroyConnection(connect, index - nodes);
1108 }
1109 }
1110
1111 // Free memory
1112 for (cat = 0; cat <= CATEGORIES; cat++)
1113 if (counts[cat])
1114 free(counts[cat]);
1115 free(counts);
1116 }
1117
buildScaffold(Graph * argGraph,ReadSet * reads,boolean * dubious)1118 void buildScaffold(Graph * argGraph, ReadSet * reads, boolean * dubious) {
1119 IDnum *readPairs;
1120 Category *cats;
1121 IDnum *readNodeCounts;
1122 ReadOccurence **readNodes;
1123 Coordinate *lengths =
1124 getSequenceLengths(reads, getWordLength(argGraph));
1125 IDnum index;
1126
1127 graph = argGraph;
1128 readPairs = reads->mateReads;
1129 cats = reads->categories;
1130
1131 // Prepare primary scaffold
1132 readNodeCounts = computeReadToNodeCounts();
1133 readNodes = computeReadToNodeMappings(readNodeCounts);
1134
1135 estimateMissingInsertLengths(readNodes, readNodeCounts, readPairs, cats);
1136
1137 scaffold = computeNodeToNodeMappings(readNodes, readNodeCounts,
1138 readPairs, cats, dubious, lengths);
1139 removeUnreliableConnections(reads);
1140
1141 // Clean up memory
1142 for (index = 1; index <= sequenceCount(graph); index++)
1143 free(readNodes[index]);
1144
1145 free(readNodes);
1146 free(readNodeCounts);
1147 free(lengths);
1148 }
1149
setUnreliableConnectionCutoff(int val)1150 void setUnreliableConnectionCutoff(int val)
1151 {
1152 UNRELIABLE_CONNECTION_CUTOFF = (IDnum) val;
1153 }
1154
cleanScaffoldMemory()1155 void cleanScaffoldMemory() {
1156 Category libID;
1157
1158 for (libID = 0; libID < CATEGORIES + 1; libID++)
1159 if (estimated[libID])
1160 setInsertLengths(graph, libID, -1, -1);
1161
1162 destroyRecycleBin(connectionMemory);
1163 free(scaffold);
1164 connectionMemory = NULL;
1165 }
1166