1 /*
2 Copyright 2007, 2008 Daniel Zerbino (zerbino@ebi.ac.uk)
3 
4     This file is part of Velvet.
5 
6     Velvet is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10 
11     Velvet is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with Velvet; if not, write to the Free Software
18     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 #include <stdlib.h>
22 #include <stdio.h>
23 #include <time.h>
24 #include <math.h>
25 
26 #include "globals.h"
27 #include "graph.h"
28 #include "concatenatedGraph.h"
29 #include "recycleBin.h"
30 #include "locallyCorrectedGraph.h"
31 #include "passageMarker.h"
32 #include "readSet.h"
33 #include "utility.h"
34 #include "scaffold.h"
35 
36 #define BLOCK_SIZE  100000
37 #define LN2 1.4
38 
39 typedef struct readOccurence_st ReadOccurence;
40 
41 struct connection_st {
42 	Node *destination;
43 	Connection *next;
44 	Connection *previous;
45 	Connection *twin;
46 	double distance;
47 	double variance;
48 	IDnum direct_count;
49 	IDnum paired_count;
50 };
51 
52 struct readOccurence_st {
53 	Coordinate position;
54 	Coordinate offset;
55 	IDnum nodeID;
56 };
57 
58 // Global params
59 static IDnum UNRELIABLE_CONNECTION_CUTOFF = 5;
60 
61 // Global pointers
62 static Graph *graph;
63 static Connection **scaffold = NULL;
64 static RecycleBin *connectionMemory = NULL;
65 static boolean estimated[CATEGORIES + 1];
66 
allocateConnection()67 static Connection *allocateConnection()
68 {
69 	if (connectionMemory == NULL)
70 		connectionMemory =
71 		    newRecycleBin(sizeof(Connection), BLOCK_SIZE);
72 
73 	return allocatePointer(connectionMemory);
74 }
75 
deallocateConnection(Connection * connect)76 static void deallocateConnection(Connection * connect)
77 {
78 	deallocatePointer(connectionMemory, connect);
79 }
80 
getConnectionDestination(Connection * connect)81 Node * getConnectionDestination(Connection * connect) {
82 	return connect->destination;
83 }
84 
getNextConnection(Connection * connect)85 Connection * getNextConnection(Connection * connect) {
86 	return connect->next;
87 }
88 
getTwinConnection(Connection * connect)89 Connection * getTwinConnection(Connection * connect) {
90 	return connect->twin;
91 }
92 
getConnectionDistance(Connection * connect)93 Coordinate getConnectionDistance(Connection * connect) {
94 	return (Coordinate) connect->distance;
95 }
96 
getConnectionVariance(Connection * connect)97 double getConnectionVariance(Connection * connect) {
98 	return connect->variance;
99 }
100 
getConnectionDirectCount(Connection * connect)101 IDnum getConnectionDirectCount(Connection * connect) {
102 	return connect->direct_count;
103 }
104 
getConnectionPairedCount(Connection * connect)105 IDnum getConnectionPairedCount(Connection * connect) {
106 	return connect->paired_count;
107 }
108 
getConnection(Node * node)109 Connection * getConnection(Node * node) {
110 	return scaffold[getNodeID(node) + nodeCount(graph)];
111 }
112 
incrementConnectionDistance(Connection * connect,Coordinate increment)113 void incrementConnectionDistance(Connection * connect, Coordinate increment) {
114 	connect->distance += increment;
115 }
116 
norm(double X)117 static double norm(double X)
118 {
119 	return 0.4 * exp(-X * X / 2);
120 }
121 
normInt(double X,double Y)122 static double normInt(double X, double Y)
123 {
124 	return (erf(0.7 * Y) - erf(0.7 * X)) / 2;
125 }
126 
expectedNumberOfConnections(IDnum IDA,Connection * connect,IDnum ** counts,Category cat)127 static IDnum expectedNumberOfConnections(IDnum IDA, Connection * connect,
128 					 IDnum ** counts, Category cat)
129 {
130 	Node *A = getNodeInGraph(graph, IDA);
131 	Node *B = connect->destination;
132 	double left, middle, right;
133 	Coordinate longLength, shortLength, D;
134 	IDnum longCount;
135 	double M, N, O, P;
136 	Coordinate mu = getInsertLength(graph, cat);
137 	double sigma = sqrt(getInsertLength_var(graph, cat));
138 	double result;
139 
140 	if (mu <= 0)
141 		return 0;
142 
143 	if (getNodeLength(A) < getNodeLength(B)) {
144 		longLength = getNodeLength(B);
145 		shortLength = getNodeLength(A);
146 		longCount = counts[cat][getNodeID(B) + nodeCount(graph)];
147 	} else {
148 		longLength = getNodeLength(A);
149 		shortLength = getNodeLength(B);
150 		longCount = counts[cat][IDA + nodeCount(graph)];
151 	}
152 
153 	D = getConnectionDistance(connect) - (longLength + shortLength) / 2;
154 
155 	M = (D - mu) / sigma;
156 	N = (D + shortLength - mu) / sigma;
157 	O = (D + longLength - mu) / sigma;
158 	P = (D + shortLength + longLength - mu) / sigma;
159 
160 	left = ((norm(M) - norm(N)) - M * normInt(M, N)) * sigma;
161 	middle = shortLength * normInt(N, O);
162 	right = ((norm(O) - norm(P)) - P * normInt(O, P)) * (-sigma);
163 
164 	result = (longCount * (left + middle + right)) / longLength;
165 
166 	if (result > 0)
167 		return (IDnum) result;
168 	else
169 		return 0;
170 }
171 
destroyConnection(Connection * connect,IDnum nodeID)172 void destroyConnection(Connection * connect, IDnum nodeID)
173 {
174 	Connection *previous, *next;
175 
176 	//printf("Destroying connection from %li to %li\n", nodeID, getNodeID(connect->destination));
177 
178 	if (connect == NULL)
179 		return;
180 
181 	previous = connect->previous;
182 	next = connect->next;
183 
184 	if (previous != NULL)
185 		previous->next = next;
186 	if (next != NULL)
187 		next->previous = previous;
188 
189 	if (scaffold[nodeID + nodeCount(graph)] == connect)
190 		scaffold[nodeID + nodeCount(graph)] = next;
191 
192 	if (connect->twin != NULL) {
193 		connect->twin->twin = NULL;
194 		destroyConnection(connect->twin,
195 				  getNodeID(connect->destination));
196 	}
197 
198 	deallocateConnection(connect);
199 }
200 
testConnection(IDnum IDA,Connection * connect,IDnum ** counts)201 static boolean testConnection(IDnum IDA, Connection * connect,
202 			      IDnum ** counts)
203 {
204 	IDnum total = 0;
205 	Category cat;
206 
207 	// Spare unique -> undetermined node connections
208 	if (!getUniqueness(connect->destination))
209 		return true;
210 
211 	// Destroy tenuous connections
212 	if (connect->paired_count + connect->direct_count <
213 	    UNRELIABLE_CONNECTION_CUTOFF)
214 		return false;
215 
216 	for (cat = 0; cat <= CATEGORIES; cat++)
217 		total +=
218 		    expectedNumberOfConnections(IDA, connect, counts, cat);
219 
220 	// Remove inconsistent connections
221 	return connect->paired_count >= total / 10;
222 }
223 
computeReadToNodeCounts()224 static IDnum *computeReadToNodeCounts()
225 {
226 	IDnum readIndex, nodeIndex;
227 	IDnum maxNodeIndex = 2 * nodeCount(graph) + 1;
228 	IDnum maxReadIndex = sequenceCount(graph) + 1;
229 	IDnum *readNodeCounts = callocOrExit(maxReadIndex, IDnum);
230 	boolean *readMarker = callocOrExit(maxReadIndex, boolean);
231 	ShortReadMarker *nodeArray, *shortMarker;
232 	PassageMarker *marker;
233 	Node *node;
234 	IDnum nodeReadCount;
235 
236 	// Original
237 	/*
238 	puts("Computing read to node mapping array sizes");
239 	*/
240 	// Original
241 
242 	for (nodeIndex = 0; nodeIndex < maxNodeIndex; nodeIndex++) {
243 		node = getNodeInGraph(graph, nodeIndex - nodeCount(graph));
244 		if (node == NULL)
245 			continue;
246 		nodeArray = getNodeReads(node, graph);
247 		nodeReadCount = getNodeReadCount(node, graph);
248 
249 		// Short reads
250 		for (readIndex = 0; readIndex < nodeReadCount; readIndex++) {
251 			shortMarker =
252 			    getShortReadMarkerAtIndex(nodeArray,
253 						      readIndex);
254 			readNodeCounts[getShortReadMarkerID
255 				       (shortMarker)]++;
256 		}
257 
258 		// Long reads
259 		for (marker = getMarker(node); marker != NULL;
260 		     marker = getNextInNode(marker)) {
261 			readIndex = getPassageMarkerSequenceID(marker);
262 			if (readIndex < 0)
263 				continue;
264 
265 			if (readMarker[readIndex])
266 				continue;
267 
268 			readNodeCounts[readIndex]++;
269 			readMarker[readIndex] = true;
270 		}
271 
272 		// Clean up marker array
273 		for (marker = getMarker(node); marker != NULL;
274 		     marker = getNextInNode(marker)) {
275 			readIndex = getPassageMarkerSequenceID(marker);
276 			if (readIndex > 0)
277 				readMarker[readIndex] = false;
278 		}
279 	}
280 
281 	free(readMarker);
282 	return readNodeCounts;
283 }
284 
allocateReadToNodeTables(IDnum * readNodeCounts)285 static ReadOccurence **allocateReadToNodeTables(IDnum * readNodeCounts)
286 {
287 	IDnum readIndex;
288 	IDnum maxReadIndex = sequenceCount(graph) + 1;
289 	ReadOccurence **readNodes =
290 	    callocOrExit(maxReadIndex, ReadOccurence *);
291 
292 	for (readIndex = 1; readIndex < maxReadIndex; readIndex++) {
293 		if (readNodeCounts[readIndex] != 0) {
294 			readNodes[readIndex] =
295 			    callocOrExit(readNodeCounts[readIndex],
296 				   ReadOccurence);
297 			readNodeCounts[readIndex] = 0;
298 		}
299 	}
300 
301 	return readNodes;
302 }
303 
computePartialReadToNodeMapping(IDnum nodeID,ReadOccurence ** readNodes,IDnum * readNodeCounts,boolean * readMarker)304 static void computePartialReadToNodeMapping(IDnum nodeID,
305 					    ReadOccurence ** readNodes,
306 					    IDnum * readNodeCounts,
307 					    boolean * readMarker)
308 {
309 	ShortReadMarker *shortMarker;
310 	IDnum index, readIndex;
311 	ReadOccurence *readArray, *readOccurence;
312 	Node *node = getNodeInGraph(graph, nodeID);
313 	ShortReadMarker *nodeArray = getNodeReads(node, graph);
314 	IDnum nodeReadCount = getNodeReadCount(node, graph);
315 	PassageMarker *marker;
316 
317 	for (index = 0; index < nodeReadCount; index++) {
318 		shortMarker = getShortReadMarkerAtIndex(nodeArray, index);
319 		readIndex = getShortReadMarkerID(shortMarker);
320 		readArray = readNodes[readIndex];
321 		readOccurence = &readArray[readNodeCounts[readIndex]];
322 		readOccurence->nodeID = nodeID;
323 		readOccurence->position =
324 		    getShortReadMarkerPosition(shortMarker);
325 		readOccurence->offset =
326 		    getShortReadMarkerOffset(shortMarker);
327 		readNodeCounts[readIndex]++;
328 	}
329 
330 	for (marker = getMarker(node); marker != NULL;
331 	     marker = getNextInNode(marker)) {
332 		readIndex = getPassageMarkerSequenceID(marker);
333 		if (readIndex < 0)
334 			continue;
335 
336 		if (!readMarker[readIndex]) {
337 			readArray = readNodes[readIndex];
338 			readOccurence =
339 			    &readArray[readNodeCounts[readIndex]];
340 			readOccurence->nodeID = nodeID;
341 			readOccurence->position = getStartOffset(marker);
342 			readOccurence->offset =
343 			    getPassageMarkerStart(marker);
344 			readNodeCounts[readIndex]++;
345 			readMarker[readIndex] = true;
346 		} else {
347 			readArray = readNodes[readIndex];
348 			readOccurence =
349 			    &readArray[readNodeCounts[readIndex] - 1];
350 			readOccurence->position = -1;
351 			readOccurence->offset = -1;
352 		}
353 	}
354 
355 	for (marker = getMarker(node); marker != NULL;
356 	     marker = getNextInNode(marker)) {
357 		readIndex = getPassageMarkerSequenceID(marker);
358 		if (readIndex > 0)
359 			readMarker[readIndex] = false;
360 	}
361 }
362 
computeReadToNodeMappings(IDnum * readNodeCounts)363 static ReadOccurence **computeReadToNodeMappings(IDnum * readNodeCounts)
364 {
365 	IDnum nodeID;
366 	IDnum nodes = nodeCount(graph);
367 	ReadOccurence **readNodes =
368 	    allocateReadToNodeTables(readNodeCounts);
369 	boolean *readMarker =
370 	    callocOrExit(sequenceCount(graph) + 1, boolean);
371 
372 	// Original
373 	/*
374 	puts("Computing read to node mappings");
375 	*/
376 	// Original
377 
378 	for (nodeID = -nodes; nodeID <= nodes; nodeID++)
379 		if (nodeID != 0 && getNodeInGraph(graph, nodeID))
380 			computePartialReadToNodeMapping(nodeID, readNodes,
381 							readNodeCounts,
382 							readMarker);
383 
384 	free(readMarker);
385 	return readNodes;
386 }
387 
countCoOccurences(IDnum * coOccurencesCount,ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats)388 static boolean * countCoOccurences(IDnum * coOccurencesCount, ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) {
389 	IDnum readIndex, readPairIndex;
390 	IDnum readNodeCount;
391 	IDnum readOccurenceIndex, readPairOccurenceIndex;
392 	ReadOccurence * readOccurence, *readPairOccurence;
393 	boolean * interestingReads = callocOrExit(sequenceCount(graph), boolean);
394 	Category libID;
395 
396 	for (libID = 0; libID < CATEGORIES + 1; libID++)
397 		coOccurencesCount[libID] = 0;
398 
399 	for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) {
400 		// Eliminating dodgy, unpaired, already counted or user-specified reads
401 		if ( readPairs[readIndex] < readIndex
402 		    || getInsertLength(graph, cats[readIndex]) > -1)
403 			continue;
404 
405 		// Check for co-occurence
406 		// We know that for each read the read occurences are ordered by increasing node ID
407 		// Therefore one list is followed by increasing index, whereas the other is followed
408 		// by decreasing index
409 		libID = cats[readIndex]/2;
410 		readPairIndex = readPairs[readIndex];
411 
412 		readOccurenceIndex = 0;
413 		readOccurence = readNodes[readIndex + 1];
414 		readNodeCount = readNodeCounts[readIndex + 1];
415 
416 		readPairOccurenceIndex = readNodeCounts[readPairIndex + 1] - 1;
417 		readPairOccurence = &(readNodes[readPairIndex + 1][readPairOccurenceIndex]);
418 
419 		while (readOccurenceIndex < readNodeCount && readPairOccurenceIndex >= 0) {
420 			if (readOccurence->nodeID == -readPairOccurence->nodeID) {
421 				if (readOccurence->position > 0 && readPairOccurence->position > 0) {
422 					coOccurencesCount[libID]++;
423 					interestingReads[readIndex] = true;
424 					break;
425 				} else {
426 					readOccurence++;
427 					readOccurenceIndex++;
428 					readPairOccurence--;
429 					readPairOccurenceIndex--;
430 				}
431 			} else if (readOccurence->nodeID < -readPairOccurence->nodeID) {
432 				readOccurence++;
433 				readOccurenceIndex++;
434 			} else {
435 				readPairOccurence--;
436 				readPairOccurenceIndex--;
437 			}
438 		}
439 	}
440 
441 	return interestingReads;
442 }
443 
measureCoOccurences(Coordinate ** coOccurences,boolean * interestingReads,ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats)444 static void measureCoOccurences(Coordinate ** coOccurences, boolean * interestingReads, ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) {
445 	IDnum coOccurencesIndex[CATEGORIES + 1];
446 	IDnum observationIndex;
447 	IDnum readIndex, readPairIndex;
448 	IDnum readNodeCount;
449 	IDnum readOccurenceIndex, readPairOccurenceIndex;
450 	ReadOccurence * readOccurence, *readPairOccurence;
451 	Category libID;
452 
453 	for (libID = 0; libID < CATEGORIES + 1; libID++)
454 		coOccurencesIndex[libID] = 0;
455 
456 	for (readIndex = 0; readIndex < sequenceCount(graph); readIndex++) {
457 		// Eliminating dodgy, unpaired, already counted or user-specified reads
458 		if (!interestingReads[readIndex])
459 			continue;
460 
461 		// Find co-occurence
462 		// We know that for each read the read occurences are ordered by increasing node ID
463 		libID = cats[readIndex]/2;
464 		readPairIndex = readPairs[readIndex];
465 		observationIndex = coOccurencesIndex[libID];
466 
467 		readOccurence = readNodes[readIndex + 1];
468 		readOccurenceIndex = 0;
469 		readNodeCount = readNodeCounts[readIndex + 1];
470 
471 		readPairOccurenceIndex = readNodeCounts[readPairIndex + 1] - 1;
472 		readPairOccurence = &(readNodes[readPairIndex + 1][readPairOccurenceIndex]);
473 
474 		while (readOccurenceIndex < readNodeCount && readPairOccurenceIndex >= 0) {
475 			if (readOccurence->nodeID == -readPairOccurence->nodeID) {
476 				if (readOccurence->position > 0 && readPairOccurence->position > 0) {
477 					coOccurences[libID][observationIndex] =
478 					      getNodeLength(getNodeInGraph(graph, readOccurence->nodeID))
479 					      + getWordLength(graph) - 1
480 					      - (readOccurence->position - readOccurence->offset)
481 					      - (readPairOccurence->position - readPairOccurence->offset);
482 					coOccurencesIndex[libID]++;
483 					break;
484 				} else {
485 					readOccurence++;
486 					readOccurenceIndex++;
487 					readPairOccurence--;
488 					readPairOccurenceIndex--;
489 				}
490 			} else if (readOccurence->nodeID < -readPairOccurence->nodeID) {
491 				readOccurence++;
492 				readOccurenceIndex++;
493 			} else {
494 				readPairOccurence--;
495 				readPairOccurenceIndex--;
496 			}
497 		}
498 	}
499 }
500 
compareReadOccurences(const void * A,const void * B)501 int compareReadOccurences(const void *A, const void * B) {
502 	Coordinate * cA = (Coordinate *) A;
503 	Coordinate * cB = (Coordinate *) B;
504 
505 	if (*cA > *cB)
506 		return 1;
507 	if (*cA == *cB)
508 		return 0;
509 	return -1;
510 }
511 
estimateLibraryInsertLength(Coordinate * coOccurences,IDnum coOccurencesCount,Category libID)512 static void estimateLibraryInsertLength(Coordinate * coOccurences, IDnum coOccurencesCount, Category libID) {
513 	Coordinate median, variance;
514 	IDnum index;
515 	int counter = 0;
516 	qsort(coOccurences, coOccurencesCount, sizeof(Coordinate), compareReadOccurences);
517 
518 	median = coOccurences[coOccurencesCount / 2];
519 
520 	// Modified variance around the median (proxy for expected value)
521 	// interval censoring
522 	variance = 0;
523 	for (index = 0; index < coOccurencesCount; index++) {
524 		if (coOccurences[index] > 0 && coOccurences[index] < 5 * median) {
525 			variance += (coOccurences[index] - median) * (coOccurences[index] - median);
526 			counter++;
527 		}
528 	}
529 	if (counter)
530 		variance /= counter;
531 	else {
532 		variance = 0;
533 		for (index = 0; index < coOccurencesCount; index++)
534 			variance += (coOccurences[index] - median) * (coOccurences[index] - median);
535 		variance /= coOccurencesCount;
536 	}
537 
538 	// To avoid subsequent divisions by zero
539 	if (variance == 0)
540 		variance = 1;
541 
542 	printf("Paired-end library %i has length: %lli, sample standard deviation: %lli\n", libID + 1, (long long) median, (long long) sqrt(variance));
543 	setInsertLengths(graph, libID, median, sqrt(variance));
544 	estimated[libID] = true;
545 }
546 
estimateLibraryInsertLengths(Coordinate ** coOccurences,IDnum * coOccurencesCounts)547 static void estimateLibraryInsertLengths(Coordinate ** coOccurences, IDnum * coOccurencesCounts) {
548 	Category libID;
549 
550 	for (libID = 0; libID < CATEGORIES + 1; libID++)
551 		estimated[libID] = false;
552 
553 	for (libID = 0; libID < CATEGORIES + 1; libID++)
554 		if (coOccurencesCounts[libID] > 0)
555 			estimateLibraryInsertLength(coOccurences[libID], coOccurencesCounts[libID], libID);
556 }
557 
estimateMissingInsertLengths(ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats)558 static void estimateMissingInsertLengths(ReadOccurence ** readNodes, IDnum * readNodeCounts, IDnum * readPairs, Category * cats) {
559 	Coordinate * coOccurences[CATEGORIES + 1];
560 	IDnum coOccurencesCounts[CATEGORIES + 1];
561 	Category libID;
562 
563 	// Original
564 	/*
565 	puts("Estimating library insert lengths...");
566 	*/
567 	// Original
568 
569 	boolean * interestingReads = countCoOccurences(coOccurencesCounts, readNodes, readNodeCounts, readPairs, cats);
570 
571 	for (libID = 0; libID < CATEGORIES + 1; libID++)
572 		coOccurences[libID] = callocOrExit(coOccurencesCounts[libID], Coordinate);
573 
574 	measureCoOccurences(coOccurences, interestingReads, readNodes, readNodeCounts, readPairs, cats);
575 	estimateLibraryInsertLengths(coOccurences, coOccurencesCounts);
576 
577 	for (libID = 0; libID < CATEGORIES + 1; libID++)
578 		free(coOccurences[libID]);
579 
580 	free(interestingReads);
581 
582 	// Original
583 	/*
584 	puts("Done");
585 	*/
586 	// Original
587 }
588 
findConnection(IDnum nodeID,IDnum node2ID)589 static Connection *findConnection(IDnum nodeID, IDnum node2ID)
590 {
591 	Node *node2 = getNodeInGraph(graph, node2ID);
592 	Connection *connect;
593 
594 	if (node2 == NULL)
595 		return NULL;
596 
597 	for (connect = scaffold[nodeID + nodeCount(graph)];
598 	     connect != NULL; connect = connect->next)
599 		if (connect->destination == node2)
600 			break;
601 
602 	return connect;
603 }
604 
createTwinConnection(IDnum nodeID,IDnum node2ID,Connection * connect)605 static void createTwinConnection(IDnum nodeID, IDnum node2ID,
606 				 Connection * connect)
607 {
608 	Connection *newConnection = allocateConnection();
609 	IDnum nodeIndex = nodeID + nodeCount(graph);
610 
611 	// Fill in
612 	newConnection->distance = connect->distance;
613 	newConnection->variance = connect->variance;
614 	newConnection->direct_count = connect->direct_count;
615 	newConnection->paired_count = connect->paired_count;
616 	newConnection->destination = getNodeInGraph(graph, node2ID);
617 
618 	// Batch to twin
619 	newConnection->twin = connect;
620 	connect->twin = newConnection;
621 
622 	// Insert in scaffold
623 	newConnection->previous = NULL;
624 	newConnection->next = scaffold[nodeIndex];
625 	if (scaffold[nodeIndex] != NULL)
626 		scaffold[nodeIndex]->previous = newConnection;
627 	scaffold[nodeIndex] = newConnection;
628 }
629 
createNewConnection(IDnum nodeID,IDnum node2ID,IDnum direct_count,IDnum paired_count,Coordinate distance,double variance)630 Connection *createNewConnection(IDnum nodeID, IDnum node2ID,
631 				       IDnum direct_count,
632 				       IDnum paired_count,
633 				       Coordinate distance,
634 				       double variance)
635 {
636 	Node *destination = getNodeInGraph(graph, node2ID);
637 	IDnum nodeIndex = nodeID + nodeCount(graph);
638 	Connection *connect = allocateConnection();
639 
640 	// Fill in
641 	connect->destination = destination;
642 	connect->direct_count = direct_count;
643 	connect->paired_count = paired_count;
644 	connect->distance = (double) distance;
645 	connect->variance = variance;
646 
647 	// Insert in scaffold
648 	connect->previous = NULL;
649 	connect->next = scaffold[nodeIndex];
650 	if (scaffold[nodeIndex] != NULL)
651 		scaffold[nodeIndex]->previous = connect;
652 	scaffold[nodeIndex] = connect;
653 
654 	// Event. pair up to twin
655 	if (getUniqueness(destination))
656 		createTwinConnection(node2ID, nodeID, connect);
657 	else
658 		connect->twin = NULL;
659 
660 	return connect;
661 }
662 
readjustConnection(Connection * connect,Coordinate distance,double variance,IDnum direct_count,IDnum paired_count)663 void readjustConnection(Connection * connect, Coordinate distance,
664 			       double variance, IDnum direct_count,
665 			       IDnum paired_count)
666 {
667 	connect->direct_count += direct_count;
668 	connect->paired_count += paired_count;
669 
670 	connect->distance =
671 	    (variance * connect->distance +
672 	     distance * connect->variance) / (variance +
673 					      connect->variance);
674 	connect->variance =
675 	    (variance *
676 	     connect->variance) / (variance + connect->variance);
677 
678 	if (connect->twin != NULL) {
679 		connect->twin->distance = connect->distance;
680 		connect->twin->variance = connect->variance;
681 		connect->twin->direct_count = connect->direct_count;
682 		connect->twin->paired_count = connect->paired_count;
683 	}
684 }
685 
createConnection(IDnum nodeID,IDnum node2ID,IDnum direct_count,IDnum paired_count,Coordinate distance,double variance)686 static void createConnection(IDnum nodeID, IDnum node2ID,
687 			     IDnum direct_count,
688 			     IDnum paired_count,
689 			     Coordinate distance, double variance)
690 {
691 	Connection *connect = findConnection(nodeID, node2ID);
692 
693 	if (connect != NULL)
694 		readjustConnection(connect, distance, variance,
695 				   direct_count, paired_count);
696 	else
697 		createNewConnection(nodeID, node2ID, direct_count,
698 				    paired_count, distance, variance);
699 }
700 
projectFromSingleRead(Node * node,ReadOccurence * readOccurence,Coordinate position,Coordinate offset,Coordinate length)701 static void projectFromSingleRead(Node * node,
702 				  ReadOccurence * readOccurence,
703 				  Coordinate position,
704 				  Coordinate offset, Coordinate length)
705 {
706 	Coordinate distance = 0;
707 	Node *target = getNodeInGraph(graph, -readOccurence->nodeID);
708 	double variance = 1;
709 
710 	if (target == getTwinNode(node) || target == node)
711 		return;
712 
713 	if (position < 0) {
714 		variance += getNodeLength(node) * getNodeLength(node) / 16;
715 		// distance += 0;
716 	} else {
717 		// variance += 0;
718 		distance += position - getNodeLength(node) / 2;
719 	}
720 
721 	if (readOccurence->position < 0) {
722 		variance +=
723 		    getNodeLength(target) * getNodeLength(target) / 16;
724 		//distance += 0;
725 	} else {
726 		// variance += 0;
727 		distance +=
728 		    -readOccurence->position + getNodeLength(target) / 2;
729 	}
730 
731 	if (readOccurence->offset < 0 || offset < 0) {
732 		variance += length * length / 16;
733 		//distance += 0;
734 	} else {
735 		// variance += 0;
736 		distance += readOccurence->offset - offset;
737 	}
738 
739 	// Relative ordering
740 	if (offset > 0 && readOccurence->offset > 0) {
741 		if (offset < readOccurence->offset) {
742 			if (distance - getNodeLength(node)/2 - getNodeLength(target)/2 < -10)
743 				;
744 			else if (distance < getNodeLength(node)/2 + getNodeLength(target)/2)
745 				createConnection(getNodeID(node), getNodeID(target), 1, 0,
746 						 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
747 			else
748 				createConnection(getNodeID(node), getNodeID(target), 1, 0,
749 						 distance, variance);
750 		} else if (offset > readOccurence->offset) {
751 			if (-distance - getNodeLength(node)/2 - getNodeLength(target)/2 < -10)
752 				;
753 			else if (-distance < getNodeLength(node)/2 + getNodeLength(target)/2)
754 				createConnection(-getNodeID(node), -getNodeID(target), 1,
755 						 0, getNodeLength(node)/2 + getNodeLength(target)/2 , variance);
756 			else
757 				createConnection(-getNodeID(node), -getNodeID(target), 1,
758 						 0, -distance, variance);
759 		}
760 	} else if (offset > 0 && position > 0) {
761 		if (distance - offset > -getNodeLength(node)/2 && distance - offset + length > getNodeLength(node)/2)
762 			createConnection(getNodeID(node), getNodeID(target), 1, 0,
763 					 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
764 		else if (distance - offset < -getNodeLength(node)/2 && distance - offset + length < getNodeLength(node)/2)
765 			createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
766 					 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
767 		else {
768 			createConnection(getNodeID(node), getNodeID(target), 1, 0,
769 					 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
770 			createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
771 					 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
772 		}
773 	} else if (readOccurence->offset > 0 && readOccurence->position > 0) {
774 		if (-distance - readOccurence->offset > -getNodeLength(target)/2 && -distance - readOccurence->offset + length > getNodeLength(target)/2)
775 			createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
776 					 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
777 		if (-distance - readOccurence->offset < -getNodeLength(target)/2 && -distance - readOccurence->offset + length < getNodeLength(target)/2)
778 			createConnection(getNodeID(node), getNodeID(target), 1, 0,
779 					 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
780 		else {
781 			createConnection(getNodeID(node), getNodeID(target), 1, 0,
782 					 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
783 			createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
784 					 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
785 		}
786 	} else {
787 		createConnection(getNodeID(node), getNodeID(target), 1, 0,
788 				 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
789 		createConnection(-getNodeID(node), -getNodeID(target), 1, 0,
790 				 getNodeLength(node)/2 + getNodeLength(target)/2, variance);
791 	}
792 }
793 
projectFromReadPair(Node * node,ReadOccurence * readOccurence,Coordinate position,Coordinate offset,Coordinate insertLength,double insertVariance)794 static void projectFromReadPair(Node * node, ReadOccurence * readOccurence,
795 				Coordinate position, Coordinate offset,
796 				Coordinate insertLength,
797 				double insertVariance)
798 {
799 	Coordinate distance = insertLength;
800 	Coordinate variance = insertVariance;
801 	Node *target = getNodeInGraph(graph, readOccurence->nodeID);
802 
803 	if (target == getTwinNode(node) || target == node)
804 		return;
805 
806 	if (getUniqueness(target) && getNodeID(target) < getNodeID(node))
807 		return;
808 
809 	if (position < 0) {
810 		variance += getNodeLength(node) * getNodeLength(node) / 16;
811 		// distance += 0;
812 	} else {
813 		// variance += 0;
814 		distance += position - offset - getNodeLength(node) / 2;
815 	}
816 
817 	if (readOccurence->position < 0) {
818 		variance +=
819 		    getNodeLength(target) * getNodeLength(target) / 16;
820 		//distance += 0;
821 	} else {
822 		// variance += 0;
823 		distance +=
824 		    readOccurence->position - readOccurence->offset -
825 		    getNodeLength(target) / 2;
826 	}
827 
828 	if (distance - getNodeLength(node)/2 - getNodeLength(target)/2 < -6 * sqrt(insertVariance))
829 		return;
830 	else if (distance < getNodeLength(node)/2 + getNodeLength(target)/2)
831 		distance = getNodeLength(node)/2 + getNodeLength(target)/2;
832 
833 	createConnection(getNodeID(node), getNodeID(target), 0, 1,
834 			 distance, variance);
835 }
836 
projectFromShortRead(Node * node,ShortReadMarker * shortMarker,IDnum * readPairs,Category * cats,ReadOccurence ** readNodes,IDnum * readNodeCounts,Coordinate * lengths)837 static void projectFromShortRead(Node * node,
838 				 ShortReadMarker * shortMarker,
839 				 IDnum * readPairs, Category * cats,
840 				 ReadOccurence ** readNodes,
841 				 IDnum * readNodeCounts,
842 				 Coordinate * lengths)
843 {
844 	IDnum index;
845 	IDnum readIndex = getShortReadMarkerID(shortMarker);
846 	ReadOccurence *readArray;
847 	IDnum readPairIndex;
848 	Category cat;
849 	Coordinate position = getShortReadMarkerPosition(shortMarker);
850 	Coordinate offset = getShortReadMarkerOffset(shortMarker);
851 	Coordinate length = lengths[getShortReadMarkerID(shortMarker) - 1];
852 	Coordinate insertLength;
853 	double insertVariance;
854 
855 	// Going through single-read information
856 	if (readNodeCounts[readIndex] > 1) {
857 		readArray = readNodes[readIndex];
858 		for (index = 0; index < readNodeCounts[readIndex]; index++)
859 			projectFromSingleRead(node, &readArray[index],
860 					      position, offset, length);
861 	}
862 	// Going through paired read information
863 	if (readPairs == NULL)
864 		return;
865 
866 	readPairIndex = readPairs[readIndex - 1] + 1;
867 
868 	if (readPairIndex == 0)
869 		return;
870 
871 	cat = cats[readIndex - 1];
872 	insertLength = getInsertLength(graph, cat);
873 	insertVariance = getInsertLength_var(graph, cat);
874 
875 	readArray = readNodes[readPairIndex];
876 	for (index = 0; index < readNodeCounts[readPairIndex]; index++)
877 		projectFromReadPair(node, &readArray[index], position,
878 				    offset, insertLength, insertVariance);
879 
880 }
881 
projectFromLongRead(Node * node,PassageMarker * marker,IDnum * readPairs,Category * cats,ReadOccurence ** readNodes,IDnum * readNodeCounts,Coordinate * lengths)882 static void projectFromLongRead(Node * node, PassageMarker * marker,
883 				IDnum * readPairs, Category * cats,
884 				ReadOccurence ** readNodes,
885 				IDnum * readNodeCounts,
886 				Coordinate * lengths)
887 {
888 	IDnum index;
889 	IDnum readIndex = getPassageMarkerSequenceID(marker);
890 	ReadOccurence *readArray;
891 	IDnum readPairIndex;
892 	Category cat;
893 	Coordinate position = getStartOffset(marker);
894 	Coordinate offset = getPassageMarkerStart(marker);
895 	Coordinate length =
896 	    lengths[getPassageMarkerSequenceID(marker) - 1];
897 	Coordinate insertLength;
898 	double insertVariance;
899 
900 	// Going through single-read information
901 	if (readNodeCounts[readIndex] > 1 && position > 0) {
902 		readArray = readNodes[readIndex];
903 		for (index = 0; index < readNodeCounts[readIndex]; index++)
904 			projectFromSingleRead(node, &readArray[index],
905 					      position, offset, length);
906 	}
907 	// Going through paired read information
908 	if (readPairs == NULL)
909 		return;
910 
911 	readPairIndex = readPairs[readIndex - 1] + 1;
912 
913 	if (readPairIndex == 0)
914 		return;
915 
916 	cat = cats[readIndex - 1];
917 	insertLength = getInsertLength(graph, cat);
918 	insertVariance = getInsertLength_var(graph, cat);
919 
920 	readArray = readNodes[readPairIndex];
921 	for (index = 0; index < readNodeCounts[readPairIndex]; index++)
922 		projectFromReadPair(node, &readArray[index], position,
923 				    offset, insertLength, insertVariance);
924 
925 }
926 
projectFromNode(IDnum nodeID,ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats,boolean * dubious,Coordinate * lengths)927 static void projectFromNode(IDnum nodeID,
928 			    ReadOccurence ** readNodes,
929 			    IDnum * readNodeCounts,
930 			    IDnum * readPairs, Category * cats,
931 			    boolean * dubious, Coordinate * lengths)
932 {
933 	IDnum index;
934 	ShortReadMarker *nodeArray, *shortMarker;
935 	PassageMarker *marker;
936 	Node *node;
937 	IDnum nodeReadCount;
938 
939 	node = getNodeInGraph(graph, nodeID);
940 
941 	if (node == NULL || !getUniqueness(node))
942 		return;
943 
944 	nodeArray = getNodeReads(node, graph);
945 	nodeReadCount = getNodeReadCount(node, graph);
946 	for (index = 0; index < nodeReadCount; index++) {
947 		shortMarker = getShortReadMarkerAtIndex(nodeArray, index);
948 		if (dubious[getShortReadMarkerID(shortMarker) - 1])
949 			continue;
950 		projectFromShortRead(node, shortMarker, readPairs, cats,
951 				     readNodes, readNodeCounts, lengths);
952 	}
953 
954 	for (marker = getMarker(node); marker != NULL;
955 	     marker = getNextInNode(marker)) {
956 		if (getPassageMarkerSequenceID(marker) > 0)
957 			projectFromLongRead(node, marker, readPairs, cats,
958 					    readNodes, readNodeCounts,
959 					    lengths);
960 	}
961 }
962 
computeNodeToNodeMappings(ReadOccurence ** readNodes,IDnum * readNodeCounts,IDnum * readPairs,Category * cats,boolean * dubious,Coordinate * lengths)963 static Connection **computeNodeToNodeMappings(ReadOccurence ** readNodes,
964 					      IDnum * readNodeCounts,
965 					      IDnum * readPairs,
966 					      Category * cats,
967 					      boolean * dubious,
968 					      Coordinate * lengths)
969 {
970 	IDnum nodeID;
971 	IDnum nodes = nodeCount(graph);
972 	scaffold = callocOrExit(2 * nodes + 1, Connection *);
973 
974 	// Original
975 	/*
976 	puts("Computing direct node to node mappings");
977 	*/
978 	// Original
979 
980 	for (nodeID = -nodes; nodeID <= nodes; nodeID++) {
981 		// Original
982 		/*
983 		if (nodeID % 10000 == 0)
984 			printf("Scaffolding node %d\n", nodeID);
985 		*/
986 		// Original
987 
988 		projectFromNode(nodeID, readNodes, readNodeCounts,
989 				readPairs, cats, dubious, lengths);
990 	}
991 
992 	return scaffold;
993 }
994 
countShortReads(Graph * graph,ReadSet * reads)995 static IDnum **countShortReads(Graph * graph, ReadSet * reads)
996 {
997 	IDnum **counts = callocOrExit(CATEGORIES + 1, IDnum *);
998 	Category cat;
999 	IDnum nodeIndex;
1000 	IDnum nodes = nodeCount(graph);
1001 	Node *node;
1002 	ShortReadMarker *array, *marker;
1003 	IDnum readCount, readIndex, readID;
1004 
1005 	// Allocate memory where needed
1006 	for (cat = 0; cat <= CATEGORIES; cat++)
1007 		if (getInsertLength(graph, cat) > 0)
1008 			counts[cat] =
1009 			    callocOrExit(2 * nodeCount(graph) + 1,
1010 				   IDnum);
1011 
1012 	// Start fillin'
1013 	for (nodeIndex = 0; nodeIndex < 2 * nodes + 1; nodeIndex++) {
1014 		node = getNodeInGraph(graph, nodeIndex - nodes);
1015 
1016 		if (node == NULL || !getUniqueness(node))
1017 			continue;
1018 
1019 		array = getNodeReads(node, graph);
1020 		readCount = getNodeReadCount(node, graph);
1021 		for (readIndex = 0; readIndex < readCount; readIndex++) {
1022 			marker =
1023 			    getShortReadMarkerAtIndex(array, readIndex);
1024 			readID = getShortReadMarkerID(marker);
1025 			cat = reads->categories[readID - 1];
1026 			if (cat % 2 == 1 && counts[cat / 2] != NULL)
1027 				counts[cat / 2][nodeIndex]++;
1028 		}
1029 	}
1030 
1031 	return counts;
1032 }
1033 
printConnections(ReadSet * reads)1034 void printConnections(ReadSet * reads)
1035 {
1036 	IDnum maxNodeIndex = nodeCount(graph) * 2 + 1;
1037 	IDnum index;
1038 	Connection *connect, *next;
1039 	Node *node;
1040 	IDnum **counts = countShortReads(graph, reads);
1041 	IDnum nodes = nodeCount(graph);
1042 
1043 	puts("CONNECT IDA IDB dcount pcount dist lengthA lengthB var countA countB coordA coordB real exp distance test");
1044 
1045 	for (index = 0; index < maxNodeIndex; index++) {
1046 		node = getNodeInGraph(graph, index - nodeCount(graph));
1047 		for (connect = scaffold[index]; connect != NULL;
1048 		     connect = next) {
1049 			next = connect->next;
1050 			if (getUniqueness(connect->destination)) {
1051 				printf
1052 				    ("CONNECT %ld %ld %ld %ld %lld %lld %lld %f %ld %ld",
1053 				     (long) index - nodeCount(graph),
1054 				     (long) getNodeID(connect->destination),
1055 				     (long) connect->direct_count,
1056 				     (long) connect->paired_count,
1057 				     (long long) getConnectionDistance(connect),
1058 				     (long long) getNodeLength(node),
1059 				     (long long) getNodeLength(connect->destination),
1060 				     connect->variance,
1061 				     (long) getNodeReadCount(node, graph),
1062 				     (long) getNodeReadCount(connect->destination,
1063 						      graph));
1064 				if (markerCount(node) == 1
1065 				    && markerCount(connect->destination) ==
1066 				    1)
1067 					printf(" %lld %lld %lld",
1068 					       (long long) getPassageMarkerFinish
1069 					       (getMarker(node)),
1070 					       (long long) getPassageMarkerFinish
1071 					       (getMarker
1072 						(connect->destination)),
1073 					       (long long) (getPassageMarkerFinish
1074 					       (getMarker(node)) -
1075 					       getPassageMarkerFinish
1076 					       (getMarker
1077 						(connect->destination))));
1078 				else
1079 					printf(" ? ?");
1080 				printf(" %ld", (long) expectedNumberOfConnections(index-nodeCount(graph), connect, counts, 0));
1081 				printf(" %lld", (long long) (getConnectionDistance(connect) - (getNodeLength(node) + getNodeLength(connect->destination))/2));
1082 				if (testConnection
1083 				    (index - nodes, connect, counts))
1084 					puts(" OK");
1085 				else
1086 					puts(" NG");
1087 			}
1088 		}
1089 	}
1090 }
1091 
removeUnreliableConnections(ReadSet * reads)1092 static void removeUnreliableConnections(ReadSet * reads)
1093 {
1094 	IDnum maxNodeIndex = nodeCount(graph) * 2 + 1;
1095 	IDnum index;
1096 	Connection *connect, *next;
1097 	Category cat;
1098 	IDnum **counts = countShortReads(graph, reads);
1099 	IDnum nodes = nodeCount(graph);
1100 
1101 	for (index = 0; index < maxNodeIndex; index++) {
1102 		for (connect = scaffold[index]; connect != NULL;
1103 		     connect = next) {
1104 			next = connect->next;
1105 			if (!testConnection
1106 			    (index - nodes, connect, counts))
1107 				destroyConnection(connect, index - nodes);
1108 		}
1109 	}
1110 
1111 	// Free memory
1112 	for (cat = 0; cat <= CATEGORIES; cat++)
1113 		if (counts[cat])
1114 			free(counts[cat]);
1115 	free(counts);
1116 }
1117 
buildScaffold(Graph * argGraph,ReadSet * reads,boolean * dubious)1118 void buildScaffold(Graph * argGraph, ReadSet * reads, boolean * dubious) {
1119 	IDnum *readPairs;
1120 	Category *cats;
1121 	IDnum *readNodeCounts;
1122 	ReadOccurence **readNodes;
1123 	Coordinate *lengths =
1124 	    getSequenceLengths(reads, getWordLength(argGraph));
1125 	IDnum index;
1126 
1127 	graph = argGraph;
1128 	readPairs = reads->mateReads;
1129 	cats = reads->categories;
1130 
1131 	// Prepare primary scaffold
1132 	readNodeCounts = computeReadToNodeCounts();
1133 	readNodes = computeReadToNodeMappings(readNodeCounts);
1134 
1135 	estimateMissingInsertLengths(readNodes, readNodeCounts, readPairs, cats);
1136 
1137 	scaffold = computeNodeToNodeMappings(readNodes, readNodeCounts,
1138 				      readPairs, cats, dubious, lengths);
1139 	removeUnreliableConnections(reads);
1140 
1141 	// Clean up memory
1142 	for (index = 1; index <= sequenceCount(graph); index++)
1143 		free(readNodes[index]);
1144 
1145 	free(readNodes);
1146 	free(readNodeCounts);
1147 	free(lengths);
1148 }
1149 
setUnreliableConnectionCutoff(int val)1150 void setUnreliableConnectionCutoff(int val)
1151 {
1152 	UNRELIABLE_CONNECTION_CUTOFF = (IDnum) val;
1153 }
1154 
cleanScaffoldMemory()1155 void cleanScaffoldMemory() {
1156 	Category libID;
1157 
1158 	for (libID = 0; libID < CATEGORIES + 1; libID++)
1159 		if (estimated[libID])
1160 			setInsertLengths(graph, libID, -1, -1);
1161 
1162 	destroyRecycleBin(connectionMemory);
1163 	free(scaffold);
1164 	connectionMemory = NULL;
1165 }
1166