1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "computeGlobalScore.H"
19
20 #include <set>
21 #include <algorithm>
22
23
24 uint16
compute(uint32 ovlLen,ovOverlap * ovl,uint32 expectedCoverage,uint32 thresholdsLen,uint16 * thresholds)25 globalScore::compute(uint32 ovlLen,
26 ovOverlap *ovl,
27 uint32 expectedCoverage,
28 uint32 thresholdsLen,
29 uint16 *thresholds) {
30
31 // Build a list of all the overlap scores. Ignore
32 // overlaps that are too bad/good or too short/long.
33
34 histLen = 0;
35
36 resizeArray(hist, histLen, histMax, ovlLen, _raAct::doNothing);
37
38 for (uint32 oo=0; oo<ovlLen; oo++) {
39 uint32 ovlLength = ovl[oo].a_end() - ovl[oo].a_bgn();
40
41 if ((ovl[oo].evalue() < minEvalue) || (maxEvalue < ovl[oo].evalue()) ||
42 (ovlLength < minOvlLength) || (maxOvlLength < ovlLength))
43 continue;
44
45 hist[histLen++] = ovl[oo].overlapScore();
46 }
47
48 // Sort, reversely.
49
50 sort(hist, hist + histLen, std::greater<uint64>());
51
52 // Figure out our threshold score. Any overlap with score below this should be filtered.
53
54 uint16 threshold = (expectedCoverage < histLen) ? hist[expectedCoverage] : 0;
55
56 for (uint32 ii=0; ii<thresholdsLen; ii++) {
57 if (ii * 10 < histLen)
58 thresholds[ii] = hist[ii * 10];
59 else
60 thresholds[ii] = 0;
61 }
62
63 if (stats == NULL)
64 return(threshold);
65
66 // One more pass, just to gather statistics now that we know the threshold score.
67
68 uint32 belowCutoffLocal = 0;
69
70 for (uint32 oo=0; oo<ovlLen; oo++) {
71 uint64 ovlLength = ovl[oo].a_len();
72 uint16 ovlScore = ovl[oo].overlapScore();
73
74 bool isC = false;
75 bool isD = false;
76 bool skipIt = false;
77
78 stats->totalOverlaps++;
79
80 // First, count the filtering done above.
81
82 if (ovl[oo].evalue() < minEvalue) {
83 stats->lowErate++;
84 skipIt = true;
85 }
86
87 if (maxEvalue < ovl[oo].evalue()) {
88 stats->highErate++;
89 skipIt = true;
90 }
91
92 if (ovlLength < minOvlLength) {
93 stats->tooShort++;
94 skipIt = true;
95 }
96
97 if (maxOvlLength < ovlLength) {
98 stats->tooLong++;
99 skipIt = true;
100 }
101
102 // Now, apply the global filter cutoff, only if the overlap wasn't already tossed out.
103
104 if ((skipIt == false) &&
105 (ovlScore < threshold)) {
106 stats->belowCutoff++;
107 belowCutoffLocal++;
108 skipIt = true;
109 }
110
111 if (skipIt)
112 continue;
113
114 stats->retained++;
115 } // Over all overlaps
116
117 double fractionFiltered = (double)belowCutoffLocal / histLen;
118
119 if (fractionFiltered <= 0.00) stats->reads00OlapsFiltered++;
120 if (fractionFiltered <= 0.50) stats->reads50OlapsFiltered++;
121 if (fractionFiltered <= 0.80) stats->reads80OlapsFiltered++;
122 if (fractionFiltered <= 0.95) stats->reads95OlapsFiltered++;
123 if (fractionFiltered <= 1.00) stats->reads99OlapsFiltered++;
124
125 if (logFile) {
126 if (histLen <= expectedCoverage)
127 fprintf(logFile, "%9u - %6u overlaps - %6u scored - %6u filtered - %4u saved (no filtering)\n",
128 ovl[0].a_iid, ovlLen, histLen, 0, histLen);
129 else
130 fprintf(logFile, "%9u - %6u overlaps - %6u scored - %6u filtered - %4u saved (threshold %u)\n",
131 ovl[0].a_iid, ovlLen, histLen, belowCutoffLocal, histLen - belowCutoffLocal, threshold);
132 }
133
134 return(threshold);
135 }
136
137
138
139 void
estimate(uint32 ovlLen,uint32 expectedCoverage)140 globalScore::estimate(uint32 ovlLen,
141 uint32 expectedCoverage) {
142 double fractionFiltered = 0;
143
144 stats->totalOverlaps += ovlLen;
145
146 if (ovlLen < expectedCoverage) {
147 stats->retained += ovlLen;
148
149 } else {
150 fractionFiltered = (double)(ovlLen - expectedCoverage) / ovlLen;
151
152 stats->retained += expectedCoverage;
153 stats->belowCutoff += ovlLen - expectedCoverage;
154 }
155
156 if (fractionFiltered <= 0.00) stats->reads00OlapsFiltered++;
157 if (fractionFiltered <= 0.50) stats->reads50OlapsFiltered++;
158 if (fractionFiltered <= 0.80) stats->reads80OlapsFiltered++;
159 if (fractionFiltered <= 0.95) stats->reads95OlapsFiltered++;
160 if (fractionFiltered <= 1.00) stats->reads99OlapsFiltered++;
161 }
162
163