1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "computeGlobalScore.H"
19 
20 #include <set>
21 #include <algorithm>
22 
23 
24 uint16
compute(uint32 ovlLen,ovOverlap * ovl,uint32 expectedCoverage,uint32 thresholdsLen,uint16 * thresholds)25 globalScore::compute(uint32             ovlLen,
26                      ovOverlap         *ovl,
27                      uint32             expectedCoverage,
28                      uint32             thresholdsLen,
29                      uint16            *thresholds) {
30 
31   //  Build a list of all the overlap scores.  Ignore
32   //  overlaps that are too bad/good or too short/long.
33 
34   histLen = 0;
35 
36   resizeArray(hist, histLen, histMax, ovlLen, _raAct::doNothing);
37 
38   for (uint32 oo=0; oo<ovlLen; oo++) {
39     uint32  ovlLength  = ovl[oo].a_end() - ovl[oo].a_bgn();
40 
41     if ((ovl[oo].evalue() < minEvalue)    || (maxEvalue    < ovl[oo].evalue()) ||
42         (ovlLength        < minOvlLength) || (maxOvlLength < ovlLength))
43       continue;
44 
45     hist[histLen++] = ovl[oo].overlapScore();
46   }
47 
48   //  Sort, reversely.
49 
50   sort(hist, hist + histLen, std::greater<uint64>());
51 
52   //  Figure out our threshold score.  Any overlap with score below this should be filtered.
53 
54   uint16 threshold = (expectedCoverage < histLen) ? hist[expectedCoverage] : 0;
55 
56   for (uint32 ii=0; ii<thresholdsLen; ii++) {
57     if (ii * 10 < histLen)
58       thresholds[ii] = hist[ii * 10];
59     else
60       thresholds[ii] = 0;
61   }
62 
63   if (stats == NULL)
64     return(threshold);
65 
66   //  One more pass, just to gather statistics now that we know the threshold score.
67 
68   uint32 belowCutoffLocal = 0;
69 
70   for (uint32 oo=0; oo<ovlLen; oo++) {
71     uint64  ovlLength  = ovl[oo].a_len();
72     uint16  ovlScore   = ovl[oo].overlapScore();
73 
74     bool    isC        = false;
75     bool    isD        = false;
76     bool    skipIt     = false;
77 
78     stats->totalOverlaps++;
79 
80     //  First, count the filtering done above.
81 
82     if (ovl[oo].evalue() < minEvalue) {
83       stats->lowErate++;
84       skipIt = true;
85     }
86 
87     if (maxEvalue < ovl[oo].evalue()) {
88       stats->highErate++;
89       skipIt = true;
90     }
91 
92     if (ovlLength < minOvlLength) {
93       stats->tooShort++;
94       skipIt = true;
95     }
96 
97     if (maxOvlLength < ovlLength) {
98       stats->tooLong++;
99       skipIt = true;
100     }
101 
102     //  Now, apply the global filter cutoff, only if the overlap wasn't already tossed out.
103 
104     if ((skipIt == false) &&
105         (ovlScore < threshold)) {
106       stats->belowCutoff++;
107       belowCutoffLocal++;
108       skipIt = true;
109     }
110 
111     if (skipIt)
112       continue;
113 
114     stats->retained++;
115   }  //  Over all overlaps
116 
117   double  fractionFiltered = (double)belowCutoffLocal / histLen;
118 
119   if (fractionFiltered <= 0.00)   stats->reads00OlapsFiltered++;
120   if (fractionFiltered <= 0.50)   stats->reads50OlapsFiltered++;
121   if (fractionFiltered <= 0.80)   stats->reads80OlapsFiltered++;
122   if (fractionFiltered <= 0.95)   stats->reads95OlapsFiltered++;
123   if (fractionFiltered <= 1.00)   stats->reads99OlapsFiltered++;
124 
125   if (logFile) {
126     if (histLen <= expectedCoverage)
127       fprintf(logFile, "%9u - %6u overlaps - %6u scored - %6u filtered - %4u saved (no filtering)\n",
128               ovl[0].a_iid, ovlLen, histLen, 0, histLen);
129     else
130       fprintf(logFile, "%9u - %6u overlaps - %6u scored - %6u filtered - %4u saved (threshold %u)\n",
131               ovl[0].a_iid, ovlLen, histLen, belowCutoffLocal, histLen - belowCutoffLocal, threshold);
132   }
133 
134   return(threshold);
135 }
136 
137 
138 
139 void
estimate(uint32 ovlLen,uint32 expectedCoverage)140 globalScore::estimate(uint32            ovlLen,
141                       uint32            expectedCoverage) {
142   double    fractionFiltered = 0;
143 
144   stats->totalOverlaps += ovlLen;
145 
146   if (ovlLen < expectedCoverage) {
147     stats->retained    += ovlLen;
148 
149   } else {
150     fractionFiltered = (double)(ovlLen - expectedCoverage) / ovlLen;
151 
152     stats->retained    +=          expectedCoverage;
153     stats->belowCutoff += ovlLen - expectedCoverage;
154   }
155 
156   if (fractionFiltered <= 0.00)   stats->reads00OlapsFiltered++;
157   if (fractionFiltered <= 0.50)   stats->reads50OlapsFiltered++;
158   if (fractionFiltered <= 0.80)   stats->reads80OlapsFiltered++;
159   if (fractionFiltered <= 0.95)   stats->reads95OlapsFiltered++;
160   if (fractionFiltered <= 1.00)   stats->reads99OlapsFiltered++;
161 }
162 
163