1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "runtime.H"
19 #include "sqStore.H"
20 #include "tgStore.H"
21
22 #include <map>
23 #include <algorithm>
24
25
26 // Replace the children list in tig with one that has fewer contains.
27 // The original list is saved in the tig.
28 // When contains are 'unstashed', positions are updated.
29
30
31
32 struct readInfo {
33 uint32 idx; // index of the read in the original unsorted _children
34 int32 len; // length of the read
35 bool use; // true if the read is not contained in some other read
36 };
37
38
39 void
stashContains(double maxCov,tgTigStashed & S)40 tgTig::stashContains(double maxCov, tgTigStashed &S) {
41
42 // Initialize.
43 // Declare that we have no stashed reads.
44 // Clear the return statistics.
45
46 _stashed = nullptr;
47 _stashedLen = 0;
48 _stashedMax = 0;
49
50 S.clear();
51
52 if (_childrenLen == 1)
53 return;
54
55 // Sort the original children by position.
56
57 std::sort(_children, _children + _childrenLen);
58
59 // Decide which children to save.
60
61 readInfo *posLen = new readInfo [_childrenLen]; // Sorting by length of child
62
63 // Flag the read for stashing if it doesn't extend the tig.
64
65 int32 hiEnd = -1;
66
67 for (uint32 ci=0; ci<_childrenLen; ci++) {
68 int32 lo = _children[ci].min();
69 int32 hi = _children[ci].max();
70
71 if (hi <= hiEnd) {
72 posLen[ci].idx = ci;
73 posLen[ci].len = hi - lo;
74 posLen[ci].use = false;
75
76 S.nStsh += 1;
77 S.bStsh += posLen[ci].len;
78 }
79
80 else {
81 posLen[ci].idx = ci;
82 posLen[ci].len = hi - lo;
83 posLen[ci].use = true;
84
85 S.nBack += 1;
86 S.bBack += posLen[ci].len;
87 }
88
89 hiEnd = std::max(hi, hiEnd);
90 }
91
92 // Sort by length, longest first, then verify we're sorted.
93
94 auto longestFirst = [](readInfo const &A, readInfo const &B) {
95 return(A.len > B.len);
96 };
97
98 std::sort(posLen, posLen + _childrenLen, longestFirst);
99
100 for (uint32 ci=1; ci<_childrenLen; ci++)
101 assert(posLen[ci-1].len >= posLen[ci].len);
102
103 // Save the longer of the contained reads, until we reach the maximum
104 // coverage desired.
105
106 uint64 bLimit = (uint64)floor(maxCov * hiEnd);
107
108 for (uint32 ci=0; ci<_childrenLen; ci++) {
109 if (posLen[ci].use == true) // Already a backbone read.
110 continue; // Skip this read.
111
112 if (S.bCont + S.bBack > bLimit) // Exceeded coverage limit.
113 break; // Bail.
114
115 posLen[ci].use = true; // Do not stash the read.
116
117 S.nStsh -= 1; // Do not stash this read.
118 S.bStsh -= posLen[ci].len; //
119
120 S.nCont += 1; // And do use it for consensus.
121 S.bCont += posLen[ci].len; //
122 }
123
124 // Make a new list of reads, while saving the original, if there reads
125 // we want to stash.
126
127 if (S.nStsh > 0) {
128 _stashedLen = _childrenLen; // Save the original list
129 _stashedMax = _childrenMax; // so we can restore it later.
130 _stashed = _children;
131
132 _childrenLen = 0; // Allocate a new list for
133 _childrenMax = S.nBack + S.nCont; // exactly what we need to save.
134 _children = new tgPosition [_childrenMax];
135
136 for (uint32 ci=0; ci<_stashedLen; ci++)
137 if (posLen[ci].use == true) // If used, we want to keep the
138 _children[_childrenLen++] = _stashed[posLen[ci].idx]; // read, so copy it to the new list.
139 }
140
141 // since we added the reads using length sorted order, re-sort them by position to make everyone downstream happy
142 std::sort(_children, _children + _childrenLen);
143
144 // Cleanup and return the statistics.
145
146 delete [] posLen;
147 }
148
149
150
151 void
unstashContains(void)152 tgTig::unstashContains(void) {
153
154 if (_stashed == NULL) // If no saved list, nothing to unstash.
155 return;
156
157 // For the reads we 'stashed', we need to compute a position in the new
158 // consensus sequence. We'll just linearly scale the positions.
159
160 int32 oldMax = 0;
161 int32 newMax = 0;
162 double sf = 1.0;
163
164 for (uint32 ci=0; ci<_stashedLen; ci++)
165 oldMax = std::max(oldMax, _stashed[ci].max());
166
167 for (uint32 ci=0; ci<_childrenLen; ci++)
168 newMax = std::max(newMax, _children[ci].max());
169
170 if (oldMax > 0)
171 sf = (double)newMax / oldMax;
172
173 // Build a map from child ID to it's current position.
174
175 std::map<uint32, uint32> idmap;
176
177 for (uint32 ci=0; ci < _childrenLen; ci++)
178 idmap[_children[ci].ident()] = ci;
179
180 // Over all the reads in the original list (currently held in _stashed),
181 // update the position, either from its current position out of consensus
182 // or by extrapolating the original position.
183
184 for (uint32 ci=0; ci<_stashedLen; ci++) {
185 uint32 id = _stashed[ci].ident();
186
187 if (idmap.find(id) != idmap.end()) { // If we find the ID in the current list,
188 _stashed[ci] = _children[idmap[id]]; // simply copy the new position data.
189 idmap.erase(id);
190 }
191
192 else {
193 int32 nmin = sf * _stashed[ci].min(); // If not, scale the positions based
194 int32 nmax = sf * _stashed[ci].max(); // on new vs old tig length.
195
196 if (nmin > newMax) nmin = newMax; // But don't let them exceed the limit!
197 if (nmax > newMax) nmax = newMax;
198
199 _stashed[ci].setMinMax(nmin, nmax);
200 }
201 }
202
203 // Make sure that we updated all the children.
204
205 if (idmap.empty() == false)
206 fprintf(stderr, "Failed to unstash the contained reads. Still have " F_SIZE_T " reads unplaced.\n",
207 idmap.size());
208 assert(idmap.empty() == true);
209
210 // Throw out the stashed list, and restore the original.
211
212 delete [] _children;
213
214 _childrenLen = _stashedLen;
215 _childrenMax = _stashedMax;
216 _children = _stashed;
217
218 _stashedLen = 0;
219 _stashedMax = 0;
220 _stashed = NULL;
221 }
222
223