1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "runtime.H"
19 #include "sqStore.H"
20 #include "tgStore.H"
21 
22 #include <map>
23 #include <algorithm>
24 
25 
26 //  Replace the children list in tig with one that has fewer contains.
27 //  The original list is saved in the tig.
28 //  When contains are 'unstashed', positions are updated.
29 
30 
31 
32 struct readInfo {
33   uint32    idx;   //  index of the read in the original unsorted _children
34   int32     len;   //  length of the read
35   bool      use;   //  true if the read is not contained in some other read
36 };
37 
38 
39 void
stashContains(double maxCov,tgTigStashed & S)40 tgTig::stashContains(double  maxCov, tgTigStashed &S) {
41 
42   //  Initialize.
43   //    Declare that we have no stashed reads.
44   //    Clear the return statistics.
45 
46   _stashed    = nullptr;
47   _stashedLen = 0;
48   _stashedMax = 0;
49 
50   S.clear();
51 
52   if (_childrenLen == 1)
53     return;
54 
55   //  Sort the original children by position.
56 
57   std::sort(_children, _children + _childrenLen);
58 
59   //  Decide which children to save.
60 
61   readInfo        *posLen  = new readInfo [_childrenLen];   //  Sorting by length of child
62 
63   //  Flag the read for stashing if it doesn't extend the tig.
64 
65   int32         hiEnd = -1;
66 
67   for (uint32 ci=0; ci<_childrenLen; ci++) {
68     int32  lo = _children[ci].min();
69     int32  hi = _children[ci].max();
70 
71     if (hi <= hiEnd) {
72       posLen[ci].idx = ci;
73       posLen[ci].len = hi - lo;
74       posLen[ci].use = false;
75 
76       S.nStsh += 1;
77       S.bStsh += posLen[ci].len;
78     }
79 
80     else {
81       posLen[ci].idx = ci;
82       posLen[ci].len = hi - lo;
83       posLen[ci].use = true;
84 
85       S.nBack += 1;
86       S.bBack += posLen[ci].len;
87     }
88 
89     hiEnd = std::max(hi, hiEnd);
90   }
91 
92   //  Sort by length, longest first, then verify we're sorted.
93 
94   auto longestFirst = [](readInfo const &A, readInfo const &B) {
95                         return(A.len > B.len);
96                       };
97 
98   std::sort(posLen, posLen + _childrenLen, longestFirst);
99 
100   for (uint32 ci=1; ci<_childrenLen; ci++)
101     assert(posLen[ci-1].len >= posLen[ci].len);
102 
103   //  Save the longer of the contained reads, until we reach the maximum
104   //  coverage desired.
105 
106   uint64  bLimit = (uint64)floor(maxCov * hiEnd);
107 
108   for (uint32 ci=0; ci<_childrenLen; ci++) {
109     if (posLen[ci].use == true)            //  Already a backbone read.
110       continue;                            //  Skip this read.
111 
112     if (S.bCont + S.bBack > bLimit)        //  Exceeded coverage limit.
113       break;                               //  Bail.
114 
115     posLen[ci].use = true;                 //  Do not stash the read.
116 
117     S.nStsh -= 1;                          //  Do not stash this read.
118     S.bStsh -= posLen[ci].len;             //
119 
120     S.nCont += 1;                          //  And do use it for consensus.
121     S.bCont += posLen[ci].len;             //
122   }
123 
124   //  Make a new list of reads, while saving the original, if there reads
125   //  we want to stash.
126 
127   if (S.nStsh > 0) {
128     _stashedLen = _childrenLen;                      //  Save the original list
129     _stashedMax = _childrenMax;                      //  so we can restore it later.
130     _stashed    = _children;
131 
132     _childrenLen  = 0;                               //  Allocate a new list for
133     _childrenMax  = S.nBack + S.nCont;               //  exactly what we need to save.
134     _children     = new tgPosition [_childrenMax];
135 
136     for (uint32 ci=0; ci<_stashedLen; ci++)
137       if (posLen[ci].use == true)                                //  If used, we want to keep the
138         _children[_childrenLen++] = _stashed[posLen[ci].idx];    //  read, so copy it to the new list.
139   }
140 
141   // since we added the reads using length sorted order, re-sort them by position to make everyone downstream  happy
142   std::sort(_children, _children + _childrenLen);
143 
144   //  Cleanup and return the statistics.
145 
146   delete [] posLen;
147 }
148 
149 
150 
151 void
unstashContains(void)152 tgTig::unstashContains(void) {
153 
154   if (_stashed == NULL)   //  If no saved list, nothing to unstash.
155     return;
156 
157   //  For the reads we 'stashed', we need to compute a position in the new
158   //  consensus sequence.  We'll just linearly scale the positions.
159 
160   int32   oldMax = 0;
161   int32   newMax = 0;
162   double  sf     = 1.0;
163 
164   for (uint32 ci=0; ci<_stashedLen; ci++)
165     oldMax = std::max(oldMax, _stashed[ci].max());
166 
167   for (uint32 ci=0; ci<_childrenLen; ci++)
168     newMax = std::max(newMax, _children[ci].max());
169 
170   if (oldMax > 0)
171     sf = (double)newMax / oldMax;
172 
173   //  Build a map from child ID to it's current position.
174 
175   std::map<uint32, uint32>   idmap;
176 
177   for (uint32 ci=0; ci < _childrenLen; ci++)
178     idmap[_children[ci].ident()] = ci;
179 
180   //  Over all the reads in the original list (currently held in _stashed),
181   //  update the position, either from its current position out of consensus
182   //  or by extrapolating the original position.
183 
184   for (uint32 ci=0; ci<_stashedLen; ci++) {
185     uint32  id = _stashed[ci].ident();
186 
187     if (idmap.find(id) != idmap.end()) {      //  If we find the ID in the current list,
188       _stashed[ci] = _children[idmap[id]];    //  simply copy the new position data.
189       idmap.erase(id);
190     }
191 
192     else {
193       int32  nmin = sf * _stashed[ci].min();   //  If not, scale the positions based
194       int32  nmax = sf * _stashed[ci].max();   //  on new vs old tig length.
195 
196       if (nmin > newMax)  nmin = newMax;       //  But don't let them exceed the limit!
197       if (nmax > newMax)  nmax = newMax;
198 
199       _stashed[ci].setMinMax(nmin, nmax);
200     }
201   }
202 
203   //  Make sure that we updated all the children.
204 
205   if (idmap.empty() == false)
206     fprintf(stderr, "Failed to unstash the contained reads.  Still have " F_SIZE_T " reads unplaced.\n",
207             idmap.size());
208   assert(idmap.empty() == true);
209 
210   //  Throw out the stashed list, and restore the original.
211 
212   delete [] _children;
213 
214   _childrenLen  = _stashedLen;
215   _childrenMax  = _stashedMax;
216   _children     = _stashed;
217 
218   _stashedLen = 0;
219   _stashedMax = 0;
220   _stashed    = NULL;
221 }
222 
223