
/******************************************************************************
 *
 *  This file is part of canu, a software program that assembles whole-genome
 *  sequencing reads into contigs.
 *
 *  This software is based on:
 *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
 *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
 *
 *  Except as indicated otherwise, this is a 'United States Government Work',
 *  and is released in the public domain.
 *
 *  File 'README.licenses' in the root directory of this distribution
 *  contains full conditions and disclaimers.
 */

#ifndef INCLUDE_AS_BAT_OVERLAPCACHE
#define INCLUDE_AS_BAT_OVERLAPCACHE

#include "runtime.H"
#include "ovStore.H"
#include "sqStore.H"

//  CA8 used to re-encode the error rate into a smaller-precision number.  This was
//  confusing and broken (it tried to use a log-based encoding to give more precision
//  to the smaller values).  CA3g gives up and uses all 12 bits of precision.

//  If not enough space for the minimum number of error bits, bump up to a 64-bit word for overlap
//  storage.

//  For storing overlaps in memory.  12 bytes per overlap.
class BAToverlap {
public:
  //  Construct an empty overlap; all fields zeroed.
  BAToverlap() {
    evalue = 0;
    a_hang = 0;
    b_hang = 0;
    flipped = false;

    filtered = false;
    symmetric = false;

    a_iid = 0;
    b_iid = 0;
  };
  ~BAToverlap() {};

  //  Nasty bit of code duplication.

  //  True if this is a dovetail overlap: both hangs have the same sign,
  //  so each read extends past one end of the other and neither read
  //  contains the other.
  bool
  isDovetail(void) const {
    return(((a_hang < 0) && (b_hang < 0)) ||
           ((a_hang > 0) && (b_hang > 0)));
  };

  //  True if the overlap hangs off the 5' end of the A read.
  bool
  AEndIs5prime(void) const {                  //     -------->
    return((a_hang < 0) && (b_hang < 0));     //  -------
  };

  //  True if the overlap hangs off the 3' end of the A read.
  bool
  AEndIs3prime(void) const {                  //  -------->
    return((a_hang > 0) && (b_hang > 0));     //      -------
  };

  //  True if the A read contains the B read.
  bool
  AisContainer(void) const {                  //  -------->
    return((a_hang >= 0) && (b_hang <= 0));   //    ----
  };

  //  True if the A read is contained in the B read.
  bool
  AisContained(void) const {                  //    --->
    return((a_hang <= 0) && (b_hang >= 0));   //  ---------
  };

  //  True if the overlap uses the 3' end of the B read.
  //  Only valid for dovetail overlaps; asserts on containments.
  bool
  BEndIs3prime(void) const {
    assert(AisContainer() == false);   //  Function is not defined
    assert(AisContained() == false);   //  for containments.
    return((AEndIs5prime() && (flipped == false)) ||   //  <===  ------>
           (AEndIs3prime() && (flipped == true)));     //          ---->
  };

  //  True if the overlap uses the 5' end of the B read.
  //  Only valid for dovetail overlaps; asserts on containments.
  bool
  BEndIs5prime(void) const {
    assert(AisContainer() == false);   //  Function is not defined
    assert(AisContained() == false);   //  for containments.
    return((AEndIs5prime() && (flipped == true)) ||    //        ------>
           (AEndIs3prime() && (flipped == false)));    //  <===  ---->
  };

  //  Decode the stored integer evalue into a fractional error rate.
  double
  erate(void) const {
    return(AS_OVS_decodeEvalue(evalue));
  }

  //  Copy this compact overlap into a full ovOverlap record.
  //  NOTE(review): 'filtered' and 'symmetric' are bogart-local flags and
  //  are intentionally not copied.
  void
  convert(ovOverlap &olap) {
    olap.clear();

    olap.a_iid = a_iid;
    olap.b_iid = b_iid;

    olap.evalue(evalue);

    olap.flipped(flipped);
    olap.a_hang(a_hang);
    olap.b_hang(b_hang);
  }

  //  Two storage layouts.  When read coordinates fit in 24 bits or less,
  //  the evalue, both hangs and the flags all pack into a single 64-bit
  //  word (12 bytes per overlap with the two IIDs).  Otherwise the hangs
  //  get full 32-bit words and the flags share a separate word.

#if AS_MAX_READLEN_BITS < 24
  uint64 evalue : AS_MAX_EVALUE_BITS;        //  12
  int64 a_hang : AS_MAX_READLEN_BITS+1;      //  21+1
  int64 b_hang : AS_MAX_READLEN_BITS+1;      //  21+1
  uint64 flipped : 1;                        //   1

  uint64 filtered : 1;                       //   1
  uint64 symmetric : 1;                      //   1  - twin overlap exists

  uint32 a_iid;
  uint32 b_iid;

#if (AS_MAX_EVALUE_BITS + (AS_MAX_READLEN_BITS + 1) + (AS_MAX_READLEN_BITS + 1) + 1 + 1 + 1 > 64)
#error not enough bits to store overlaps.  decrease AS_MAX_EVALUE_BITS or AS_MAX_READLEN_BITS.
#endif

#else
  int32 a_hang;
  int32 b_hang;

  uint32 evalue : AS_MAX_EVALUE_BITS;        //  12
  uint32 flipped : 1;                        //   1
  uint32 filtered : 1;                       //   1
  uint32 symmetric : 1;                      //   1  - twin overlap exists

  uint32 a_iid;
  uint32 b_iid;
#endif

};



145 inline
146 bool
BAToverlap_sortByEvalue(BAToverlap const & a,BAToverlap const & b)147 BAToverlap_sortByEvalue(BAToverlap const &a, BAToverlap const &b) {
148 return(a.evalue > b.evalue);
149 }



153 class OverlapStorage {
154 public:
OverlapStorage(uint64 nOvl)155 OverlapStorage(uint64 nOvl) {
156 _osAllocLen = 1024 * 1024 * 1024 / sizeof(BAToverlap); // 1GB worth of overlaps
157 _osLen = 0; // osMax is cheap and we overallocate it.
158 _osPos = 0; // If allocLen is small, we can end up with
159 _osMax = 2 * nOvl / _osAllocLen + 2; // more blocks than expected, when overlaps
160 _os = new BAToverlap * [_osMax]; // don't fit in the remaining space.
161
162 memset(_os, 0, sizeof(BAToverlap *) * _osMax);
163
164 _os[0] = new BAToverlap [_osAllocLen]; // Alloc first block, keeps getOverlapStorage() simple
165 };
166
OverlapStorage(OverlapStorage * original)167 OverlapStorage(OverlapStorage *original) {
168 _osAllocLen = original->_osAllocLen;
169 _osLen = 0;
170 _osPos = 0;
171 _osMax = original->_osMax;
172 _os = NULL;
173 };
174
~OverlapStorage()175 ~OverlapStorage() {
176 if (_os == NULL)
177 return;
178
179 for (uint32 ii=0; ii<_osMax; ii++)
180 delete [] _os[ii];
181 delete [] _os;
182 }
183
184
reset(void)185 OverlapStorage *reset(void) {
186 _osLen = 0;
187 _osPos = 0;
188
189 return(this);
190 };
191
192
get(void)193 BAToverlap *get(void) {
194 if (_os == NULL)
195 return(NULL);
196 return(_os[_osLen] + _osPos);
197 };
198
199
get(uint32 nOlaps)200 BAToverlap *get(uint32 nOlaps) {
201 if (_osPos + nOlaps > _osAllocLen) { // If we don't fit in the current allocation,
202 _osPos = 0; // move to the next one.
203 _osLen++;
204 }
205
206 _osPos += nOlaps; // Reserve space for these overlaps.
207
208 assert(_osLen < _osMax);
209
210 if (_os == NULL) // If we're not allowed to allocate,
211 return(NULL); // return nothing.
212
213 if (_os[_osLen] == NULL) // Otherwise, make sure we have space and return
214 _os[_osLen] = new BAToverlap [_osAllocLen]; // that space.
215
216 return(_os[_osLen] + _osPos - nOlaps);
217 };
218
219
advance(OverlapStorage * that)220 void advance(OverlapStorage *that) {
221 if (((that->_osLen < _osLen)) || // That segment before mine, or
222 ((that->_osLen == _osLen) && (that->_osPos <= _osPos))) // that segment equal and position before mine
223 return; // So no need to modify
224
225 _osLen = that->_osLen;
226 _osPos = that->_osPos;
227 };
228
229
230 private:
231 uint32 _osAllocLen; // Size of each allocation
232 uint32 _osLen; // Current allocation being used
233 uint32 _osPos; // Position in current allocation; next free overlap
234 uint32 _osMax; // Number of allocations we can make
235 BAToverlap **_os; // Allocations
236 };



//  Loads overlaps from an overlap store into memory, filtered by error
//  rate and length, within a memory budget.  Methods are defined in the
//  corresponding .C file; only getOverlaps() is inline here.
class OverlapCache {
public:
  OverlapCache(const char *ovlStorePath,
               const char *prefix,
               double maxErate,
               uint32 minOverlap,
               uint64 maxMemory,
               uint64 genomeSize,
               bool symmetrize=true);
  ~OverlapCache();

  bool compareOverlaps(const BAToverlap &a, const BAToverlap &b) const;  //  we can almost do templated but the fields are functions in one and just members in the other

private:
  bool compareOverlaps(const ovOverlap &a, const ovOverlap &b) const;

  uint32 filterOverlaps(uint32 aid, uint32 maxOVSerate, uint32 minOverlap, uint32 no);
  uint32 filterDuplicates(uint32 &no);

  void computeOverlapLimit(ovStore *ovlStore, uint64 genomeSize);
  void loadOverlaps(ovStore *ovlStore);
  void symmetrizeOverlaps(void);

public:
  //  Return a pointer to the cached overlaps for read 'readIID'; the
  //  number of overlaps is returned through 'numOverlaps'.
  BAToverlap *getOverlaps(uint32 readIID, uint32 &numOverlaps) {
    numOverlaps = _overlapLen[readIID];
    return(_overlaps[readIID]);
  }

private:
  const char       *_prefix;

  uint64            _memLimit;      //  Expected max size of bogart
  uint64            _memReserved;   //  Memory to reserve for processing
  uint64            _memAvail;      //  Memory available for storing overlaps
  uint64            _memStore;      //  Memory used to support overlaps
  uint64            _memOlaps;      //  Memory used to store overlaps

  uint32           *_overlapLen;    //  Per-read count of cached overlaps
  uint32           *_overlapMax;    //  Per-read capacity
  BAToverlap      **_overlaps;      //  Per-read pointer into OverlapStorage

  //  Instead of allocating space for overlaps per read (which has some visible but unknown size
  //  cost with each allocation), or in a single massive allocation (which we can't resize), we
  //  allocate overlaps in large blocks then set pointers into each block where overlaps for each
  //  read start.  This is managed by OverlapStorage.

  OverlapStorage   *_overlapStorage;

  uint32            _maxEvalue;     //  Don't load overlaps with high error
  uint32            _minOverlap;    //  Don't load overlaps that are short

  uint32            _minPer;        //  Minimum number of overlaps to retain for a single read
  uint32            _maxPer;        //  Maximum number of overlaps to load for a single read

  uint64           *_minSco;        //  The minimum score accepted for each read

  uint32            _ovsMax;        //  For loading overlaps
  ovOverlap        *_ovs;           //
  uint64           *_ovsSco;        //  For scoring overlaps during the load
  uint64           *_ovsTmp;        //  For picking out a score threshold

  uint64            _genomeSize;
};



extern OverlapCache *OC;

#endif  //  INCLUDE_AS_BAT_OVERLAPCACHE