1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #ifndef INCLUDE_AS_BAT_OVERLAPCACHE
19 #define INCLUDE_AS_BAT_OVERLAPCACHE
20 
21 #include "runtime.H"
22 #include "ovStore.H"
23 #include "sqStore.H"
24 
25 //  CA8 used to re-encode the error rate into a smaller-precision number.  This was
26 //  confusing and broken (it tried to use a log-based encoding to give more precision
27 //  to the smaller values).  CA3g gives up and uses all 12 bits of precision.
28 
29 //  If not enough space for the minimum number of error bits, bump up to a 64-bit word for overlap
30 //  storage.
31 
32 //  For storing overlaps in memory.  12 bytes per overlap.
33 class BAToverlap {
34 public:
BAToverlap()35   BAToverlap() {
36     evalue    = 0;
37     a_hang    = 0;
38     b_hang    = 0;
39     flipped   = false;
40 
41     filtered  = false;
42     symmetric = false;
43 
44     a_iid     = 0;
45     b_iid     = 0;
46   };
~BAToverlap()47   ~BAToverlap() {};
48 
49   //  Nasty bit of code duplication.
50 
51   bool
isDovetail(void)52   isDovetail(void) const {
53     return(((a_hang < 0) && (b_hang < 0)) ||
54            ((a_hang > 0) && (b_hang > 0)));
55   };
56 
57   bool
AEndIs5prime(void)58   AEndIs5prime(void) const {                   //  -------->
59     return((a_hang < 0) && (b_hang < 0));      //        -------
60   };
61 
62   bool
AEndIs3prime(void)63   AEndIs3prime(void) const {                   //     -------->
64     return((a_hang > 0) && (b_hang > 0));      //  -------
65   };
66 
67   bool
AisContainer(void)68   AisContainer(void) const {                   //  -------->
69     return((a_hang >= 0) && (b_hang <= 0));    //    ----
70   };
71 
72   bool
AisContained(void)73   AisContained(void) const {                   //    --->
74     return((a_hang <= 0) && (b_hang >= 0));    //  ---------
75   };
76 
77   bool
BEndIs3prime(void)78   BEndIs3prime(void) const {
79     assert(AisContainer() == false);  //  Function is not defined
80     assert(AisContained() == false);  //    for containments.
81     return((AEndIs5prime() && (flipped == false)) ||   // <===     ------>
82            (AEndIs3prime() && (flipped == true)));     //        ---->
83   };
84 
85   bool
BEndIs5prime(void)86   BEndIs5prime(void) const {
87     assert(AisContainer() == false);  //  Function is not defined
88     assert(AisContained() == false);  //    for containments.
89     return((AEndIs5prime() && (flipped == true)) ||   //          ------>
90            (AEndIs3prime() && (flipped == false)));   // <===          ---->
91   };
92 
93   double
erate(void)94   erate(void) const {
95     return(AS_OVS_decodeEvalue(evalue));
96   }
97 
98   void
convert(ovOverlap & olap)99   convert(ovOverlap &olap) {
100     olap.clear();
101 
102     olap.a_iid = a_iid;
103     olap.b_iid = b_iid;
104 
105     olap.evalue(evalue);
106 
107     olap.flipped(flipped);
108     olap.a_hang(a_hang);
109     olap.b_hang(b_hang);
110   }
111 
112 #if AS_MAX_READLEN_BITS < 24
113   uint64      evalue    : AS_MAX_EVALUE_BITS;     //  12
114   int64       a_hang    : AS_MAX_READLEN_BITS+1;  //  21+1
115   int64       b_hang    : AS_MAX_READLEN_BITS+1;  //  21+1
116   uint64      flipped   : 1;                      //   1
117 
118   uint64      filtered  : 1;                      //   1
119   uint64      symmetric : 1;                      //   1    - twin overlap exists
120 
121   uint32      a_iid;
122   uint32      b_iid;
123 
124 #if (AS_MAX_EVALUE_BITS + (AS_MAX_READLEN_BITS + 1) + (AS_MAX_READLEN_BITS + 1) + 1 + 1 + 1 > 64)
125 #error not enough bits to store overlaps.  decrease AS_MAX_EVALUE_BITS or AS_MAX_READLEN_BITS.
126 #endif
127 
128 #else
129   int32       a_hang;
130   int32       b_hang;
131 
132   uint32      evalue    : AS_MAX_EVALUE_BITS;     //  12
133   uint32      flipped   : 1;                      //   1
134   uint32      filtered  : 1;                      //   1
135   uint32      symmetric : 1;                      //   1    - twin overlap exists
136 
137   uint32      a_iid;
138   uint32      b_iid;
139 #endif
140 
141 };
142 
143 
144 
145 inline
146 bool
BAToverlap_sortByEvalue(BAToverlap const & a,BAToverlap const & b)147 BAToverlap_sortByEvalue(BAToverlap const &a, BAToverlap const &b) {
148   return(a.evalue > b.evalue);
149 }
150 
151 
152 
153 class OverlapStorage {
154 public:
OverlapStorage(uint64 nOvl)155   OverlapStorage(uint64 nOvl) {
156     _osAllocLen = 1024 * 1024 * 1024 / sizeof(BAToverlap);  //  1GB worth of overlaps
157     _osLen      = 0;                            //  osMax is cheap and we overallocate it.
158     _osPos      = 0;                            //  If allocLen is small, we can end up with
159     _osMax      = 2 * nOvl / _osAllocLen + 2;   //  more blocks than expected, when overlaps
160     _os         = new BAToverlap * [_osMax];    //  don't fit in the remaining space.
161 
162     memset(_os, 0, sizeof(BAToverlap *) * _osMax);
163 
164     _os[0]      = new BAToverlap [_osAllocLen];   //  Alloc first block, keeps getOverlapStorage() simple
165   };
166 
OverlapStorage(OverlapStorage * original)167   OverlapStorage(OverlapStorage *original) {
168     _osAllocLen = original->_osAllocLen;
169     _osLen      = 0;
170     _osPos      = 0;
171     _osMax      = original->_osMax;
172     _os         = NULL;
173   };
174 
~OverlapStorage()175   ~OverlapStorage() {
176     if (_os == NULL)
177       return;
178 
179     for (uint32 ii=0; ii<_osMax; ii++)
180       delete [] _os[ii];
181     delete [] _os;
182   }
183 
184 
reset(void)185   OverlapStorage *reset(void) {
186     _osLen = 0;
187     _osPos = 0;
188 
189     return(this);
190   };
191 
192 
get(void)193   BAToverlap   *get(void) {
194     if (_os == NULL)
195       return(NULL);
196     return(_os[_osLen] + _osPos);
197   };
198 
199 
get(uint32 nOlaps)200   BAToverlap   *get(uint32 nOlaps) {
201     if (_osPos + nOlaps > _osAllocLen) {           //  If we don't fit in the current allocation,
202       _osPos = 0;                                  //  move to the next one.
203       _osLen++;
204     }
205 
206     _osPos += nOlaps;                              //  Reserve space for these overlaps.
207 
208     assert(_osLen < _osMax);
209 
210     if (_os == NULL)                               //  If we're not allowed to allocate,
211       return(NULL);                                //  return nothing.
212 
213     if (_os[_osLen] == NULL)                       //  Otherwise, make sure we have space and return
214       _os[_osLen] = new BAToverlap [_osAllocLen];  //  that space.
215 
216     return(_os[_osLen] + _osPos - nOlaps);
217   };
218 
219 
advance(OverlapStorage * that)220   void          advance(OverlapStorage *that) {
221     if (((that->_osLen <  _osLen)) ||                            //  That segment before mine, or
222         ((that->_osLen == _osLen) && (that->_osPos <= _osPos)))  //  that segment equal and position before mine
223       return;                                                    //  So no need to modify
224 
225     _osLen = that->_osLen;
226     _osPos = that->_osPos;
227   };
228 
229 
230 private:
231   uint32                  _osAllocLen;   //  Size of each allocation
232   uint32                  _osLen;        //  Current allocation being used
233   uint32                  _osPos;        //  Position in current allocation; next free overlap
234   uint32                  _osMax;        //  Number of allocations we can make
235   BAToverlap            **_os;           //  Allocations
236 };
237 
238 
239 
240 class OverlapCache {
241 public:
242   OverlapCache(const char *ovlStorePath,
243                const char *prefix,
244                double maxErate,
245                uint32 minOverlap,
246                uint64 maxMemory,
247                uint64 genomeSize,
248                bool symmetrize=true);
249   ~OverlapCache();
250 
251   bool         compareOverlaps(const BAToverlap &a, const BAToverlap &b) const; // we can almost do templated but the fields are functions in one and just members in the other
252 
253 private:
254   bool         compareOverlaps(const ovOverlap &a,  const ovOverlap &b) const;
255 
256   uint32       filterOverlaps(uint32 aid, uint32 maxOVSerate, uint32 minOverlap, uint32 no);
257   uint32       filterDuplicates(uint32 &no);
258 
259   void         computeOverlapLimit(ovStore *ovlStore, uint64 genomeSize);
260   void         loadOverlaps(ovStore *ovlStore);
261   void         symmetrizeOverlaps(void);
262 
263 public:
getOverlaps(uint32 readIID,uint32 & numOverlaps)264   BAToverlap  *getOverlaps(uint32 readIID, uint32 &numOverlaps) {
265     numOverlaps = _overlapLen[readIID];
266     return(_overlaps[readIID]);
267   }
268 
269 private:
270   const char             *_prefix;
271 
272   uint64                  _memLimit;       //  Expected max size of bogart
273   uint64                  _memReserved;    //  Memory to reserve for processing
274   uint64                  _memAvail;       //  Memory available for storing overlaps
275   uint64                  _memStore;       //  Memory used to support overlaps
276   uint64                  _memOlaps;       //  Memory used to store overlaps
277 
278   uint32                 *_overlapLen;
279   uint32                 *_overlapMax;
280   BAToverlap            **_overlaps;
281 
282   //  Instead of allocating space for overlaps per read (which has some visible but unknown size
283   //  cost with each allocation), or in a single massive allocation (which we can't resize), we
284   //  allocate overlaps in large blocks then set pointers into each block where overlaps for each
285   //  read start.  This is managed by OverlapStorage.
286 
287   OverlapStorage         *_overlapStorage;
288 
289   uint32                  _maxEvalue;  //  Don't load overlaps with high error
290   uint32                  _minOverlap; //  Don't load overlaps that are short
291 
292   uint32                  _minPer;     //  Minimum number of overlaps to retain for a single read
293   uint32                  _maxPer;     //  Maximum number of overlaps to load for a single read
294 
295   uint64                 *_minSco;     //  The minimum score accepted for each read
296 
297   uint32                  _ovsMax;     //  For loading overlaps
298   ovOverlap              *_ovs;        //
299   uint64                 *_ovsSco;     //  For scoring overlaps during the load
300   uint64                 *_ovsTmp;     //  For picking out a score threshold
301 
302   uint64                  _genomeSize;
303 };
304 
305 
306 
307 extern OverlapCache     *OC;
308 
309 #endif  //  INCLUDE_AS_BAT_OVERLAPCACHE
310