1 /*****************************************************************************
2   chromsweep.cpp
3 
4   (c) 2009 - Aaron Quinlan
5   Hall Laboratory
6   Department of Biochemistry and Molecular Genetics
7   University of Virginia
8   aaronquinlan@gmail.com
9 
10   Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "chromsweep.h"
14 #include <queue>
15 
16 bool after(const BED &a, const BED &b);
17 
18 /*
19     // constructor using existing BedFile pointers
20 */
ChromSweep(BedFile * query,BedFile * db,bool sameStrand,bool diffStrand,float overlapFraction,bool reciprocal,bool useMergedIntervals,bool printHeader)21 ChromSweep::ChromSweep(BedFile *query, BedFile *db,
22                        bool sameStrand, bool diffStrand,
23                        float overlapFraction, bool reciprocal,
24                        bool useMergedIntervals, bool printHeader)
25 
26 
27 : _query(query)
28 , _db(db)
29 , _overlapFraction(overlapFraction)
30 , _sameStrand(sameStrand)
31 , _diffStrand(diffStrand)
32 , _reciprocal(reciprocal)
33 , _useMergedIntervals(useMergedIntervals)
34 {
35     _hits.reserve(100000);
36 
37     _query->Open();
38     if (printHeader) _query->PrintHeader();
39     _db->Open();
40 
41     NextQuery();
42     NextDatabase();
43 }
44 
45 /*
46     Constructor with filenames
47 */
ChromSweep(string & queryFile,string & dbFile)48 ChromSweep::ChromSweep(string &queryFile, string &dbFile)
49 {
50     _hits.reserve(100000);
51 
52     _query = new BedFile(queryFile);
53     _db = new BedFile(dbFile);
54 
55     _query->Open();
56     _db->Open();
57 
58     NextQuery();
59     NextDatabase();
60 }
61 
62 
NextQuery()63 bool ChromSweep::NextQuery() {
64     if (!_useMergedIntervals)
65         return _query->GetNextBed(_curr_qy, true);
66     else
67         return _query->GetNextMergedBed(_curr_qy);
68 }
69 
NextDatabase()70 bool ChromSweep::NextDatabase() {
71     if (!_useMergedIntervals)
72         return _db->GetNextBed(_curr_db, true);
73     else
74         return _db->GetNextMergedBed(_curr_db);
75 }
76 
77 /*
78     Destructor
79 */
~ChromSweep(void)80 ChromSweep::~ChromSweep(void) {
81 }
82 
83 
ScanCache()84 void ChromSweep::ScanCache() {
85     list<BED>::iterator c = _cache.begin();
86     while (c != _cache.end())
87     {
88         if ((_curr_qy.chrom == c->chrom) && !(after(_curr_qy, *c))) {
89             if (IsValidHit(_curr_qy, *c)) {
90                 _hits.push_back(*c);
91             }
92             ++c;
93         }
94         else {
95             c = _cache.erase(c);
96         }
97     }
98 }
99 
100 
ChromChange()101 bool ChromSweep::ChromChange()
102 {
103     // the files are on the same chrom
104     if (_curr_qy.chrom == _curr_db.chrom) {
105         return false;
106     }
107     // the query is ahead of the database.
108 	// fast-forward the database to catch-up.
109     else if ((_curr_qy.chrom > _curr_db.chrom) && (!_db->Empty())) {
110 
111         while (NextDatabase() &&
112                _curr_db.chrom < _curr_qy.chrom)
113         {
114         }
115         _cache.clear();
116         return false;
117     }
118     // the database is ahead of the query.
119     else {
120         // 1. scan the cache for remaining hits on the query's current chrom.
121         if (_curr_qy.chrom == _curr_chrom)
122         {
123             ScanCache();
124             _results.push(make_pair(_curr_qy, _hits));
125             _hits.clear();
126         }
127         // 2. fast-forward until we catch up and report 0 hits until we do.
128         else if (_curr_qy.chrom < _curr_db.chrom)
129         {
130             _results.push(make_pair(_curr_qy, _no_hits));
131             _cache.clear();
132         }
133         NextQuery();
134         _curr_chrom = _curr_qy.chrom;
135         return true;
136     }
137 }
138 
139 
IsValidHit(const BED & query,const BED & db)140 bool ChromSweep::IsValidHit(const BED &query, const BED &db) {
141     CHRPOS aLength = query.end - query.start;
142     CHRPOS s = max(query.start, db.start);
143     CHRPOS e = min(query.end, db.end);
144     CHRPOS overlapBases = (e - s);
145     // 1. is there sufficient overlap w.r.t A?
146     if ( (float) overlapBases / (float) aLength  >= _overlapFraction) {
147         CHRPOS bLength = (db.end - db.start);
148         float bOverlap = ( (float) overlapBases / (float) bLength );
149 
150         // Now test for necessary strandedness.
151         bool strands_are_same = (query.strand == db.strand);
152         if ( (_sameStrand == false && _diffStrand == false)
153              ||
154              (_sameStrand == true && strands_are_same == true)
155              ||
156              (_diffStrand == true && strands_are_same == false)
157            )
158         {
159             // 3. did the user request reciprocal overlap
160             // (i.e. sufficient overlap w.r.t. both A and B?)
161             if (!_reciprocal)
162                 return true;
163             else if (bOverlap >= _overlapFraction)
164                 return true;
165         }
166     }
167     return false;
168 }
169 
170 
Next(pair<BED,vector<BED>> & next)171 bool ChromSweep::Next(pair<BED, vector<BED> > &next) {
172     if (!_query->Empty()) {
173         // have we changed chromosomes?
174         if (ChromChange() == false) {
175             // scan the database cache for hits
176             ScanCache();
177             // advance the db until we are ahead of the query.
178             // update hits and cache as necessary
179             while (!_db->Empty() &&
180                    _curr_qy.chrom == _curr_db.chrom &&
181                    !(after(_curr_db, _curr_qy)))
182             {
183                 if (IsValidHit(_curr_qy, _curr_db)) {
184                     _hits.push_back(_curr_db);
185                 }
186                 _cache.push_back(_curr_db);
187                 NextDatabase();
188             }
189             // add the hits for this query to the pump
190             _results.push(make_pair(_curr_qy, _hits));
191             // reset for the next query
192             _hits.clear();
193             NextQuery();
194             _curr_chrom = _curr_qy.chrom;
195         }
196     }
197     // report the next set if hits if there are still overlaps in the pump
198     if (!_results.empty()) {
199         next = _results.front();
200         _results.pop();
201         return true;
202     }
203     else {
204         return false;
205     }
206 }
207 
208