1 /*****************************************************************************
2 chromsweep.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "chromsweep.h"
14 #include <queue>
15
16 bool after(const BED &a, const BED &b);
17
18 /*
19 // constructor using existing BedFile pointers
20 */
ChromSweep(BedFile * query,BedFile * db,bool sameStrand,bool diffStrand,float overlapFraction,bool reciprocal,bool useMergedIntervals,bool printHeader)21 ChromSweep::ChromSweep(BedFile *query, BedFile *db,
22 bool sameStrand, bool diffStrand,
23 float overlapFraction, bool reciprocal,
24 bool useMergedIntervals, bool printHeader)
25
26
27 : _query(query)
28 , _db(db)
29 , _overlapFraction(overlapFraction)
30 , _sameStrand(sameStrand)
31 , _diffStrand(diffStrand)
32 , _reciprocal(reciprocal)
33 , _useMergedIntervals(useMergedIntervals)
34 {
35 _hits.reserve(100000);
36
37 _query->Open();
38 if (printHeader) _query->PrintHeader();
39 _db->Open();
40
41 NextQuery();
42 NextDatabase();
43 }
44
45 /*
46 Constructor with filenames
47 */
ChromSweep(string & queryFile,string & dbFile)48 ChromSweep::ChromSweep(string &queryFile, string &dbFile)
49 {
50 _hits.reserve(100000);
51
52 _query = new BedFile(queryFile);
53 _db = new BedFile(dbFile);
54
55 _query->Open();
56 _db->Open();
57
58 NextQuery();
59 NextDatabase();
60 }
61
62
NextQuery()63 bool ChromSweep::NextQuery() {
64 if (!_useMergedIntervals)
65 return _query->GetNextBed(_curr_qy, true);
66 else
67 return _query->GetNextMergedBed(_curr_qy);
68 }
69
NextDatabase()70 bool ChromSweep::NextDatabase() {
71 if (!_useMergedIntervals)
72 return _db->GetNextBed(_curr_db, true);
73 else
74 return _db->GetNextMergedBed(_curr_db);
75 }
76
77 /*
78 Destructor
79 */
~ChromSweep(void)80 ChromSweep::~ChromSweep(void) {
81 }
82
83
ScanCache()84 void ChromSweep::ScanCache() {
85 list<BED>::iterator c = _cache.begin();
86 while (c != _cache.end())
87 {
88 if ((_curr_qy.chrom == c->chrom) && !(after(_curr_qy, *c))) {
89 if (IsValidHit(_curr_qy, *c)) {
90 _hits.push_back(*c);
91 }
92 ++c;
93 }
94 else {
95 c = _cache.erase(c);
96 }
97 }
98 }
99
100
ChromChange()101 bool ChromSweep::ChromChange()
102 {
103 // the files are on the same chrom
104 if (_curr_qy.chrom == _curr_db.chrom) {
105 return false;
106 }
107 // the query is ahead of the database.
108 // fast-forward the database to catch-up.
109 else if ((_curr_qy.chrom > _curr_db.chrom) && (!_db->Empty())) {
110
111 while (NextDatabase() &&
112 _curr_db.chrom < _curr_qy.chrom)
113 {
114 }
115 _cache.clear();
116 return false;
117 }
118 // the database is ahead of the query.
119 else {
120 // 1. scan the cache for remaining hits on the query's current chrom.
121 if (_curr_qy.chrom == _curr_chrom)
122 {
123 ScanCache();
124 _results.push(make_pair(_curr_qy, _hits));
125 _hits.clear();
126 }
127 // 2. fast-forward until we catch up and report 0 hits until we do.
128 else if (_curr_qy.chrom < _curr_db.chrom)
129 {
130 _results.push(make_pair(_curr_qy, _no_hits));
131 _cache.clear();
132 }
133 NextQuery();
134 _curr_chrom = _curr_qy.chrom;
135 return true;
136 }
137 }
138
139
IsValidHit(const BED & query,const BED & db)140 bool ChromSweep::IsValidHit(const BED &query, const BED &db) {
141 CHRPOS aLength = query.end - query.start;
142 CHRPOS s = max(query.start, db.start);
143 CHRPOS e = min(query.end, db.end);
144 CHRPOS overlapBases = (e - s);
145 // 1. is there sufficient overlap w.r.t A?
146 if ( (float) overlapBases / (float) aLength >= _overlapFraction) {
147 CHRPOS bLength = (db.end - db.start);
148 float bOverlap = ( (float) overlapBases / (float) bLength );
149
150 // Now test for necessary strandedness.
151 bool strands_are_same = (query.strand == db.strand);
152 if ( (_sameStrand == false && _diffStrand == false)
153 ||
154 (_sameStrand == true && strands_are_same == true)
155 ||
156 (_diffStrand == true && strands_are_same == false)
157 )
158 {
159 // 3. did the user request reciprocal overlap
160 // (i.e. sufficient overlap w.r.t. both A and B?)
161 if (!_reciprocal)
162 return true;
163 else if (bOverlap >= _overlapFraction)
164 return true;
165 }
166 }
167 return false;
168 }
169
170
Next(pair<BED,vector<BED>> & next)171 bool ChromSweep::Next(pair<BED, vector<BED> > &next) {
172 if (!_query->Empty()) {
173 // have we changed chromosomes?
174 if (ChromChange() == false) {
175 // scan the database cache for hits
176 ScanCache();
177 // advance the db until we are ahead of the query.
178 // update hits and cache as necessary
179 while (!_db->Empty() &&
180 _curr_qy.chrom == _curr_db.chrom &&
181 !(after(_curr_db, _curr_qy)))
182 {
183 if (IsValidHit(_curr_qy, _curr_db)) {
184 _hits.push_back(_curr_db);
185 }
186 _cache.push_back(_curr_db);
187 NextDatabase();
188 }
189 // add the hits for this query to the pump
190 _results.push(make_pair(_curr_qy, _hits));
191 // reset for the next query
192 _hits.clear();
193 NextQuery();
194 _curr_chrom = _curr_qy.chrom;
195 }
196 }
197 // report the next set if hits if there are still overlaps in the pump
198 if (!_results.empty()) {
199 next = _results.front();
200 _results.pop();
201 return true;
202 }
203 else {
204 return false;
205 }
206 }
207
208