1 /*****************************************************************************
2 sortBed.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include <set>
13 #include "lineFileUtilities.h"
14 #include "sortBed.h"
15
16 //
17 // Constructor
18 //
BedSort(string & bedFile,bool printHeader,string & faidxFile)19 BedSort::BedSort(string &bedFile, bool printHeader,string &faidxFile):_faidxFile(faidxFile) {
20 _bedFile = bedFile;
21 _bed = new BedFile(bedFile);
22
23 _bed->loadBedFileIntoMapNoBin();
24 // report the header first if asked.
25 if (printHeader == true) {
26 _bed->PrintHeader();
27 }
28 }
29
30 //
31 // Destructor
32 //
~BedSort(void)33 BedSort::~BedSort(void) {
34 }
35
36
SortBed()37 void BedSort::SortBed() {
38
39 // loop through each chromosome and merge their BED entries
40 for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
41
42 // bedList is already sorted by start position.
43 const vector<BED> &bedList = m->second;
44
45 for (unsigned int i = 0; i < bedList.size(); ++i) {
46 _bed->reportBedNewLine(bedList[i]);
47 }
48 }
49 }
50
SortBedOnFaidx()51 void BedSort::SortBedOnFaidx()
52 {
53 set<string> all_chromosomes;
54 if(_faidxFile.empty())
55 {
56 cerr << "[sortBed] File for fasta index undefined." << endl;
57 exit(EXIT_FAILURE);
58 }
59 /* read FAIDX file */
60 ifstream faidx(_faidxFile.c_str(),ios::in);
61 if(!faidx.is_open()) {
62 cerr << "Cannot open \""<< _faidxFile << "\""
63 << strerror(errno)
64 << endl;
65 exit(EXIT_FAILURE);
66 }
67 string line;
68 while(getline(faidx,line,'\n')) {
69 if(line.empty()) continue;
70 string::size_type tab= line.find('\t');
71 if(tab!=string::npos) {
72 line.erase(tab);
73 }
74 if(all_chromosomes.find(line) != all_chromosomes.end()) {
75 cerr << "Chromosome \"" << line
76 <<"\" defined twice in "
77 << _faidxFile
78 << endl;
79 exit(EXIT_FAILURE);
80 }
81 _tid2chrom[_tid2chrom.size()]=line;
82 all_chromosomes.insert(line);
83 }
84 faidx.close();
85 /** end read FAIDX */
86
87 //check BED chromosomes
88 for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
89 if( all_chromosomes.find(m->first) == all_chromosomes.end()) {
90 cerr << "Chromosome \"" << m->first
91 <<"\" undefined in "
92 << _faidxFile
93 << endl;
94 exit(EXIT_FAILURE);
95 }
96 }
97
98
99 //loop over each chromosome using the faidx index
100 for(size_t tid=0; tid <_tid2chrom.size(); ++tid)
101 {
102 string chrom = _tid2chrom[tid];
103 masterBedMapNoBin::iterator m = _bed->bedMapNoBin.find(chrom);
104
105 if( m == _bed->bedMapNoBin.end() ) continue; //this chromosome is not present in BED
106
107 // bedList is already sorted by start position.
108 const vector<BED> &bedList = m->second;
109
110 for (unsigned int i = 0; i < bedList.size(); ++i) {
111 _bed->reportBedNewLine(bedList[i]);
112 }
113 }
114
115 }
116
117
SortBedBySizeAsc()118 void BedSort::SortBedBySizeAsc() {
119
120 vector<BED> masterList;
121 masterList.reserve(1000000);
122
123 // loop through each chromosome and merge their BED entries
124 for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
125
126 // bedList is already sorted by start position.
127 const vector<BED> &bedList = m->second;
128
129 // add the entries from this chromosome to the current list
130 for (unsigned int i = 0; i < bedList.size(); ++i) {
131 masterList.push_back(bedList[i]);
132 }
133 }
134
135 // sort the master list by size (asc.)
136 sort(masterList.begin(), masterList.end(), sortBySizeAsc);
137
138 // report the entries in ascending order
139 for (unsigned int i = 0; i < masterList.size(); ++i) {
140 _bed->reportBedNewLine(masterList[i]);
141 }
142 }
143
144
SortBedBySizeDesc()145 void BedSort::SortBedBySizeDesc() {
146
147 vector<BED> masterList;
148 masterList.reserve(1000000);
149
150 // loop through each chromosome and merge their BED entries
151 for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
152
153 // bedList is already sorted by start position.
154 const vector<BED> &bedList = m->second;
155
156 // add the entries from this chromosome to the current list
157 for (unsigned int i = 0; i < bedList.size(); ++i) {
158 masterList.push_back(bedList[i]);
159 }
160 }
161
162 // sort the master list by size (asc.)
163 sort(masterList.begin(), masterList.end(), sortBySizeDesc);
164
165 // report the entries in ascending order
166 for (unsigned int i = 0; i < masterList.size(); ++i) {
167 _bed->reportBedNewLine(masterList[i]);
168 }
169 }
170
SortBedByChromThenSizeAsc()171 void BedSort::SortBedByChromThenSizeAsc() {
172
173 // loop through each chromosome and merge their BED entries
174 for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
175
176 // bedList is already sorted by start position.
177 vector<BED> &bedList = m->second;
178 sort(bedList.begin(), bedList.end(), sortBySizeAsc);
179
180 for (unsigned int i = 0; i < bedList.size(); ++i) {
181 _bed->reportBedNewLine(bedList[i]);
182 }
183 }
184 }
185
186
SortBedByChromThenSizeDesc()187 void BedSort::SortBedByChromThenSizeDesc() {
188
189 // loop through each chromosome and merge their BED entries
190 for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
191
192 // bedList is already sorted by start position.
193 vector<BED> &bedList = m->second;
194
195 sort(bedList.begin(), bedList.end(), sortBySizeDesc);
196
197 for (unsigned int i = 0; i < bedList.size(); ++i) {
198 _bed->reportBedNewLine(bedList[i]);
199 }
200 }
201 }
202
203
SortBedByChromThenScoreAsc()204 void BedSort::SortBedByChromThenScoreAsc() {
205
206 if (_bed->bedType >= 5) {
207 // loop through each chromosome and merge their BED entries
208 for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
209
210 // bedList is already sorted by start position.
211 vector<BED> &bedList = m->second;
212 sort(bedList.begin(), bedList.end(), sortByScoreAsc);
213
214 for (unsigned int i = 0; i < bedList.size(); ++i) {
215 _bed->reportBedNewLine(bedList[i]);
216 }
217 }
218 }
219 else {
220 cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl;
221 exit(1);
222 }
223 }
224
225
SortBedByChromThenScoreDesc()226 void BedSort::SortBedByChromThenScoreDesc() {
227
228 if (_bed->bedType >= 5) {
229 // loop through each chromosome and merge their BED entries
230 for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
231
232 // bedList is already sorted by start position.
233 vector<BED> &bedList = m->second;
234 sort(bedList.begin(), bedList.end(), sortByScoreDesc);
235
236 for (unsigned int i = 0; i < bedList.size(); ++i) {
237 _bed->reportBedNewLine(bedList[i]);
238 }
239 }
240 }
241 else {
242 cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl;
243 exit(1);
244 }
245 }
246
247