1 /*****************************************************************************
2   sortBed.cpp
3 
4   (c) 2009 - Aaron Quinlan
5   Hall Laboratory
6   Department of Biochemistry and Molecular Genetics
7   University of Virginia
8   aaronquinlan@gmail.com
9 
10   Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include <set>
13 #include "lineFileUtilities.h"
14 #include "sortBed.h"
15 
16 //
17 // Constructor
18 //
BedSort(string & bedFile,bool printHeader,string & faidxFile)19 BedSort::BedSort(string &bedFile, bool printHeader,string &faidxFile):_faidxFile(faidxFile) {
20     _bedFile = bedFile;
21     _bed = new BedFile(bedFile);
22 
23     _bed->loadBedFileIntoMapNoBin();
24     // report the header first if asked.
25     if (printHeader == true) {
26         _bed->PrintHeader();
27     }
28 }
29 
30 //
31 // Destructor
32 //
~BedSort(void)33 BedSort::~BedSort(void) {
34 }
35 
36 
SortBed()37 void BedSort::SortBed() {
38 
39     // loop through each chromosome and merge their BED entries
40     for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
41 
42         // bedList is already sorted by start position.
43         const vector<BED> &bedList = m->second;
44 
45         for (unsigned int i = 0; i < bedList.size(); ++i) {
46             _bed->reportBedNewLine(bedList[i]);
47         }
48     }
49 }
50 
SortBedOnFaidx()51 void BedSort::SortBedOnFaidx()
52 		{
53 		set<string> all_chromosomes;
54 		if(_faidxFile.empty())
55 				{
56 				cerr << "[sortBed] File for fasta index undefined." << endl;
57 				exit(EXIT_FAILURE);
58 				}
59 		/* read FAIDX file */
60 		ifstream faidx(_faidxFile.c_str(),ios::in);
61 		if(!faidx.is_open()) {
62 				cerr << "Cannot open \""<< _faidxFile << "\""
63 						<< strerror(errno)
64 						<< endl;
65 				exit(EXIT_FAILURE);
66 				}
67 		string line;
68 		while(getline(faidx,line,'\n')) {
69 				if(line.empty()) continue;
70 				string::size_type tab= line.find('\t');
71 				if(tab!=string::npos) {
72 						line.erase(tab);
73 						}
74 				if(all_chromosomes.find(line) != all_chromosomes.end()) {
75 						cerr  << "Chromosome \"" << line
76 									<<"\" defined twice in "
77 									<< _faidxFile
78 									<< endl;
79 						exit(EXIT_FAILURE);
80 						}
81 				_tid2chrom[_tid2chrom.size()]=line;
82 				all_chromosomes.insert(line);
83 				}
84 		faidx.close();
85 		/** end read FAIDX */
86 
87 		//check BED chromosomes
88 		for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
89 					if( all_chromosomes.find(m->first) ==  all_chromosomes.end()) {
90 						cerr  << "Chromosome \"" << m->first
91 									<<"\" undefined in "
92 									<< _faidxFile
93 									<< endl;
94 						exit(EXIT_FAILURE);
95 						}
96 				}
97 
98 
99 		//loop over each chromosome using the faidx index
100 		for(size_t tid=0; tid <_tid2chrom.size(); ++tid)
101 			{
102 			string chrom = _tid2chrom[tid];
103 			masterBedMapNoBin::iterator m = _bed->bedMapNoBin.find(chrom);
104 
105 			if( m == _bed->bedMapNoBin.end() ) continue; //this chromosome is not present in BED
106 
107 		    // bedList is already sorted by start position.
108 		    const vector<BED> &bedList = m->second;
109 
110 		    for (unsigned int i = 0; i < bedList.size(); ++i) {
111 		        _bed->reportBedNewLine(bedList[i]);
112 			  }
113 			}
114 
115 		}
116 
117 
SortBedBySizeAsc()118 void BedSort::SortBedBySizeAsc() {
119 
120     vector<BED> masterList;
121     masterList.reserve(1000000);
122 
123     // loop through each chromosome and merge their BED entries
124     for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
125 
126         // bedList is already sorted by start position.
127         const vector<BED> &bedList = m->second;
128 
129         // add the entries from this chromosome to the current list
130         for (unsigned int i = 0; i < bedList.size(); ++i) {
131             masterList.push_back(bedList[i]);
132         }
133     }
134 
135     // sort the master list by size (asc.)
136     sort(masterList.begin(), masterList.end(), sortBySizeAsc);
137 
138     // report the entries in ascending order
139     for (unsigned int i = 0; i < masterList.size(); ++i) {
140         _bed->reportBedNewLine(masterList[i]);
141     }
142 }
143 
144 
SortBedBySizeDesc()145 void BedSort::SortBedBySizeDesc() {
146 
147     vector<BED> masterList;
148     masterList.reserve(1000000);
149 
150     // loop through each chromosome and merge their BED entries
151     for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
152 
153         // bedList is already sorted by start position.
154         const vector<BED> &bedList = m->second;
155 
156         // add the entries from this chromosome to the current list
157         for (unsigned int i = 0; i < bedList.size(); ++i) {
158             masterList.push_back(bedList[i]);
159         }
160     }
161 
162     // sort the master list by size (asc.)
163     sort(masterList.begin(), masterList.end(), sortBySizeDesc);
164 
165     // report the entries in ascending order
166     for (unsigned int i = 0; i < masterList.size(); ++i) {
167         _bed->reportBedNewLine(masterList[i]);
168     }
169 }
170 
SortBedByChromThenSizeAsc()171 void BedSort::SortBedByChromThenSizeAsc() {
172 
173     // loop through each chromosome and merge their BED entries
174     for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
175 
176         // bedList is already sorted by start position.
177         vector<BED> &bedList = m->second;
178         sort(bedList.begin(), bedList.end(), sortBySizeAsc);
179 
180         for (unsigned int i = 0; i < bedList.size(); ++i) {
181             _bed->reportBedNewLine(bedList[i]);
182         }
183     }
184 }
185 
186 
SortBedByChromThenSizeDesc()187 void BedSort::SortBedByChromThenSizeDesc() {
188 
189     // loop through each chromosome and merge their BED entries
190     for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
191 
192         // bedList is already sorted by start position.
193         vector<BED> &bedList = m->second;
194 
195         sort(bedList.begin(), bedList.end(), sortBySizeDesc);
196 
197         for (unsigned int i = 0; i < bedList.size(); ++i) {
198             _bed->reportBedNewLine(bedList[i]);
199         }
200     }
201 }
202 
203 
SortBedByChromThenScoreAsc()204 void BedSort::SortBedByChromThenScoreAsc() {
205 
206     if (_bed->bedType >= 5) {
207         // loop through each chromosome and merge their BED entries
208         for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
209 
210             // bedList is already sorted by start position.
211             vector<BED> &bedList = m->second;
212             sort(bedList.begin(), bedList.end(), sortByScoreAsc);
213 
214             for (unsigned int i = 0; i < bedList.size(); ++i) {
215                 _bed->reportBedNewLine(bedList[i]);
216             }
217         }
218     }
219     else {
220         cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater.  Exiting." << endl;
221         exit(1);
222     }
223 }
224 
225 
SortBedByChromThenScoreDesc()226 void BedSort::SortBedByChromThenScoreDesc() {
227 
228     if (_bed->bedType >= 5) {
229         // loop through each chromosome and merge their BED entries
230         for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
231 
232             // bedList is already sorted by start position.
233             vector<BED> &bedList = m->second;
234             sort(bedList.begin(), bedList.end(), sortByScoreDesc);
235 
236             for (unsigned int i = 0; i < bedList.size(); ++i) {
237                 _bed->reportBedNewLine(bedList[i]);
238             }
239         }
240     }
241     else {
242         cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater.  Exiting." << endl;
243         exit(1);
244     }
245 }
246 
247