1 /*****************************************************************************
2 expand.cpp
3
4 (c) 2009, 2010, 2011 - Aaron Quinlan
5 Center for Public Health Genomics
6 University of Virginia
7 aaronquinlan@gmail.com
8
9 Licenced under the MIT license.
10 ******************************************************************************/
11 #include <vector>
12 #include <map>
13 #include <numeric>
14 #include <algorithm>
15 #include <iterator>
16 #include <iostream>
17 #include <iomanip>
18 #include <fstream>
19 #include <sstream>
20 #include <stdlib.h>
21 #include <math.h>
22 #include <limits.h>
23 #include <string.h>
24 #include <exception>
25 #include <stdexcept> // out_of_range exception
26
27 #include "version.h"
28 #include "lineFileUtilities.h"
29 #include "tabFile.h"
30 #include "VectorOps.h"
31 using namespace std;
32
33
// define our program name
#define PROGRAM_NAME "bedtools expand"

// define our parameter checking macro.
// Evaluates to true when the argument currently being examined, argv[i],
// is exactly `param`.  It relies on `argv` and `i` being in scope at the
// point of use.  `paramLen` must be strlen(param) and `actualLen` must be
// strlen(argv[i]); the two lengths have to be equal for a match.
// Every macro argument is parenthesized so that expressions can be passed
// safely.
#define PARAMETER_CHECK(param, paramLen, actualLen) \
    ((strncmp(argv[i], (param), std::min((actualLen), (paramLen))) == 0) && \
     ((actualLen) == (paramLen)))

// True when `str` is a non-empty C string whose first character is '-',
// i.e. it looks like another option rather than an option's value.
#define LOOKS_LIKE_A_PARAM(str) (strlen(str) > 0 && (str)[0] == '-')

// function declarations
void expand_help(void);
void Expand(const std::string &inFile,
            const std::vector<int> &expColumns);
45
expand_main(int argc,char * argv[])46 int expand_main(int argc, char* argv[]) {
47
48 // input files
49 string inFile = "stdin";
50 string groupColumnsString = "1,2,3";
51 string expColumnString;
52
53 // our configuration variables
54 bool showHelp = false;
55 bool haveExpColumns = false;
56
57 // check to see if we should print out some help
58 if(argc <= 1) showHelp = true;
59
60 for(int i = 1; i < argc; i++) {
61 int parameterLength = (int)strlen(argv[i]);
62
63 if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
64 (PARAMETER_CHECK("--help", 5, parameterLength))) {
65 showHelp = true;
66 }
67 }
68
69 if(showHelp) expand_help();
70
71 // do some parsing (all of these parameters require 2 strings)
72 for(int i = 1; i < argc; i++) {
73
74 int parameterLength = (int)strlen(argv[i]);
75
76 if(PARAMETER_CHECK("-i", 2, parameterLength)) {
77 if ((i+1) < argc) {
78 inFile = argv[i + 1];
79 i++;
80 }
81 }
82 else if(PARAMETER_CHECK("-c", 2, parameterLength)) {
83 if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
84 cerr << endl << "*****ERROR: -opCols parameter requires a value." << endl << endl;
85 expand_help();
86 break;
87 }
88 else {
89 haveExpColumns = true;
90 expColumnString = argv[i + 1];
91 i++;
92 }
93 }
94 else {
95 cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
96 showHelp = true;
97 }
98 }
99
100 if (!haveExpColumns) {
101 cerr << endl << "*****" << endl << "*****ERROR: Need -opCols." << endl << "*****" << endl;
102 showHelp = true;
103 }
104
105
106
107 if (!showHelp) {
108 vector<int> expColumns;
109 Tokenize(expColumnString, expColumns, ',');
110
111 // sanity check the exp columns
112 for(size_t i = 0; i < expColumns.size(); ++i) {
113 int expCol = expColumns[i];
114 if (expCol < 1) {
115 cerr << endl << "*****" << endl << "*****ERROR: expansion columns must be >=1. " << endl << "*****" << endl;
116 expand_help();
117 }
118 }
119 Expand(inFile, expColumns);
120 }
121 else {
122 expand_help();
123 }
124 return 0;
125 }
126
expand_help(void)127 void expand_help(void) {
128
129 cerr << "\nTool: bedtools expand " << endl;
130 cerr << "Version: " << VERSION << "\n";
131 cerr << "Summary: Replicate lines in a file based on columns of comma-separated values." << endl << endl;
132
133 cerr << "Usage:\t " << PROGRAM_NAME << " -c [COLS] " << endl;
134
135 cerr << "Options: " << endl;
136 cerr << "\t-i\t" << "Input file. Assumes \"stdin\" if omitted." << endl << endl;
137
138 cerr << "\t-c \t" << "Specify the column (1-based) that should be summarized." << endl;
139 cerr << "\t\t- Required." << endl;
140
141 cerr << "Examples: " << endl;
142 cerr << " $ cat test.txt" << endl;
143 cerr << " chr1 10 20 1,2,3 10,20,30" << endl;
144 cerr << " chr1 40 50 4,5,6 40,50,60" << endl << endl;
145
146 cerr << " $ bedtools expand test.txt -c 5" << endl;
147 cerr << " chr1 10 20 1,2,3 10" << endl;
148 cerr << " chr1 10 20 1,2,3 20" << endl;
149 cerr << " chr1 10 20 1,2,3 30" << endl;
150 cerr << " chr1 40 50 4,5,6 40" << endl;
151 cerr << " chr1 40 50 4,5,6 50" << endl;
152 cerr << " chr1 40 50 4,5,6 60" << endl << endl;
153
154 cerr << " $ bedtools expand test.txt -c 4,5" << endl;
155 cerr << " chr1 10 20 1 10" << endl;
156 cerr << " chr1 10 20 2 20" << endl;
157 cerr << " chr1 10 20 3 30" << endl;
158 cerr << " chr1 40 50 4 40" << endl;
159 cerr << " chr1 40 50 5 50" << endl;
160 cerr << " chr1 40 50 6 60" << endl;
161
162 // end the program here
163 exit(1);
164
165 }
166
167
Expand(const string & inFile,const vector<int> & expColumns)168 void Expand (const string &inFile,
169 const vector<int> &expColumns)
170 {
171
172 // current line number
173 int lineNum = 0;
174 // string representing current line
175 string inLine;
176
177 // vector of strings holding the tokenized current line
178 vector<string> inFields;
179 inFields.reserve(20);
180
181 // build a map of the columns to be expanded
182 // to allow quic lookups to test if a column is
183 // "normal" or whether it is one of the columns
184 // that is being expaded
185 map<int, bool> expColMap;
186 for (size_t c = 0; c < expColumns.size(); c++)
187 expColMap[expColumns[c]] = true;
188
189 // open a new tab file, loop through it line by line
190 // and expand each line into multiple lines according to the
191 // columns the user has requested.
192 //
193 TabLineStatus tabLineStatus;
194 TabFile *_tab = new TabFile(inFile);
195 _tab->Open();
196 while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) {
197 lineNum++;
198 if (tabLineStatus == TAB_VALID) {
199
200 // a list containing the expanded values (inner) for each column (outer)
201 vector< vector<string> > expandedCols;
202
203 // expand each requested column into a vector
204 int prev_size = -1;
205 for (size_t c = 0; c < expColumns.size(); c++)
206 {
207 vector<string> expansion;
208 if ((expColumns[c]-1) >= (int) inFields.size()) {
209 cerr << endl
210 << "*****" << endl
211 << "*****ERROR: Requested column number exceeds number of columns." << endl
212 << "***** This was violated at line: " << lineNum << endl
213 << "*****" << endl;
214 exit(1);
215 }
216
217 // expand the requested column into a vector
218 Tokenize(inFields[expColumns[c]-1], expansion, ',');
219
220 if ((int) expansion.size() != prev_size && prev_size >= 0) {
221 cerr << endl
222 << "*****" << endl
223 << "*****ERROR: Each expanded column must have the same number of elements." << endl
224 << "***** This was violated at line: " << lineNum << endl
225 << "*****" << endl;
226 exit(1);
227 }
228 else {
229 expandedCols.push_back(expansion);
230 }
231 prev_size = expansion.size();
232 }
233
234 // now replicate/expand the original line based on the
235 // values in the requested columns
236 size_t totalCols = inFields.size();
237 for (size_t n = 0; n < expandedCols[0].size(); n++)
238 {
239 int numExpColsSeen = 0;
240 for (size_t c = 0; c < totalCols; c++)
241 {
242 // normal column, print as-is
243 if (!expColMap[c+1]) {
244 printf("%s", inFields[c].c_str());
245 }
246 // expanded column, grab relevant value from expanded vector.
247 else {
248 // expandedCols[i][j]
249 // i == column, j = row
250 printf("%s", expandedCols[numExpColsSeen][n].c_str());
251 numExpColsSeen++;
252 }
253 // add a tab if not the very last value
254 if (c < totalCols - 1)
255 printf("\t");
256 }
257 printf("\n");
258 }
259 }
260 inFields.clear();
261 }
262 _tab->Close();
263 }
264
265