1 /*****************************************************************************
2 expand.cpp
3 
4 (c) 2009, 2010, 2011 - Aaron Quinlan
5 Center for Public Health Genomics
6 University of Virginia
7 aaronquinlan@gmail.com
8 
9 Licenced under the MIT license.
10 ******************************************************************************/
11 #include <vector>
12 #include <map>
13 #include <numeric>
14 #include <algorithm>
15 #include <iterator>
16 #include <iostream>
17 #include <iomanip>
18 #include <fstream>
19 #include <sstream>
20 #include <stdlib.h>
21 #include <math.h>
22 #include <limits.h>
23 #include <string.h>
24 #include <exception>
25 #include <stdexcept> // out_of_range exception
26 
27 #include "version.h"
28 #include "lineFileUtilities.h"
29 #include "tabFile.h"
30 #include "VectorOps.h"
31 using namespace std;
32 
33 
// Program name as printed in the usage text.
#define PROGRAM_NAME "bedtools expand"

// PARAMETER_CHECK(param, paramLen, actualLen)
//   True when the current argument equals the option string `param`.
//   NOTE: the macro reads `argv` and `i` from the scope in which it is
//   expanded; it can only be used inside an argument-parsing loop that
//   defines both.
//     param     - option literal to compare against, e.g. "-c"
//     paramLen  - length of `param` (must be the exact strlen of the literal)
//     actualLen - strlen(argv[i])
//   Every macro argument is parenthesized so that arbitrary expressions can
//   be passed safely.
#define PARAMETER_CHECK(param, paramLen, actualLen) \
    ((strncmp(argv[i], (param), min((actualLen), (paramLen))) == 0) && \
     ((actualLen) == (paramLen)))

// LOOKS_LIKE_A_PARAM(str)
//   True when `str` is a non-empty C string whose first character is '-'.
//   Used to detect that an option's value is missing and the next option
//   follows directly. The parameter is not called `string`, so it cannot be
//   confused with std::string, and it is parenthesized before indexing.
#define LOOKS_LIKE_A_PARAM(str) (strlen(str) > 0 && (str)[0] == '-')

// function declarations

// Prints the usage text to stderr and terminates the process.
void expand_help(void);

// Expands the comma-separated lists found in the 1-based columns
// `expColumns` of the tab-delimited file `inFile`.
void Expand(const string &inFile,
            const vector<int> &expColumns);
45 
expand_main(int argc,char * argv[])46 int expand_main(int argc, char* argv[]) {
47 
48     // input files
49     string inFile             = "stdin";
50     string groupColumnsString = "1,2,3";
51     string expColumnString;
52 
53     // our configuration variables
54     bool showHelp          = false;
55     bool haveExpColumns     = false;
56 
57     // check to see if we should print out some help
58     if(argc <= 1) showHelp = true;
59 
60     for(int i = 1; i < argc; i++) {
61         int parameterLength = (int)strlen(argv[i]);
62 
63         if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
64         (PARAMETER_CHECK("--help", 5, parameterLength))) {
65             showHelp = true;
66         }
67     }
68 
69     if(showHelp) expand_help();
70 
71     // do some parsing (all of these parameters require 2 strings)
72     for(int i = 1; i < argc; i++) {
73 
74         int parameterLength = (int)strlen(argv[i]);
75 
76         if(PARAMETER_CHECK("-i", 2, parameterLength)) {
77             if ((i+1) < argc) {
78                 inFile     = argv[i + 1];
79                 i++;
80             }
81         }
82         else if(PARAMETER_CHECK("-c", 2, parameterLength)) {
83             if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
84                 cerr << endl << "*****ERROR: -opCols parameter requires a value." << endl << endl;
85                 expand_help();
86                 break;
87             }
88             else {
89                 haveExpColumns       = true;
90                 expColumnString     = argv[i + 1];
91                 i++;
92             }
93         }
94         else {
95             cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
96             showHelp = true;
97         }
98     }
99 
100     if (!haveExpColumns) {
101         cerr << endl << "*****" << endl << "*****ERROR: Need -opCols." << endl << "*****" << endl;
102         showHelp = true;
103     }
104 
105 
106 
107     if (!showHelp) {
108         vector<int> expColumns;
109         Tokenize(expColumnString, expColumns, ',');
110 
111         // sanity check the exp columns
112         for(size_t i = 0; i < expColumns.size(); ++i) {
113             int expCol = expColumns[i];
114             if (expCol < 1) {
115                 cerr << endl << "*****" << endl << "*****ERROR: expansion columns must be >=1. " << endl << "*****" << endl;
116                 expand_help();
117             }
118         }
119         Expand(inFile, expColumns);
120     }
121     else {
122         expand_help();
123     }
124     return 0;
125 }
126 
expand_help(void)127 void expand_help(void) {
128 
129     cerr << "\nTool:    bedtools expand " << endl;
130     cerr << "Version: " << VERSION << "\n";
131     cerr << "Summary: Replicate lines in a file based on columns of comma-separated values." << endl << endl;
132 
133     cerr << "Usage:\t " << PROGRAM_NAME << " -c [COLS] " << endl;
134 
135     cerr << "Options: " << endl;
136     cerr << "\t-i\t"        << "Input file. Assumes \"stdin\" if omitted." << endl << endl;
137 
138     cerr << "\t-c \t"    << "Specify the column (1-based) that should be summarized." << endl;
139     cerr                 << "\t\t- Required." << endl;
140 
141     cerr << "Examples: " << endl;
142     cerr << "  $ cat test.txt" << endl;
143     cerr << "  chr1	10	20	1,2,3	10,20,30" << endl;
144     cerr << "  chr1	40	50	4,5,6	40,50,60" << endl << endl;
145 
146     cerr << "  $ bedtools expand test.txt -c 5" << endl;
147     cerr << "  chr1	10	20	1,2,3	10" << endl;
148     cerr << "  chr1	10	20	1,2,3	20" << endl;
149     cerr << "  chr1	10	20	1,2,3	30" << endl;
150     cerr << "  chr1	40	50	4,5,6	40" << endl;
151     cerr << "  chr1	40	50	4,5,6	50" << endl;
152     cerr << "  chr1	40	50	4,5,6	60" << endl << endl;
153 
154     cerr << "  $ bedtools expand test.txt -c 4,5" << endl;
155     cerr << "  chr1	10	20	1	10" << endl;
156     cerr << "  chr1	10	20	2	20" << endl;
157     cerr << "  chr1	10	20	3	30" << endl;
158     cerr << "  chr1	40	50	4	40" << endl;
159     cerr << "  chr1	40	50	5	50" << endl;
160     cerr << "  chr1	40	50	6	60" << endl;
161 
162     // end the program here
163     exit(1);
164 
165 }
166 
167 
Expand(const string & inFile,const vector<int> & expColumns)168 void Expand (const string &inFile,
169     const vector<int> &expColumns)
170 {
171 
172     // current line number
173     int lineNum = 0;
174     // string representing current line
175     string inLine;
176 
177     // vector of strings holding the tokenized current line
178     vector<string>  inFields;
179     inFields.reserve(20);
180 
181     // build a map of the columns to be expanded
182     // to allow quic lookups to test if a column is
183     // "normal" or whether it is one of the columns
184     // that is being expaded
185     map<int, bool> expColMap;
186     for (size_t c = 0; c < expColumns.size(); c++)
187         expColMap[expColumns[c]] = true;
188 
189     // open a new tab file, loop through it line by line
190     // and expand each line into multiple lines according to the
191     // columns the user has requested.
192     //
193     TabLineStatus tabLineStatus;
194     TabFile *_tab = new TabFile(inFile);
195     _tab->Open();
196     while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) {
197         lineNum++;
198         if (tabLineStatus == TAB_VALID) {
199 
200             // a list containing the expanded values (inner) for each column (outer)
201             vector< vector<string> >  expandedCols;
202 
203             // expand each requested column into a vector
204             int prev_size = -1;
205             for (size_t c = 0; c < expColumns.size(); c++)
206             {
207                 vector<string> expansion;
208                 if ((expColumns[c]-1) >= (int) inFields.size()) {
209                     cerr << endl
210                     << "*****" << endl
211                     << "*****ERROR: Requested column number exceeds number of columns." << endl
212                     << "*****       This was violated at line: " << lineNum << endl
213                     << "*****" << endl;
214                     exit(1);
215                 }
216 
217                 // expand the requested column into a vector
218                 Tokenize(inFields[expColumns[c]-1], expansion, ',');
219 
220                 if ((int) expansion.size() != prev_size && prev_size >= 0) {
221                     cerr << endl
222                     << "*****" << endl
223                     << "*****ERROR: Each expanded column must have the same number of elements." << endl
224                     << "*****       This was violated at line: " << lineNum << endl
225                     << "*****" << endl;
226                     exit(1);
227                 }
228                 else {
229                     expandedCols.push_back(expansion);
230                 }
231                 prev_size = expansion.size();
232             }
233 
234             // now replicate/expand the original line based on the
235             // values in the requested columns
236             size_t totalCols  = inFields.size();
237             for (size_t n = 0; n < expandedCols[0].size(); n++)
238             {
239                 int numExpColsSeen = 0;
240                 for (size_t c = 0; c < totalCols; c++)
241                 {
242                     // normal column, print as-is
243                     if (!expColMap[c+1]) {
244                         printf("%s", inFields[c].c_str());
245                     }
246                     // expanded column, grab relevant value from expanded vector.
247                     else {
248                         // expandedCols[i][j]
249                         // i == column, j = row
250                         printf("%s", expandedCols[numExpColsSeen][n].c_str());
251                         numExpColsSeen++;
252                     }
253                     // add a tab if not the very last value
254                     if (c < totalCols - 1)
255                         printf("\t");
256                 }
257                 printf("\n");
258             }
259         }
260         inFields.clear();
261     }
262     _tab->Close();
263 }
264 
265