1 #include "CommonHelp.h"
2
groupby_help(void)3 void groupby_help(void) {
4
5 cerr << "\nTool: bedtools groupby " << endl;
6 cerr << "Version: " << VERSION << "\n";
7 cerr << "Summary: Summarizes a dataset column based upon" << endl;
8 cerr << "\t common column groupings. Akin to the SQL \"group by\" command." << endl << endl;
9
10 cerr << "Usage:\t " << "bedtools groupby" << " -g [group_column(s)] -c [op_column(s)] -o [ops] " << endl;
11 cerr << "\t " << "cat [FILE] | " << "bedtools groupby" << " -g [group_column(s)] -c [op_column(s)] -o [ops] " << endl << endl;
12
13 cerr << "Options: " << endl;
14 cerr << "\t-i\t\t" << "Input file. Assumes \"stdin\" if omitted." << endl << endl;
15
16 cerr << "\t-g -grp\t\t" << "Specify the columns (1-based) for the grouping." << endl;
17 cerr << "\t\t\tThe columns must be comma separated." << endl;
18 cerr << "\t\t\t- Default: 1,2,3" << endl << endl;
19
20 cerr << "\t-c -opCols\t" << "Specify the column (1-based) that should be summarized." << endl;
21 cerr << "\t\t\t- Required." << endl << endl;
22
23 cerr << "\t-o -ops\t\t" << "Specify the operation that should be applied to opCol." << endl;
24 cerr << "\t\t\tValid operations:" << endl;
25 cerr << "\t\t\t sum, count, count_distinct, min, max," << endl;
26 cerr << "\t\t\t mean, median, mode, antimode," << endl;
27 cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl;
28 cerr << "\t\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl;
29 cerr << "\t\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl;
30 cerr << "\t\t\t distinct_sort_num (as distinct, but sorted numerically, ascending), " << endl;
31 cerr << "\t\t\t distinct_sort_num_desc (as distinct, but sorted numerically, descending), " << endl;
32 cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl;
33 cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl;
34 cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl;
35 cerr << "\t\t\t first (i.e., print first value)" << endl;
36 cerr << "\t\t\t last (i.e., print last value)" << endl;
37
38 cerr << "\t\t\t- Default: sum" << endl << endl;
39
40 cerr << "\t\tIf there is only column, but multiple operations, all operations will be" << endl;
41 cerr << "\t\tapplied on that column. Likewise, if there is only one operation, but" << endl;
42 cerr << "\t\tmultiple columns, that operation will be applied to all columns." << endl;
43 cerr << "\t\tOtherwise, the number of columns must match the the number of operations," << endl;
44 cerr << "\t\tand will be applied in respective order." << endl;
45 cerr << "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5," << endl;
46 cerr << "\t\tthe mean of column 4, and the count of column 6." << endl;
47 cerr << "\t\tThe order of output columns will match the ordering given in the command." << endl << endl<<endl;
48
49 cerr << "\t-full\t\t" << "Print all columns from input file. The first line in the group is used." << endl;
50 cerr << "\t\t\tDefault: print only grouped columns." << endl << endl;
51
52 cerr << "\t-inheader\t" << "Input file has a header line - the first line will be ignored." << endl << endl ;
53
54 cerr << "\t-outheader\t" << "Print header line in the output, detailing the column names. " << endl;
55 cerr << "\t\t\tIf the input file has headers (-inheader), the output file" << endl;
56 cerr << "\t\t\twill use the input's column names." << endl;
57 cerr << "\t\t\tIf the input file has no headers, the output file" << endl;
58 cerr << "\t\t\twill use \"col_1\", \"col_2\", etc. as the column names." << endl << endl;
59
60 cerr << "\t-header\t\t" << "same as '-inheader -outheader'" << endl << endl;
61
62 cerr << "\t-ignorecase\t" << "Group values regardless of upper/lower case." << endl << endl;
63
64 cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
65 cerr << "\t-delim\t" << "Specify a custom delimiter for the collapse operations." << endl;
66 cerr << "\t\t- Example: -delim \"|\"" << endl;
67 cerr << "\t\t- Default: \",\"." << endl << endl;
68
69 cerr << "Examples: " << endl;
70 cerr << "\t$ cat ex1.out" << endl;
71 cerr << "\tchr1 10 20 A chr1 15 25 B.1 1000 ATAT" << endl;
72 cerr << "\tchr1 10 20 A chr1 25 35 B.2 10000 CGCG" << endl << endl;
73 cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum" << endl;
74 cerr << "\tchr1 10 20 A 11000" << endl << endl;
75 cerr << "\t$ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max" << endl;
76 cerr << "\tchr1 10 20 A 11000 10000" << endl << endl;
77 cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl;
78 cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl;
79 cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl;
80 cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl;
81 cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat" << endl;
82 cerr << "\tchr1 10 20 A ATATCGCG" << endl << endl;
83
84 cerr << "Notes: " << endl;
85 cerr << "\t(1) The input file/stream should be sorted/grouped by the -grp. columns" << endl;
86 cerr << "\t(2) If -i is unspecified, input is assumed to come from stdin." << endl << endl;
87
88
89 // end the program here
90 exit(1);
91
92 }
93