1
2 /******************************************************************************
3 *
 *  This file is part of meryl, a genomic k-mer counter with nice features.
5 *
6 * This software is based on:
7 * 'Canu' v2.0 (https://github.com/marbl/canu)
8 * which is based on:
9 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
10 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
11 *
12 * Except as indicated otherwise, this is a 'United States Government Work',
13 * and is released in the public domain.
14 *
15 * File 'README.licenses' in the root directory of this distribution
16 * contains full conditions and disclaimers.
17 */
18
19 #include "meryl-lookup.H"
20
21
//  Print help for the -bed and -bed-runs report types to stderr.
//
//  If progname is non-null, a usage synopsis built from it is printed
//  first, then the short description of both options, then the detailed
//  notes on multiple databases and -labels with example output.
//
//  If progname is null, only the short description of the two options is
//  printed.  help() calls this function with no argument; presumably the
//  declaration in meryl-lookup.H defaults progname to null -- confirm.
void
helpBED(char const *progname) {

  if (progname) {
    fprintf(stderr, "usage: %s [-bed | -bed-runs] \\\n", progname);
    fprintf(stderr, " -sequence input.fasta \\\n");
    fprintf(stderr, " -output output.bed \\\n");
    fprintf(stderr, " -mers <input1.meryl> [<input2.meryl>] [...] [-estimate] \\\n");
    fprintf(stderr, " -labels <input1name> [<input2name>] [...]\n");
    fprintf(stderr, "\n");
  }

  fprintf(stderr, " -bed:\n");
  fprintf(stderr, " Generate a BED format file showing the location of kmers in\n");
  fprintf(stderr, " any input database on each sequence in 'input.fasta'.\n");
  fprintf(stderr, " Each kmer is reported in a separate bed record.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " -bed-runs:\n");
  fprintf(stderr, " Generate a BED format file showing the location of kmers in\n");
  fprintf(stderr, " any input database on each sequence in 'input.fasta'.\n");
  fprintf(stderr, " Overlapping kmers are combined into a single bed record.\n");
  fprintf(stderr, "\n");

  //  Without a program name only the brief summary above is wanted.
  if (!progname)
    return;

  fprintf(stderr, " If multiple databases are supplied, the output file reports\n");
  fprintf(stderr, " the location of kmers in any database. If the -labels option\n");
  fprintf(stderr, " is supplied, each line will be annotated with the label of the\n");
  fprintf(stderr, " database the kmer is found in. If no -labels are supplied, a\n");
  fprintf(stderr, " single line will be emitted regardless of how many databases the\n");
  fprintf(stderr, " kmer is found in.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " For example, with two input databases (-mers A.meryl B.meryl) and\n");
  fprintf(stderr, " labels supplied (-labels A B), with the first two kmers in the input\n");
  fprintf(stderr, " sequence found in both databases, the output will be:\n");
  fprintf(stderr, " sequence1 <tab> 0 <tab> 21 <tab> A\n");
  fprintf(stderr, " sequence1 <tab> 0 <tab> 21 <tab> B\n");
  fprintf(stderr, " sequence1 <tab> 1 <tab> 22 <tab> A\n");
  fprintf(stderr, " sequence1 <tab> 1 <tab> 22 <tab> B\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " Without the -labels option, the output will be:\n");
  fprintf(stderr, " sequence1 <tab> 0 <tab> 21\n");
  fprintf(stderr, " sequence1 <tab> 1 <tab> 22\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " If -bed-runs is used, the output will be:\n");
  fprintf(stderr, " sequence1 <tab> 0 <tab> 22 <tab> A\n");
  fprintf(stderr, " sequence1 <tab> 0 <tab> 22 <tab> B\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " Output lines are written in the order sequences appear in the input\n");
  fprintf(stderr, " file, and are in increasing position within the sequence itself.\n");
  fprintf(stderr, "\n");
}
75
//  Print help for the -wig-count report type to stderr.
//
//  If progname is non-null, a usage synopsis built from it is printed
//  first, then the short description of the option, then the detailed
//  notes on multiple databases, inputs, and the output format.
//
//  If progname is null, only the short description is printed.  help()
//  calls this function with no argument; presumably the declaration in
//  meryl-lookup.H defaults progname to null -- confirm.
void
helpWIGcount(char const *progname) {

  if (progname) {
    fprintf(stderr, "usage: %s -wig-count \\\n", progname);
    fprintf(stderr, " -sequence input.fasta \\\n");
    fprintf(stderr, " -output output.wig \\\n");
    //  Last line of the synopsis: no continuation backslash, and a blank
    //  line separates it from the description, as in helpBED().
    fprintf(stderr, " -mers <input1.meryl> [<input2.meryl>] [...] [-estimate]\n");
    fprintf(stderr, "\n");
  }

  fprintf(stderr, " -wig-count:\n");
  fprintf(stderr, " Generate a WIGGLE format file showing the multiplicity of the\n");
  fprintf(stderr, " kmer starting at each position in the sequence, if it exists in\n");
  fprintf(stderr, " an input kmer database.\n");
  fprintf(stderr, "\n");

  //  Without a program name only the brief summary above is wanted.
  if (!progname)
    return;

  fprintf(stderr, " If multiple databases are supplied, the reported multiplicity\n");
  fprintf(stderr, " is the sum of multiplicities in all databases. The -labels\n");
  fprintf(stderr, " option is not used.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " Exactly one input -sequence must be supplied.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " If no -output path is supplied, output is written to stdout.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " The output file has format:\n");
  fprintf(stderr, " variableStep chrom=<sequence_name>\n");
  fprintf(stderr, " <position> <tab> <sum_of_multiplicities>\n");
  fprintf(stderr, " <position> <tab> <sum_of_multiplicities>\n");
  fprintf(stderr, "\n");
}
110
//  Print help for the -wig-depth report type to stderr.
//
//  If progname is non-null, a usage synopsis built from it is printed
//  first, then the short description of the option, then the detailed
//  notes on multiple databases, inputs, and the output format.
//
//  If progname is null, only the short description is printed.  help()
//  calls this function with no argument; presumably the declaration in
//  meryl-lookup.H defaults progname to null -- confirm.
void
helpWIGdepth(char const *progname) {

  if (progname) {
    fprintf(stderr, "usage: %s -wig-depth \\\n", progname);
    fprintf(stderr, " -sequence input.fasta \\\n");
    fprintf(stderr, " -output output.wig \\\n");
    //  Last line of the synopsis: no continuation backslash, and a blank
    //  line separates it from the description, as in helpBED().
    fprintf(stderr, " -mers <input1.meryl> [<input2.meryl>] [...] [-estimate]\n");
    fprintf(stderr, "\n");
  }

  fprintf(stderr, " -wig-depth:\n");
  fprintf(stderr, " Generate a WIGGLE format file showing the number of kmers in\n");
  fprintf(stderr, " any input database that cover each position in the sequence.\n");
  fprintf(stderr, "\n");

  //  Without a program name only the brief summary above is wanted.
  if (!progname)
    return;

  fprintf(stderr, " If multiple databases are supplied, the depth does not change\n");
  fprintf(stderr, " when the kmer is present in more than one database.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " Exactly one input -sequence must be supplied.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " If no -output path is supplied, output is written to stdout.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " The output file has format:\n");
  fprintf(stderr, " variableStep chrom=<sequence_name>\n");
  fprintf(stderr, " <position> <tab> <kmer_depth>\n");
  fprintf(stderr, " <position> <tab> <kmer_depth>\n");
  fprintf(stderr, "\n");
}
142
//  Print help for the -existence report type to stderr.
//
//  If progname is non-null, a usage synopsis built from it is printed
//  first, then the short description of the option, then the detailed
//  notes on per-database counting, inputs, and the output format with
//  examples.
//
//  If progname is null, only the short description is printed.  help()
//  calls this function with no argument; presumably the declaration in
//  meryl-lookup.H defaults progname to null -- confirm.
void
helpExistence(char const *progname) {

  if (progname) {
    //  The synopsis must name this report's own option; the report writes
    //  tab-delimited text, so the example output name is not a .wig file.
    fprintf(stderr, "usage: %s -existence \\\n", progname);
    fprintf(stderr, " -sequence input.fasta \\\n");
    fprintf(stderr, " -output output.tsv \\\n");
    fprintf(stderr, " -mers <input1.meryl> [<input2.meryl>] [...] [-estimate]\n");
    fprintf(stderr, "\n");
  }

  fprintf(stderr, " -existence:\n");
  fprintf(stderr, " Generate a tab-delimited line for each input sequence with the\n");
  fprintf(stderr, " number of kmers in the sequence, in the database and common to both.\n");
  fprintf(stderr, "\n");

  //  Without a program name only the brief summary above is wanted.
  if (!progname)
    return;

  fprintf(stderr, " The number of kmers in common is counted individually for each\n");
  fprintf(stderr, " database, but still reported on a single output line.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " Exactly one input -sequence must be supplied.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " The -labels option is not used.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " The output file has format:\n");
  fprintf(stderr, " <sequence_name> <tab> <kmers_in_sequence> <tab> <kmers_in_db> <tab> <kmers_shared> [...]\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " With one input database:\n");
  fprintf(stderr, " sequence1 <tab> 8415 <tab> 12856825 <tab> 8145\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " With two input databases:\n");
  fprintf(stderr, " sequence1 <tab> 8415 <tab> 12856825 <tab> 8145 <tab> 575757256 <tab> 8354\n");
  fprintf(stderr, "\n");
}
178
//  Print help for the -include and -exclude report types to stderr.
//
//  If progname is non-null, a usage synopsis built from it is printed
//  first, then the short description of both options, then the detailed
//  notes on -10x and on single versus paired input sequences.
//
//  If progname is null, only the short description is printed.  help()
//  calls this function with no argument; presumably the declaration in
//  meryl-lookup.H defaults progname to null -- confirm.
void
helpIncludeExclude(char const *progname) {

  if (progname) {
    fprintf(stderr, "usage: %s [-include | -exclude] \\\n", progname);
    fprintf(stderr, " -sequence <input1.fasta> [<input2.fasta>] \\\n");
    fprintf(stderr, " -output <output1> [<output2>] \\\n");
    fprintf(stderr, " -mers <input1.meryl> [-estimate] \\\n");
    fprintf(stderr, " -10x\n");
    fprintf(stderr, "\n");
  }

  fprintf(stderr, " -include:\n");
  fprintf(stderr, " -exclude:\n");
  fprintf(stderr, " Copy sequences from 'input1.fasta' (and 'input2.fasta') to the\n");
  fprintf(stderr, " corresponding output file if the sequence has at least one kmer\n");
  fprintf(stderr, " present (include) or no kmers present (exclude) in 'input1.meryl'.\n");
  fprintf(stderr, "\n");

  //  Without a program name only the brief summary above is wanted.
  if (!progname)
    return;

  fprintf(stderr, " -10x:\n");
  fprintf(stderr, " When -10x is supplied, the first 23 bp of every sequence in input1.fasta\n");
  fprintf(stderr, " will be ignored when looking up kmer existence.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " Exactly one input database must be supplied. The -labels option is\n");
  fprintf(stderr, " not used.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " When one input sequence is supplied, each sequence is copied to the\n");
  fprintf(stderr, " output file if kmers exist (-include) or do not exist (-exclude) in\n");
  fprintf(stderr, " the input database.\n");
  fprintf(stderr, "\n");
  fprintf(stderr, " When two input sequences are supplied, the pair of sequences is copied\n");
  fprintf(stderr, " to the output files if kmers from either sequence exist (-include) or\n");
  fprintf(stderr, " do not exist (-exclude) in the input database.\n");
  fprintf(stderr, "\n");
}
217
218
219
220 void
help(char const * progname)221 help(char const *progname) {
222 fprintf(stderr, "usage: %s <report-type> \\\n", progname);
223 fprintf(stderr, " -sequence <input1.fasta> [<input2.fasta>] \\\n");
224 fprintf(stderr, " -output <output1> [<output2>] \\\n");
225 fprintf(stderr, " -mers <input1.meryl> [<input2.meryl>] [...] [-estimate] \\\n");
226 fprintf(stderr, " -labels <input1name> [<input2name>] [...]\n");
227 fprintf(stderr, "\n");
228 fprintf(stderr, " Compare kmers in input sequences against kmers in input meryl databases.\n");
229 fprintf(stderr, "\n");
230 fprintf(stderr, " Input sequences (-sequence) can be FASTA or FASTQ, uncompressed, or\n");
231 fprintf(stderr, " compressed with gzip, xz, or bzip2.\n");
232 fprintf(stderr, "\n");
233 fprintf(stderr, " To compute and report only estimated memory usage, add option '-estimate'.\n");
234 fprintf(stderr, "\n");
235 fprintf(stderr, " Report types:\n");
236 fprintf(stderr, " Run `%s <report-type> -help` for details on each method.\n", progname);
237 fprintf(stderr, "\n");
238 fprintf(stderr, "\n");
239 helpBED();
240 helpWIGcount();
241 helpWIGdepth();
242 helpExistence();
243 helpIncludeExclude();
244 }
245