1 
2 /******************************************************************************
3  *
 *  This file is part of meryl, a genomic k-mer counter with nice features.
5  *
6  *  This software is based on:
7  *    'Canu' v2.0              (https://github.com/marbl/canu)
8  *  which is based on:
9  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
10  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
11  *
12  *  Except as indicated otherwise, this is a 'United States Government Work',
13  *  and is released in the public domain.
14  *
15  *  File 'README.licenses' in the root directory of this distribution
16  *  contains full conditions and disclaimers.
17  */
18 
19 #include "meryl-lookup.H"
20 
21 
22 void
helpBED(char const * progname)23 helpBED(char const *progname) {
24 
25   if (progname) {
26     fprintf(stderr, "usage: %s [-bed | -bed-runs] \\\n", progname);
27     fprintf(stderr, "         -sequence input.fasta \\\n");
28     fprintf(stderr, "         -output   output.bed\\\n");
29     fprintf(stderr, "         -mers     <input1.meryl> [<input2.meryl>] [...] [-estimate] \\\n");
30     fprintf(stderr, "         -labels   <input1name>   [<input2name>]   [...]\n");
31     fprintf(stderr, "\n");
32   }
33 
34   fprintf(stderr, "  -bed:\n");
35   fprintf(stderr, "     Generate a BED format file showing the location of kmers in\n");
36   fprintf(stderr, "     any input database on each sequence in 'input1.fasta'.\n");
37   fprintf(stderr, "     Each kmer is reported in a separate bed record.\n");
38   fprintf(stderr, "\n");
39   fprintf(stderr, "  -bed-runs:\n");
40   fprintf(stderr, "     Generate a BED format file showing the location of kmers in\n");
41   fprintf(stderr, "     any input database on each sequence in 'input1.fasta'.\n");
42   fprintf(stderr, "     Overlapping kmers are combined into a single bed record.\n");
43   fprintf(stderr, "\n");
44 
45   if (progname == nullptr)
46     return;
47 
48   fprintf(stderr, "     If multiple databases are supplied, the output file reports\n");
49   fprintf(stderr, "     the location of kmers in any database.  If the -labels option\n");
50   fprintf(stderr, "     is supplied, each line will be annotated with the label of the\n");
51   fprintf(stderr, "     databse the kmer is found in.  If no -labels are supplied, a\n");
52   fprintf(stderr, "     single line will be emitted regardless of how many databases the\n");
53   fprintf(stderr, "     kmer is found in.\n");
54   fprintf(stderr, "\n");
55   fprintf(stderr, "     For example, with two input databases (-mers A.meryl B.meryl) and\n");
56   fprintf(stderr, "     labels supplied (-labels A B), with the first two kmers in the input\n");
57   fprintf(stderr, "     sequence found in both databases, the output will be:\n");
58   fprintf(stderr, "       sequence1 <tab> 0 <tab> 21 <tab> A\n");
59   fprintf(stderr, "       sequence1 <tab> 0 <tab> 21 <tab> B\n");
60   fprintf(stderr, "       sequence1 <tab> 1 <tab> 22 <tab> A\n");
61   fprintf(stderr, "       sequence1 <tab> 1 <tab> 22 <tab> B\n");
62   fprintf(stderr, "\n");
63   fprintf(stderr, "     Without the -labels option, the output will be:\n");
64   fprintf(stderr, "       sequence1 <tab> 0 <tab> 21\n");
65   fprintf(stderr, "       sequence1 <tab> 1 <tab> 22\n");
66   fprintf(stderr, "\n");
67   fprintf(stderr, "     If -bed-runs is used, the output will bed:\n");
68   fprintf(stderr, "       sequence1 <tab> 0 <tab> 22 <tab> A\n");
69   fprintf(stderr, "       sequence1 <tab> 0 <tab> 22 <tab> B\n");
70   fprintf(stderr, "\n");
71   fprintf(stderr, "     Output lines are written in the order sequences appear in the input\n");
72   fprintf(stderr, "     file, and are in increasing position within the sequence itself.\n");
73   fprintf(stderr, "\n");
74 }
75 
76 void
helpWIGcount(char const * progname)77 helpWIGcount(char const *progname) {
78 
79   if (progname) {
80     fprintf(stderr, "usage: %s -wig-count \\\n", progname);
81     fprintf(stderr, "         -sequence input.fasta \\\n");
82     fprintf(stderr, "         -output   output.wig \\\n");
83     fprintf(stderr, "         -mers     <input1.meryl> [<input2.meryl>] [...] [-estimate] \\\n");
84   }
85 
86   fprintf(stderr, "  -wig-count:\n");
87   fprintf(stderr, "     Generate a WIGGLE format file showing the multiplicity of the\n");
88   fprintf(stderr, "     kmer starting at each position in the sequence, if it exists in\n");
89   fprintf(stderr, "     an input kmer database.\n");
90   fprintf(stderr, "\n");
91 
92   if (progname == nullptr)
93     return;
94 
95   fprintf(stderr, "     If multiple databases are supplied, the reported multiplicity\n");
96   fprintf(stderr, "     is the sum of multiplicities in all databases.  The -labels\n");
97   fprintf(stderr, "     option is not used.\n");
98   fprintf(stderr, "\n");
99   fprintf(stderr, "     Exactly one input -sequence must be supplied.\n");
100   fprintf(stderr, "\n");
101   fprintf(stderr, "     If no -output path is supplied, output is written to stdout.\n");
102   fprintf(stderr, "\n");
103   fprintf(stderr, "     The output file has format:\n");
104   fprintf(stderr, "         variableStep chrom=<sequence_name>\n");
105   fprintf(stderr, "         <position> <tab> <sum_of_multiplicities>\n");
106   fprintf(stderr, "         <position> <tab> <sum_of_multiplicities>\n");
107   fprintf(stderr, "\n");
108 
109 }
110 
111 void
helpWIGdepth(char const * progname)112 helpWIGdepth(char const *progname) {
113 
114   if (progname) {
115     fprintf(stderr, "usage: %s -wig-depth \\\n", progname);
116     fprintf(stderr, "         -sequence input.fasta \\\n");
117     fprintf(stderr, "         -output   output.wig \\\n");
118     fprintf(stderr, "         -mers     <input1.meryl> [<input2.meryl>] [...] [-estimate] \\\n");
119   }
120 
121   fprintf(stderr, "  -wig-depth:\n");
122   fprintf(stderr, "     Generate a WIGGLE format file showing the number of kmers in\n");
123   fprintf(stderr, "     any input database that cover each position in the sequence.\n");
124   fprintf(stderr, "\n");
125 
126   if (progname == nullptr)
127     return;
128 
129   fprintf(stderr, "     If multiple databases are supplied, the depth does not change\n");
130   fprintf(stderr, "     when the kmer is present in more than one database.\n");
131   fprintf(stderr, "\n");
132   fprintf(stderr, "     Exactly one input -sequence must be supplied.\n");
133   fprintf(stderr, "\n");
134   fprintf(stderr, "     If no -output path is supplied, output is written to stdout.\n");
135   fprintf(stderr, "\n");
136   fprintf(stderr, "     The output file has format:\n");
137   fprintf(stderr, "         variableStep chrom=<sequence_name>\n");
138   fprintf(stderr, "         <position> <tab> <kmer_depth>\n");
139   fprintf(stderr, "         <position> <tab> <kmer_depth>\n");
140   fprintf(stderr, "\n");
141 }
142 
143 void
helpExistence(char const * progname)144 helpExistence(char const *progname) {
145 
146   if (progname) {
147     fprintf(stderr, "usage: %s -wig-depth \\\n", progname);
148     fprintf(stderr, "         -sequence input.fasta \\\n");
149     fprintf(stderr, "         -output   output.wig \\\n");
150     fprintf(stderr, "         -mers     <input1.meryl> [<input2.meryl>] [...] [-estimate] \\\n");
151   }
152 
153   fprintf(stderr, "  -existence:\n");
154   fprintf(stderr, "     Generate a tab-delimited line for each input sequence with the\n");
155   fprintf(stderr, "     number of kmers in the sequence, in the database and common to both.\n");
156   fprintf(stderr, "\n");
157 
158   if (progname == nullptr)
159     return;
160 
161   fprintf(stderr, "     The number of kmers in common is counted individually for each\n");
162   fprintf(stderr, "     database, but still reported on a single output line.\n");
163   fprintf(stderr, "\n");
164   fprintf(stderr, "     Exactly one input -sequence must be supplied.\n");
165   fprintf(stderr, "\n");
166   fprintf(stderr, "     The -labels option is not used.\n");
167   fprintf(stderr, "\n");
168   fprintf(stderr, "     The output file has format:\n");
169   fprintf(stderr, "         <sequence_name> <tab> <kmers_in_sequence> <tab> <kmers_in_db> <tab> <kmers_shared> [...]\n");
170   fprintf(stderr, "\n");
171   fprintf(stderr, "     With one input database:\n");
172   fprintf(stderr, "         sequence1 <tab> 8415 <tab> 12856825 <tab> 8145\n");
173   fprintf(stderr, "\n");
174   fprintf(stderr, "     With two input databases:\n");
175   fprintf(stderr, "         sequence1 <tab> 8415 <tab> 12856825 <tab> 8145 <tab> 575757256 <tab> 8354\n");
176   fprintf(stderr, "\n");
177 }
178 
179 void
helpIncludeExclude(char const * progname)180 helpIncludeExclude(char const *progname) {
181 
182   if (progname) {
183     fprintf(stderr, "usage: %s [-include | -exclude] \\\n", progname);
184     fprintf(stderr, "         -sequence <input1.fasta> [<input2.fasta>] \\\n");
185     fprintf(stderr, "         -output   <output1>      [<output2>] \\\n");
186     fprintf(stderr, "         -mers     <input1.meryl> [-estimate] \\\n");
187     fprintf(stderr, "         -10x\n");
188     fprintf(stderr, "\n");
189   }
190 
191   fprintf(stderr, "  -include:\n");
192   fprintf(stderr, "  -exclude:\n");
193   fprintf(stderr, "     Copy sequences from 'input1.fasta' (and 'input2.fasta') to the\n");
194   fprintf(stderr, "     corresponding output file if the sequence has at least one kmer\n");
195   fprintf(stderr, "     present (include) or no kmers present (exclude) in 'input1.meryl'.\n");
196   fprintf(stderr, "\n");
197 
198   if (progname == nullptr)
199     return;
200 
201   fprintf(stderr, "  -10x:\n");
202   fprintf(stderr, "     When -10x is supplied, the first 23 bp of every sequence in input1.fasta\n");
203   fprintf(stderr, "     will be ignored while looking up for kmer existence.\n");
204   fprintf(stderr, "\n");
205   fprintf(stderr, "     Exactly one input database must be supplied.  The -labels option is\n");
206   fprintf(stderr, "     not used.\n");
207   fprintf(stderr, "\n");
208   fprintf(stderr, "     When one input sequence is supplied, each sequence is copied to the\n");
209   fprintf(stderr, "     output file if kmers exist (-include) or do not exist (-exclude) in\n");
210   fprintf(stderr, "     the input databse.\n");
211   fprintf(stderr, "\n");
212   fprintf(stderr, "     When two input sequence is supplied, the pair of sequences are copied\n");
213   fprintf(stderr, "     the the output files if kmers from either sequence exist (-include) or\n");
214   fprintf(stderr, "     do not exist (-exclude) in the input databse.\n");
215   fprintf(stderr, "\n");
216 }
217 
218 
219 
220 void
help(char const * progname)221 help(char const *progname) {
222   fprintf(stderr, "usage: %s <report-type> \\\n", progname);
223   fprintf(stderr, "         -sequence <input1.fasta> [<input2.fasta>] \\\n");
224   fprintf(stderr, "         -output   <output1>      [<output2>] \\\n");
225   fprintf(stderr, "         -mers     <input1.meryl> [<input2.meryl>] [...] [-estimate] \\\n");
226   fprintf(stderr, "         -labels   <input1name>   [<input2name>]   [...]\n");
227   fprintf(stderr, "\n");
228   fprintf(stderr, "  Compare kmers in input sequences against kmers in input meryl databases.\n");
229   fprintf(stderr, "\n");
230   fprintf(stderr, "  Input sequences (-sequence) can be FASTA or FASTQ, uncompressed, or\n");
231   fprintf(stderr, "  compressed with gzip, xz, or bzip2.\n");
232   fprintf(stderr, "\n");
233   fprintf(stderr, "  To compute and report only estimated memory usage, add option '-estimate'.\n");
234   fprintf(stderr, "\n");
235   fprintf(stderr, "  Report types:\n");
236   fprintf(stderr, "    Run `%s <report-type> -help` for details on each method.\n", progname);
237   fprintf(stderr, "\n");
238   fprintf(stderr, "\n");
239   helpBED();
240   helpWIGcount();
241   helpWIGdepth();
242   helpExistence();
243   helpIncludeExclude();
244 }
245