1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include <sstream>
28
29 #include "cmdline.hpp"
30 #include "support2.hpp"
31
32 #define TOOL_NAME "fastq-dump"
33 #define TOOL_ORIGINAL_NAME TOOL_NAME "-orig"
34
35 namespace sratools2
36 {
37
38 struct FastqParams final : CmnOptAndAccessions
39 {
40 ncbi::String accession_replacement;
41 ncbi::String table_name;
42 ncbi::String read_filter;
43 ncbi::String aligned_region;
44 ncbi::String matepair_dist;
45 ncbi::String outdir;
46 ncbi::String dumpcs;
47 ncbi::String defline_seq;
48 ncbi::String defline_qual;
49 ncbi::String fasta;
50 std::vector < ncbi::String > spot_groups;
51 bool split_spot, clip, qual_filter, qual_filter1;
52 bool aligned, unaligned, skip_tech, to_stdout, gzip, bzip;
53 bool split_files, split3, spot_group, group_in_dirs;
54 bool keep_empty_files, dumpbase, no_q_for_cskey;
55 bool origfmt, readids, helicos;
56 ncbi::U64 minSpotId;
57 ncbi::U32 minSpotIdCount;
58 ncbi::U64 maxSpotId;
59 ncbi::U32 maxSpotIdCount;
60 ncbi::U32 minReadLen;
61 ncbi::U32 minReadLenCount;
62 ncbi::U32 ReadFilterCount;
63 ncbi::U32 DumpcsCount;
64 ncbi::U32 QOffsetCount;
65 ncbi::U32 QOffset;
66
FastqParamssratools2::FastqParams67 explicit FastqParams(WhatImposter const &what)
68 : CmnOptAndAccessions(what)
69 , accession_replacement( "" )
70 , split_spot( false )
71 , clip( false )
72 , qual_filter( false )
73 , qual_filter1( false )
74 , aligned( false )
75 , unaligned( false )
76 , skip_tech( false )
77 , to_stdout( false )
78 , gzip( false )
79 , bzip( false )
80 , split_files( false )
81 , split3( false )
82 , spot_group( false )
83 , group_in_dirs( false )
84 , keep_empty_files( false )
85 , dumpbase( false )
86 , no_q_for_cskey( false )
87 , origfmt( false )
88 , readids( false )
89 , helicos( false )
90 , minSpotId( 0 )
91 , minSpotIdCount( 0 )
92 , maxSpotId( 0 )
93 , maxSpotIdCount( 0 )
94 , minReadLen( 0 )
95 , minReadLenCount( 0 )
96 , ReadFilterCount( 0 )
97 , DumpcsCount( 0 )
98 , QOffsetCount( 0 )
99 , QOffset( 0 )
100 {}
101
addsratools2::FastqParams102 void add( ncbi::Cmdline &cmdline ) override
103 {
104 cmdline . addOption ( accession_replacement, nullptr, "A", "accession", "<accession>",
105 "Replaces accession derived from <path> in filename(s) and deflines (only for single table dump)" );
106
107 cmdline . addOption ( table_name, nullptr, "", "table", "<table-name>",
108 "Table name within cSRA object, default is \"SEQUENCE\"" );
109
110 cmdline . addOption ( split_spot, "", "split-spot", "Split spots into individual reads" );
111
112 cmdline . addOption ( minSpotId, &minSpotIdCount, "N", "minSpotId", "<rowid>", "Minimum spot id" );
113 cmdline . addOption ( maxSpotId, &maxSpotIdCount, "X", "maxSpotId", "<rowid>", "Maximum spot id" );
114
115 cmdline . addListOption( spot_groups, ',', 255, "", "spot-groups", "<[list]>",
116 "Filter by SPOT_GROUP (member): name[,...]" );
117
118 cmdline . addOption ( clip, "W", "clip", "Remove adapter sequences from reads" );
119
120 cmdline . addOption ( minReadLen, &minReadLenCount, "M", "minReadLen", "<len>",
121 "Filter by sequence length >= <len>" );
122
123 cmdline . addOption ( read_filter, &ReadFilterCount, "R", "read-filter", "<filter>",
124 "Split into files by READ_FILTER value [split], optionally filter by value: [pass|reject|criteria|redacted]" );
125
126 cmdline . addOption ( qual_filter, "E", "qual-filter",
127 "Filter used in early 1000 Genomes data: no sequences starting or ending with >= 10N" );
128
129 cmdline . addOption ( qual_filter1, "", "qual-filter-1",
130 "Filter used in current 1000 Genomes data" );
131
132 cmdline . addOption ( aligned, "", "aligned", "Dump only aligned sequences" );
133 cmdline . addOption ( unaligned, "", "unaligned", "Dump only unaligned sequences" );
134
135 cmdline . addOption ( aligned_region, nullptr, "", "aligned-region", "<name[:from-to]>",
136 "Filter by position on genome. Name can eiter by accession.version (ex: NC_000001.10) "
137 "or file specific name (ex: \"chr1\" or \"1\". \"from\" and \"to\" are 1-based coordinates" );
138
139 cmdline . addOption ( matepair_dist, nullptr, "", "matepair_distance", "<from-to|unknown>",
140 "Filter by distance between matepairs. Use \"unknown\" to find matepairs split between "
141 "the references. Use from-to to limit matepair distance on the same reference" );
142
143 cmdline . addOption ( skip_tech, "", "skip-technical", "Dump only biological reads" );
144
145 cmdline . addOption ( outdir, nullptr, "O", "outdir", "<path>",
146 "Output directory, default is working directory '.'" );
147
148 cmdline . addOption ( to_stdout, "Z", "stdout", "Output to stdout, "
149 "all split data become joined into single stream" );
150
151 cmdline . addOption ( gzip, "", "gzip", "Compress output using gzip: deprecated, not recommended" );
152 cmdline . addOption ( bzip, "", "bzip2", "Compress output using bzip2: deprecated, not recommended" );
153
154 cmdline . addOption ( split_files, "", "split-files",
155 "Write reads into separate files. Read number will be suffixed to the file name. "
156 "NOTE! The `--split-3` option is recommended. In cases where not all spots have the same "
157 "number of reads, this option will produce files that WILL CAUSE ERRORS in most programs "
158 "which process split pair fastq files." );
159
160 cmdline . addOption ( split3, "", "split-3",
161 "3-way splitting for mate-pairs. For each spot, if there are two biological reads "
162 "satisfying filter conditions, the first is placed in the `*_1.fastq` file, and the "
163 "second is placed in the `*_2.fastq` file. If there is only one biological read "
164 "satisfying the filter conditions, it is placed in the `*.fastq` file.All other "
165 "reads in the spot are ignored." );
166
167 cmdline . addOption ( spot_group, "G", "spot-group", "Split into files by SPOT_GROUP (member name)" );
168 cmdline . addOption ( group_in_dirs, "T", "group-in-dirs", "Split into subdirectories instead of files" );
169 cmdline . addOption ( keep_empty_files, "K", "keep-empty-files", "Do not delete empty files" );
170
171 cmdline . addOption ( dumpcs, &DumpcsCount, "C", "dumpcs", "<cskey>",
172 "Formats sequence using color space (default for SOLiD), \"cskey\" may be specified for translation"
173 " or else specify \"dflt\" to use the default value" );
174
175 cmdline . addOption ( dumpbase, "B", "dumpbase",
176 "Formats sequence using base space (default for other than SOLiD)." );
177
178 cmdline . addOption ( QOffset, &QOffsetCount, "Q", "offset", "<integer",
179 "Offset to use for quality conversion, default is 33" );
180
181 // !!! double duty option: mandatory value introduced, "default" ---> "--fasta"
182 // other values ---> "--fasta value"
183 cmdline . addOption ( fasta, nullptr, "", "fasta", "<line-width>",
184 "FASTA only, no qualities, with can be \"default\" or \"0\" for no wrapping" );
185
186 cmdline . addOption ( no_q_for_cskey, "", "suppress-qual-for-cskey", "suppress quality-value for cskey" );
187 cmdline . addOption ( origfmt, "F", "origfmt", "Defline contains only original sequence name" );
188 cmdline . addOption ( readids, "I",
189 "readids", "Append read id after spot id as 'accession.spot.readid' on defline" );
190 cmdline . addOption ( helicos, "", "helicos", "Helicos style defline" );
191
192 cmdline . addOption ( defline_seq, nullptr, "", "defline-seq", "<fmt>",
193 "Defline format specification for sequence." );
194
195 cmdline . addOption ( defline_qual, nullptr, "", "defline-qual", "<fmt>",
196 "Defline format specification for quality. <fmt> is string of characters and/or "
197 "variables. The variables can be one of: $ac - accession, $si spot id, $sn spot "
198 "name, $sg spot group (barcode), $sl spot length in bases, $ri read number, $rn "
199 "read name, $rl read length in bases. '[]' could be used for an optional output: if "
200 "all vars in [] yield empty values whole group is not printed. Empty value is empty "
201 "string or for numeric variables. Ex: @$sn[_$rn]/$ri '_$rn' is omitted if name is empty" );
202
203 CmnOptAndAccessions::add(cmdline);
204
205 // add a silent option...
206 cmdline . startSilentOptions();
207 cmdline . addOption ( split3, "", "split-e", "See split-3" );
208
209 }
210
showsratools2::FastqParams211 std::ostream &show(std::ostream &ss) const override
212 {
213 if ( !accession_replacement.isEmpty() ) ss << "acc-replace : " << accession_replacement << std::endl;
214 if ( !table_name.isEmpty() ) ss << "table-name : " << table_name << std::endl;
215 if ( split_spot ) ss << "split-spot" << std::endl;
216 if ( minSpotIdCount > 0 ) ss << "minSpotId : " << minSpotId << std::endl;
217 if ( maxSpotIdCount > 0 ) ss << "maxSpotId : " << maxSpotId << std::endl;
218 print_vec( ss, spot_groups, "spot-groups : " );
219 if ( clip ) ss << "clip" << std::endl;
220 if ( minReadLenCount > 0 ) ss << "minReadLen : " << minReadLen << std::endl;
221 if ( ReadFilterCount > 0 ) ss << "read-filter : '" << read_filter << "'" << std::endl;
222 if ( qual_filter ) ss << "qual-filter" << std::endl;
223 if ( qual_filter1 ) ss << "qual-filter-1" << std::endl;
224 if ( aligned ) ss << "aligned" << std::endl;
225 if ( unaligned ) ss << "unaligned" << std::endl;
226 if ( !aligned_region.isEmpty() ) ss << "aligned-region : " << aligned_region << std::endl;
227 if ( !matepair_dist.isEmpty() ) ss << "matepair-dist : " << matepair_dist << std::endl;
228 if ( skip_tech ) ss << "skip-tech" << std::endl;
229 if ( !outdir.isEmpty() ) ss << "outdir : " << outdir << std::endl;
230 if ( to_stdout ) ss << "stdout" << std::endl;
231 if ( gzip ) ss << "gzip" << std::endl;
232 if ( bzip ) ss << "bzip2" << std::endl;
233 if ( split_files ) ss << "split-files" << std::endl;
234 if ( split3 ) ss << "split-3" << std::endl;
235 if ( spot_group ) ss << "splot-gropu" << std::endl;
236 if ( group_in_dirs ) ss << "group-in-dirs" << std::endl;
237 if ( keep_empty_files ) ss << "keep-empty-files" << std::endl;
238 if ( DumpcsCount > 0 ) ss << "dumpcs : '" << dumpcs << "'" << std::endl;
239 if ( dumpbase ) ss << "dumpbase" << std::endl;
240 if ( QOffsetCount > 0 ) ss << "offset : " << QOffset << std::endl;
241 if ( !fasta.isEmpty() ) ss << "fasta : " << fasta << std::endl;
242 if ( no_q_for_cskey ) ss << "suppress-qual-for-cskey" << std::endl;
243 if ( origfmt ) ss << "origfmt" << std::endl;
244 if ( readids ) ss << "readids" << std::endl;
245 if ( helicos ) ss << "helicos" << std::endl;
246 if ( !defline_seq.isEmpty() ) ss << "defline-seq: '" << defline_seq << "'" << std::endl;
247 if ( !defline_qual.isEmpty() ) ss << "defline-qual: '" << defline_qual << "'" << std::endl;
248 return CmnOptAndAccessions::show(ss);
249 }
250
populate_argv_buildersratools2::FastqParams251 void populate_argv_builder( ArgvBuilder & builder, int acc_index, std::vector<ncbi::String> const &accessions ) const override
252 {
253 populate_common_argv_builder(builder, acc_index, accessions, fastq_dump);
254
255 if ( !accession_replacement.isEmpty() ) builder . add_option( "-A", accession_replacement );
256 if ( !table_name.isEmpty() ) builder . add_option( "--table", table_name );
257 if ( split_spot ) builder . add_option( "--split-spot" );
258 if ( minSpotIdCount > 0 ) builder . add_option( "-N", minSpotId );
259 if ( maxSpotIdCount > 0 ) builder . add_option( "-X", maxSpotId );
260 if ( spot_groups.size() > 0 ) {
261 auto list = spot_groups[0].toSTLString();
262 int i = 0;
263 for (auto & value : spot_groups) {
264 if (i++ > 0) {
265 list += ',' + value.toSTLString();
266 }
267 }
268 builder.add_option("--spot-groups", list);
269 }
270 if ( clip ) builder . add_option( "-W" );
271 if ( minReadLenCount > 0 ) builder . add_option( "-M", minReadLen );
272
273 // problem-child: !!! read-filter has dual form: with and without value !!!
274 if ( ReadFilterCount > 0 )
275 {
276 if ( read_filter == "split" )
277 builder . add_option( "-R" );
278 else
279 builder . add_option( "-R", read_filter );
280 }
281
282 if ( qual_filter ) builder . add_option( "-E" );
283 if ( qual_filter1 ) builder . add_option( "--qual-filter-1" );
284 if ( aligned ) builder . add_option( "--aligned" );
285 if ( unaligned ) builder . add_option( "--unaligned" );
286 if ( !aligned_region.isEmpty() ) builder . add_option( "--aligned-region", aligned_region );
287 if ( !matepair_dist.isEmpty() ) builder . add_option( "--matepair-distance", matepair_dist );
288 if ( skip_tech ) builder . add_option( "--skip-technical" );
289 if ( !outdir.isEmpty() ) builder . add_option( "-O", outdir );
290 if ( to_stdout ) builder . add_option( "-Z" );
291 if ( gzip ) builder . add_option( "--gzip" );
292 if ( bzip ) builder . add_option( "--bizp2" );
293 if ( split_files ) builder . add_option( "--split-files" );
294 if ( split3 ) builder . add_option( "--split-3" );
295 if ( spot_group ) builder . add_option( "--spot-group" );
296 if ( group_in_dirs ) builder . add_option( "-T" );
297 if ( keep_empty_files ) builder . add_option( "-K" );
298
299 // problem-child: !!! dumpcs has dual form: with and without value !!!
300 if ( DumpcsCount > 0 )
301 {
302 if ( dumpcs == "default" )
303 builder . add_option( "-C" );
304 else
305 builder . add_option( "-C", dumpcs );
306 }
307
308 if ( dumpbase ) builder . add_option( "-B" );
309 if ( QOffsetCount > 0 ) builder . add_option( "-Q", QOffset );
310
311 // problem-child: !!! fasta has dual form: with and without value !!!
312 // we have ommited the possibility to specify the line-width optionally
313 if ( !fasta.isEmpty() )
314 {
315 if ( fasta.equal( "default" ) )
316 builder . add_option( "--fasta" );
317 else
318 builder . add_option( "--fasta", fasta );
319 }
320
321 if ( no_q_for_cskey ) builder . add_option( "--suppress-qual-for-cskey" );
322 if ( origfmt ) builder . add_option( "-F" );
323 if ( readids ) builder . add_option( "-I" );
324 if ( helicos ) builder . add_option( "--helicos" );
325 if ( !defline_seq.isEmpty() ) builder . add_option( "--defline-seq", defline_seq );
326 if ( !defline_qual.isEmpty() ) builder . add_option( "--defline-qual", defline_qual );
327 }
328
checksratools2::FastqParams329 bool check() const override
330 {
331 int problems = 0;
332
333 if (!fasta.isEmpty() && fasta != "default") {
334 for (auto && ch : fasta.toSTLString()) {
335 if (ch < '0' || ch > '9') {
336 std::cerr << "fasta requires an integer value >= 0 or \"default\"" << std::endl;
337 problems++;
338 break;
339 }
340 }
341 }
342 if ( bzip && gzip )
343 {
344 std::cerr << "bzip2 and gzip cannot both be used at the same time" << std::endl;
345 problems++;
346 }
347 if ( !read_filter.isEmpty() )
348 {
349 if ( !is_one_of( read_filter, 5, "split", "pass", "reject", "criterial", "redacted" ) )
350 {
351 std::cerr << "invalid read-filter-value: " << read_filter << std::endl;
352 problems++;
353 }
354
355 }
356
357 return CmnOptAndAccessions::check() && ( problems == 0 );
358 }
359
runsratools2::FastqParams360 int run() const override {
361 auto const theirArgv0 = what.toolpath.getPathFor(TOOL_NAME).fullpath();
362 {
363 auto const realpath = what.toolpath.getPathFor(TOOL_ORIGINAL_NAME);
364 if (realpath.executable())
365 return ToolExec::run(TOOL_NAME, realpath.fullpath(), theirArgv0, *this, accessions);
366 }
367 throw std::runtime_error(TOOL_NAME " was not found or is not executable.");
368 }
369
370 };
371
impersonate_fastq_dump(const Args & args,WhatImposter const & what)372 int impersonate_fastq_dump( const Args &args, WhatImposter const &what )
373 {
374 #if DEBUG || _DEBUGGING
375 FastqParams temp(what);
376 auto ¶ms = *randomized(&temp, what);
377 #else
378 FastqParams params(what);
379 #endif
380 return Impersonator::run(args, params);
381 }
382
383 } // namespace
384