1 /* ===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #include <sstream>
28 
29 #include "cmdline.hpp"
30 #include "support2.hpp"
31 
32 #define TOOL_NAME "fastq-dump"
33 #define TOOL_ORIGINAL_NAME TOOL_NAME "-orig"
34 
35 namespace sratools2
36 {
37 
38 struct FastqParams final : CmnOptAndAccessions
39 {
40     ncbi::String accession_replacement;
41     ncbi::String table_name;
42     ncbi::String read_filter;
43     ncbi::String aligned_region;
44     ncbi::String matepair_dist;
45     ncbi::String outdir;
46     ncbi::String dumpcs;
47     ncbi::String defline_seq;
48     ncbi::String defline_qual;
49     ncbi::String fasta;
50     std::vector < ncbi::String > spot_groups;
51     bool split_spot, clip, qual_filter, qual_filter1;
52     bool aligned, unaligned, skip_tech, to_stdout, gzip, bzip;
53     bool split_files, split3, spot_group, group_in_dirs;
54     bool keep_empty_files, dumpbase, no_q_for_cskey;
55     bool origfmt, readids, helicos;
56     ncbi::U64 minSpotId;
57     ncbi::U32 minSpotIdCount;
58     ncbi::U64 maxSpotId;
59     ncbi::U32 maxSpotIdCount;
60     ncbi::U32 minReadLen;
61     ncbi::U32 minReadLenCount;
62     ncbi::U32 ReadFilterCount;
63     ncbi::U32 DumpcsCount;
64     ncbi::U32 QOffsetCount;
65     ncbi::U32 QOffset;
66 
FastqParamssratools2::FastqParams67     explicit FastqParams(WhatImposter const &what)
68     : CmnOptAndAccessions(what)
69     , accession_replacement( "" )
70     , split_spot( false )
71     , clip( false )
72     , qual_filter( false )
73     , qual_filter1( false )
74     , aligned( false )
75     , unaligned( false )
76     , skip_tech( false )
77     , to_stdout( false )
78     , gzip( false )
79     , bzip( false )
80     , split_files( false )
81     , split3( false )
82     , spot_group( false )
83     , group_in_dirs( false )
84     , keep_empty_files( false )
85     , dumpbase( false )
86     , no_q_for_cskey( false )
87     , origfmt( false )
88     , readids( false )
89     , helicos( false )
90     , minSpotId( 0 )
91     , minSpotIdCount( 0 )
92     , maxSpotId( 0 )
93     , maxSpotIdCount( 0 )
94     , minReadLen( 0 )
95     , minReadLenCount( 0 )
96     , ReadFilterCount( 0 )
97     , DumpcsCount( 0 )
98     , QOffsetCount( 0 )
99     , QOffset( 0 )
100     {}
101 
addsratools2::FastqParams102     void add( ncbi::Cmdline &cmdline ) override
103     {
104         cmdline . addOption ( accession_replacement, nullptr, "A", "accession", "<accession>",
105             "Replaces accession derived from <path> in filename(s) and deflines (only for single table dump)" );
106 
107         cmdline . addOption ( table_name, nullptr, "", "table", "<table-name>",
108             "Table name within cSRA object, default is \"SEQUENCE\"" );
109 
110         cmdline . addOption ( split_spot, "", "split-spot", "Split spots into individual reads" );
111 
112         cmdline . addOption ( minSpotId, &minSpotIdCount, "N", "minSpotId", "<rowid>", "Minimum spot id" );
113         cmdline . addOption ( maxSpotId, &maxSpotIdCount, "X", "maxSpotId", "<rowid>", "Maximum spot id" );
114 
115         cmdline . addListOption( spot_groups, ',', 255, "", "spot-groups", "<[list]>",
116             "Filter by SPOT_GROUP (member): name[,...]" );
117 
118         cmdline . addOption ( clip, "W", "clip", "Remove adapter sequences from reads" );
119 
120         cmdline . addOption ( minReadLen, &minReadLenCount, "M", "minReadLen", "<len>",
121             "Filter by sequence length >= <len>" );
122 
123         cmdline . addOption ( read_filter, &ReadFilterCount, "R", "read-filter", "<filter>",
124             "Split into files by READ_FILTER value [split], optionally filter by value: [pass|reject|criteria|redacted]" );
125 
126         cmdline . addOption ( qual_filter, "E", "qual-filter",
127             "Filter used in early 1000 Genomes data: no sequences starting or ending with >= 10N" );
128 
129         cmdline . addOption ( qual_filter1, "", "qual-filter-1",
130             "Filter used in current 1000 Genomes data" );
131 
132         cmdline . addOption ( aligned, "", "aligned", "Dump only aligned sequences" );
133         cmdline . addOption ( unaligned, "", "unaligned", "Dump only unaligned sequences" );
134 
135         cmdline . addOption ( aligned_region, nullptr, "", "aligned-region", "<name[:from-to]>",
136             "Filter by position on genome. Name can eiter by accession.version (ex: NC_000001.10) "
137             "or file specific name (ex: \"chr1\" or \"1\". \"from\" and \"to\" are 1-based coordinates" );
138 
139         cmdline . addOption ( matepair_dist, nullptr, "", "matepair_distance", "<from-to|unknown>",
140             "Filter by distance between matepairs. Use \"unknown\" to find matepairs split between "
141             "the references. Use from-to to limit matepair distance on the same reference" );
142 
143         cmdline . addOption ( skip_tech, "", "skip-technical", "Dump only biological reads" );
144 
145         cmdline . addOption ( outdir, nullptr, "O", "outdir", "<path>",
146             "Output directory, default is working directory '.'" );
147 
148         cmdline . addOption ( to_stdout, "Z", "stdout", "Output to stdout, "
149             "all split data become joined into single stream" );
150 
151         cmdline . addOption ( gzip, "", "gzip", "Compress output using gzip: deprecated, not recommended" );
152         cmdline . addOption ( bzip, "", "bzip2", "Compress output using bzip2: deprecated, not recommended" );
153 
154         cmdline . addOption ( split_files, "", "split-files",
155             "Write reads into separate files. Read number will be suffixed to the file name. "
156             "NOTE! The `--split-3` option is recommended. In cases where not all spots have the same "
157             "number of reads, this option will produce files that WILL CAUSE ERRORS in most programs "
158             "which process split pair fastq files." );
159 
160         cmdline . addOption ( split3, "", "split-3",
161             "3-way splitting for mate-pairs. For each spot, if there are two biological reads "
162             "satisfying filter conditions, the first is placed in the `*_1.fastq` file, and the "
163             "second is placed in the `*_2.fastq` file. If there is only one biological read "
164             "satisfying the filter conditions, it is placed in the `*.fastq` file.All other "
165             "reads in the spot are ignored." );
166 
167         cmdline . addOption ( spot_group, "G", "spot-group", "Split into files by SPOT_GROUP (member name)" );
168         cmdline . addOption ( group_in_dirs, "T", "group-in-dirs", "Split into subdirectories instead of files" );
169         cmdline . addOption ( keep_empty_files, "K", "keep-empty-files", "Do not delete empty files" );
170 
171         cmdline . addOption ( dumpcs, &DumpcsCount, "C", "dumpcs", "<cskey>",
172             "Formats sequence using color space (default for SOLiD), \"cskey\" may be specified for translation"
173             " or else specify \"dflt\" to use the default value" );
174 
175         cmdline . addOption ( dumpbase, "B", "dumpbase",
176             "Formats sequence using base space (default for other than SOLiD)." );
177 
178         cmdline . addOption ( QOffset, &QOffsetCount, "Q", "offset", "<integer",
179             "Offset to use for quality conversion, default is 33" );
180 
181         // !!! double duty option: mandatory value introduced, "default" ---> "--fasta"
182         // other values ---> "--fasta value"
183         cmdline . addOption ( fasta, nullptr, "", "fasta", "<line-width>",
184             "FASTA only, no qualities, with can be \"default\" or \"0\" for no wrapping" );
185 
186         cmdline . addOption ( no_q_for_cskey, "", "suppress-qual-for-cskey", "suppress quality-value for cskey" );
187         cmdline . addOption ( origfmt, "F", "origfmt", "Defline contains only original sequence name" );
188         cmdline . addOption ( readids, "I",
189             "readids", "Append read id after spot id as 'accession.spot.readid' on defline" );
190         cmdline . addOption ( helicos, "", "helicos", "Helicos style defline" );
191 
192         cmdline . addOption ( defline_seq, nullptr, "", "defline-seq", "<fmt>",
193             "Defline format specification for sequence." );
194 
195         cmdline . addOption ( defline_qual, nullptr, "", "defline-qual", "<fmt>",
196             "Defline format specification for quality. <fmt> is string of characters and/or "
197             "variables. The variables can be one of: $ac - accession, $si spot id, $sn spot "
198             "name, $sg spot group (barcode), $sl spot length in bases, $ri read number, $rn "
199             "read name, $rl read length in bases. '[]' could be used for an optional output: if "
200             "all vars in [] yield empty values whole group is not printed. Empty value is empty "
201             "string or for numeric variables. Ex: @$sn[_$rn]/$ri '_$rn' is omitted if name is empty" );
202 
203         CmnOptAndAccessions::add(cmdline);
204 
205         // add a silent option...
206         cmdline . startSilentOptions();
207         cmdline . addOption ( split3, "", "split-e", "See split-3" );
208 
209     }
210 
showsratools2::FastqParams211     std::ostream &show(std::ostream &ss) const override
212     {
213         if ( !accession_replacement.isEmpty() )  ss << "acc-replace : " << accession_replacement << std::endl;
214         if ( !table_name.isEmpty() )  ss << "table-name : " << table_name << std::endl;
215         if ( split_spot ) ss << "split-spot" << std::endl;
216         if ( minSpotIdCount > 0 ) ss << "minSpotId : " << minSpotId << std::endl;
217         if ( maxSpotIdCount > 0 ) ss << "maxSpotId : " << maxSpotId << std::endl;
218         print_vec( ss, spot_groups, "spot-groups : " );
219         if ( clip ) ss << "clip" << std::endl;
220         if ( minReadLenCount > 0 ) ss << "minReadLen : " << minReadLen << std::endl;
221         if ( ReadFilterCount > 0 )  ss << "read-filter : '" << read_filter << "'" << std::endl;
222         if ( qual_filter ) ss << "qual-filter" << std::endl;
223         if ( qual_filter1 ) ss << "qual-filter-1" << std::endl;
224         if ( aligned ) ss << "aligned" << std::endl;
225         if ( unaligned ) ss << "unaligned" << std::endl;
226         if ( !aligned_region.isEmpty() )  ss << "aligned-region : " << aligned_region << std::endl;
227         if ( !matepair_dist.isEmpty() )  ss << "matepair-dist : " << matepair_dist << std::endl;
228         if ( skip_tech ) ss << "skip-tech" << std::endl;
229         if ( !outdir.isEmpty() )  ss << "outdir : " << outdir << std::endl;
230         if ( to_stdout ) ss << "stdout" << std::endl;
231         if ( gzip ) ss << "gzip" << std::endl;
232         if ( bzip ) ss << "bzip2" << std::endl;
233         if ( split_files ) ss << "split-files" << std::endl;
234         if ( split3 ) ss << "split-3" << std::endl;
235         if ( spot_group ) ss << "splot-gropu" << std::endl;
236         if ( group_in_dirs ) ss << "group-in-dirs" << std::endl;
237         if ( keep_empty_files ) ss << "keep-empty-files" << std::endl;
238         if ( DumpcsCount > 0 )  ss << "dumpcs : '" << dumpcs << "'" << std::endl;
239         if ( dumpbase ) ss << "dumpbase" << std::endl;
240         if ( QOffsetCount > 0 ) ss << "offset : " << QOffset << std::endl;
241         if ( !fasta.isEmpty() ) ss << "fasta : " << fasta << std::endl;
242         if ( no_q_for_cskey ) ss << "suppress-qual-for-cskey" << std::endl;
243         if ( origfmt ) ss << "origfmt" << std::endl;
244         if ( readids ) ss << "readids" << std::endl;
245         if ( helicos ) ss << "helicos" << std::endl;
246         if ( !defline_seq.isEmpty() ) ss << "defline-seq: '" << defline_seq << "'" << std::endl;
247         if ( !defline_qual.isEmpty() ) ss << "defline-qual: '" << defline_qual << "'" << std::endl;
248         return CmnOptAndAccessions::show(ss);
249     }
250 
populate_argv_buildersratools2::FastqParams251     void populate_argv_builder( ArgvBuilder & builder, int acc_index, std::vector<ncbi::String> const &accessions ) const override
252     {
253         populate_common_argv_builder(builder, acc_index, accessions, fastq_dump);
254 
255         if ( !accession_replacement.isEmpty() ) builder . add_option( "-A", accession_replacement );
256         if ( !table_name.isEmpty() ) builder . add_option( "--table", table_name );
257         if ( split_spot ) builder . add_option( "--split-spot" );
258         if ( minSpotIdCount > 0 ) builder . add_option( "-N", minSpotId );
259         if ( maxSpotIdCount > 0 ) builder . add_option( "-X", maxSpotId );
260         if ( spot_groups.size() > 0 ) {
261             auto list = spot_groups[0].toSTLString();
262             int i = 0;
263             for (auto & value : spot_groups) {
264                 if (i++ > 0) {
265                     list += ',' + value.toSTLString();
266                 }
267             }
268             builder.add_option("--spot-groups", list);
269         }
270         if ( clip ) builder . add_option( "-W" );
271         if ( minReadLenCount > 0 ) builder . add_option( "-M", minReadLen );
272 
273         // problem-child: !!! read-filter has dual form: with and without value !!!
274         if ( ReadFilterCount > 0 )
275         {
276             if ( read_filter == "split" )
277                 builder . add_option( "-R" );
278             else
279                 builder . add_option( "-R", read_filter );
280         }
281 
282         if ( qual_filter ) builder . add_option( "-E" );
283         if ( qual_filter1 ) builder . add_option( "--qual-filter-1" );
284         if ( aligned ) builder . add_option( "--aligned" );
285         if ( unaligned ) builder . add_option( "--unaligned" );
286         if ( !aligned_region.isEmpty() ) builder . add_option( "--aligned-region", aligned_region );
287         if ( !matepair_dist.isEmpty() ) builder . add_option( "--matepair-distance", matepair_dist );
288         if ( skip_tech ) builder . add_option( "--skip-technical" );
289         if ( !outdir.isEmpty() ) builder . add_option( "-O", outdir );
290         if ( to_stdout ) builder . add_option( "-Z" );
291         if ( gzip ) builder . add_option( "--gzip" );
292         if ( bzip ) builder . add_option( "--bizp2" );
293         if ( split_files ) builder . add_option( "--split-files" );
294         if ( split3 ) builder . add_option( "--split-3" );
295         if ( spot_group ) builder . add_option( "--spot-group" );
296         if ( group_in_dirs ) builder . add_option( "-T" );
297         if ( keep_empty_files ) builder . add_option( "-K" );
298 
299         // problem-child: !!! dumpcs has dual form: with and without value !!!
300         if ( DumpcsCount > 0 )
301         {
302             if ( dumpcs == "default" )
303                 builder . add_option( "-C" );
304             else
305                 builder . add_option( "-C", dumpcs );
306         }
307 
308         if ( dumpbase ) builder . add_option( "-B" );
309         if ( QOffsetCount > 0 ) builder . add_option( "-Q", QOffset );
310 
311         // problem-child: !!! fasta has dual form: with and without value !!!
312         // we have ommited the possibility to specify the line-width optionally
313         if ( !fasta.isEmpty() )
314         {
315             if ( fasta.equal( "default" ) )
316                 builder . add_option( "--fasta" );
317             else
318                 builder . add_option( "--fasta", fasta );
319         }
320 
321         if ( no_q_for_cskey ) builder . add_option( "--suppress-qual-for-cskey" );
322         if ( origfmt ) builder . add_option( "-F" );
323         if ( readids ) builder . add_option( "-I" );
324         if ( helicos ) builder . add_option( "--helicos" );
325         if ( !defline_seq.isEmpty() ) builder . add_option( "--defline-seq", defline_seq );
326         if ( !defline_qual.isEmpty() ) builder . add_option( "--defline-qual", defline_qual );
327     }
328 
checksratools2::FastqParams329     bool check() const override
330     {
331         int problems = 0;
332 
333         if (!fasta.isEmpty() && fasta != "default") {
334             for (auto && ch : fasta.toSTLString()) {
335                 if (ch < '0' || ch > '9') {
336                     std::cerr << "fasta requires an integer value >= 0 or \"default\"" << std::endl;
337                     problems++;
338                     break;
339                 }
340             }
341         }
342         if ( bzip && gzip )
343         {
344             std::cerr << "bzip2 and gzip cannot both be used at the same time" << std::endl;
345             problems++;
346         }
347         if ( !read_filter.isEmpty() )
348         {
349             if ( !is_one_of( read_filter, 5, "split", "pass", "reject", "criterial", "redacted" ) )
350             {
351                 std::cerr << "invalid read-filter-value: " << read_filter << std::endl;
352                 problems++;
353             }
354 
355         }
356 
357         return CmnOptAndAccessions::check() && ( problems == 0 );
358     }
359 
runsratools2::FastqParams360     int run() const override {
361         auto const theirArgv0 = what.toolpath.getPathFor(TOOL_NAME).fullpath();
362         {
363             auto const realpath = what.toolpath.getPathFor(TOOL_ORIGINAL_NAME);
364             if (realpath.executable())
365                 return ToolExec::run(TOOL_NAME, realpath.fullpath(), theirArgv0, *this, accessions);
366         }
367         throw std::runtime_error(TOOL_NAME " was not found or is not executable.");
368     }
369 
370 };
371 
impersonate_fastq_dump(const Args & args,WhatImposter const & what)372 int impersonate_fastq_dump( const Args &args, WhatImposter const &what )
373 {
374 #if DEBUG || _DEBUGGING
375     FastqParams temp(what);
376     auto &params = *randomized(&temp, what);
377 #else
378     FastqParams params(what);
379 #endif
380     return Impersonator::run(args, params);
381 }
382 
383 } // namespace
384