1 // ==========================================================================
2 //                                 FX Tools
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 
35 #include <map>
36 
37 #include <seqan/arg_parse.h>
38 #include <seqan/basic.h>
39 #include <seqan/seq_io.h>
40 #include <seqan/sequence.h>
41 
42 // ==========================================================================
43 // Classes
44 // ==========================================================================
45 
46 // --------------------------------------------------------------------------
47 // Class AppOptions
48 // --------------------------------------------------------------------------
49 
50 // This struct stores the options from the command line.
51 //
52 // You might want to rename this to reflect the name of your app.
53 
54 struct AppOptions
55 {
56     // Verbosity level.  0 -- quiet, 1 -- normal, 2 -- verbose, 3 -- very verbose.
57     int verbosity;
58 
59     // The input file name is a string.
60     seqan::CharString inFilename;
61 
62     // The out file name is an out file.
63     seqan::CharString outFilename;
64 
AppOptionsAppOptions65     AppOptions() :
66         verbosity(1)
67     {}
68 };
69 
70 // --------------------------------------------------------------------------
71 // Class FastqStats
72 // --------------------------------------------------------------------------
73 
74 struct FastqStats
75 {
76     // -----------------------------------------------------------------------
77     // Members with Results
78     // -----------------------------------------------------------------------
79 
80     unsigned maxLength;
81 
82     // Number of bases in column i.
83     seqan::String<int64_t> numBases;
84     // Smallest score in column i.
85     seqan::String<int32_t> minScores;
86     // Largest score in column i.
87     seqan::String<int32_t> maxScores;
88     // Sum of scores in column i.
89     seqan::String<int64_t> sumScores;
90     // Mean of scores in column i.
91     seqan::String<double> meanScores;
92     // First quartile quality score (Q1) for column i.
93     seqan::String<double> firstQuartiles;
94     // Median quality score for column i.
95     seqan::String<double> medianScores;
96     // Third quartile quality score (Q3) for column i.
97     seqan::String<double> thirdQuartiles;
98     // Inter-quartile range  (Q3-Q1) for column i.
99     seqan::String<double> interQuartileRanges;
100     // Left-whisker value for boxplotting for column i.
101     seqan::String<int32_t> leftWhiskers;
102     // Right-whisker value for boxplotting for column i.
103     seqan::String<int32_t> rightWhiskers;
104     // Number of nucleotides A, C, G, T, N for column i.
105     seqan::String<seqan::String<int64_t> > nucleotideCounts;
106 
107     // -----------------------------------------------------------------------
108     // Histogram Members
109     // -----------------------------------------------------------------------
110 
111     // Quality histogram.
112     seqan::String<std::map<int32_t, int32_t> > qualHistos;
113 
114     // -----------------------------------------------------------------------
115     // Constructor
116     // -----------------------------------------------------------------------
117 
FastqStatsFastqStats118     FastqStats() : maxLength(0)
119     {}
120 
121     // -----------------------------------------------------------------------
122     // Member Functions
123     // -----------------------------------------------------------------------
124 
125     // Resize members to read of length.
resizeToReadLengthFastqStats126     void resizeToReadLength(unsigned n)
127     {
128         if (maxLength == n)
129             return;
130         maxLength = n;
131 
132         resize(numBases, n, 0);
133         resize(minScores, n, 80);
134         resize(maxScores, n, 0);
135         resize(sumScores, n, 0);
136         resize(meanScores, n, 0);
137         resize(firstQuartiles, n, 0);
138         resize(medianScores, n, 0);
139         resize(thirdQuartiles, n, 0);
140         resize(interQuartileRanges, n, 0);
141         resize(leftWhiskers, n, 0);
142         resize(rightWhiskers, n, 0);
143         resize(nucleotideCounts, n);
144         for (unsigned i = 0; i < n; ++i)
145             resize(nucleotideCounts[i], 5, 0);
146 
147         resize(qualHistos, n);
148     }
149 
150     // Update histogram and statistics for the given read.
registerReadFastqStats151     void registerRead(seqan::Dna5String const & seq, seqan::CharString const & quals)
152     {
153         resizeToReadLength(length(seq));
154 
155         // Update nucleotide counts.
156         for (unsigned i = 0; i < length(seq); ++i)
157         {
158             numBases[i] += 1;
159             nucleotideCounts[i][ordValue(seq[i])] += 1;
160         }
161 
162         // Update quality histograms and statistics.
163         for (unsigned i = 0; i < length(quals); ++i)
164         {
165             int qual = quals[i] - '!';  // PHRED scores.
166             if (numBases[i] == 0u || minScores[i] > qual)
167                 minScores[i] = qual;
168             if (numBases[i] == 0u || maxScores[i] < qual)
169                 maxScores[i] = qual;
170             qualHistos[i][qual] += 1;
171             sumScores[i] += qual;
172         }
173     }
174 
175     // Compute statistics after updating for the last read.
finalizeStatsFastqStats176     void finalizeStats()
177     {
178         // Compute score means.
179         for (unsigned i = 0; i < length(meanScores); ++i)
180             meanScores[i] = (1.0 * sumScores[i]) / numBases[i];
181         // Compute score medians and quartiles.
182         for (unsigned i = 0; i < length(qualHistos); ++i)
183         {
184             if (qualHistos[i].size() == 0u)
185                 continue;  // Skip if empty.
186 
187             // TODO(holtgrew): Quartile/median computation not mathematically correct yet.
188             unsigned n = numBases[i];  // Number of bases.
189             unsigned firstQN = n / 4;
190             unsigned medianN = n / 2;
191             unsigned thirdQN = (3 * n) / 4;
192             unsigned count = 0;  // Number of bases up to here.
193             for (std::map<int32_t, int32_t>::const_iterator it = qualHistos[i].begin(); it != qualHistos[i].end(); ++it)
194             {
195                 if (count < firstQN && count + it->second >= firstQN)
196                     firstQuartiles[i] = it->first;
197                 if (count < medianN && count + it->second >= medianN)
198                     medianScores[i] = it->first;
199                 if (count < thirdQN && count + it->second >= thirdQN)
200                     thirdQuartiles[i] = it->first;
201 
202                 count += it->second;
203             }
204             interQuartileRanges[i] = (thirdQuartiles[i] - firstQuartiles[i]);
205         }
206 
207         // Compute whiskers as the data point that is still within 1.5 IQR of the lower quartile.
208         for (unsigned i = 0; i < length(qualHistos); ++i)
209         {
210             if (qualHistos[i].size() == 0u)
211                 continue;  // Skip if empty.
212             leftWhiskers[i] = static_cast<int>(firstQuartiles[i]);
213             rightWhiskers[i] = static_cast<int>(thirdQuartiles[i]);
214             double leftWhiskerBound = ((double)firstQuartiles[i]) - 1.5 * interQuartileRanges[i];
215             double rightWhiskerBound = ((double)thirdQuartiles[i]) + 1.5 * interQuartileRanges[i];
216             for (std::map<int32_t, int32_t>::const_iterator it = qualHistos[i].begin(); it != qualHistos[i].end(); ++it)
217                 if (leftWhiskers[i] > it->first && it->first >= leftWhiskerBound)
218                     leftWhiskers[i] = it->first;
219             for (std::map<int32_t, int32_t>::const_iterator it = qualHistos[i].begin(); it != qualHistos[i].end(); ++it)
220                 if (rightWhiskers[i] < it->first && it->first <= rightWhiskerBound)
221                     rightWhiskers[i] = it->first;
222         }
223     }
224 };
225 
226 // ==========================================================================
227 // Functions
228 // ==========================================================================
229 
230 // --------------------------------------------------------------------------
231 // Function parseCommandLine()
232 // --------------------------------------------------------------------------
233 
234 seqan::ArgumentParser::ParseResult
parseCommandLine(AppOptions & options,int argc,char const ** argv)235 parseCommandLine(AppOptions & options, int argc, char const ** argv)
236 {
237     // Setup ArgumentParser.
238     seqan::ArgumentParser parser("fx_fastq_stats");
239     // Set short description, version, and date.
240     setShortDescription(parser, "Compute FASTQ statistics.");
241     setCategory(parser, "NGS Quality Control");
242     setVersion(parser, SEQAN_APP_VERSION " [" SEQAN_REVISION "]");
243     setDate(parser, SEQAN_DATE);
244 
245     // Define usage line and long description.
246     addUsageLine(parser, "\\fB-i\\fP \\fIINPUT.fq\\fP \\fB-o\\fP \\fIOUTPUT.fq.stats.tsv\\fP");
247     addDescription(parser,
248                    "Read a FASTQ file with reads sequences of the same length.  Writes out a TSV file with one record for "
249                    "each column/position with statistics on the nucleotides and qualities.");
250 
251     addSection(parser, "Input / Output");
252     addOption(parser, seqan::ArgParseOption("i", "input", "Input FASTQ file.", seqan::ArgParseOption::INPUT_FILE, "INPUT"));
253     setValidValues(parser, "input", "fastq fq");
254     setRequired(parser, "input");
255     addOption(parser, seqan::ArgParseOption("o", "output", "Output TSV file.", seqan::ArgParseOption::OUTPUT_FILE, "OUTPUT"));
256     setRequired(parser, "output");
257     setValidValues(parser, "output", "fq_stats_tsv");
258 
259     // Parse command line.
260     seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv);
261 
262     // Only extract  options if the program will continue after parseCommandLine()
263     if (res != seqan::ArgumentParser::PARSE_OK)
264         return res;
265 
266     seqan::getOptionValue(options.inFilename, parser, "input");
267     seqan::getOptionValue(options.outFilename, parser, "output");
268 
269     return seqan::ArgumentParser::PARSE_OK;
270 }
271 
272 // --------------------------------------------------------------------------
273 // Function main()
274 // --------------------------------------------------------------------------
275 
main(int argc,char const ** argv)276 int main(int argc, char const ** argv)
277 {
278     // Parse the command line.
279     seqan::ArgumentParser parser;
280     AppOptions options;
281     seqan::ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv);
282     // If there was an error parsing or built-in argument parser functionality
283     // was triggered then we exit the program.  The return code is 1 if there
284     // were errors and 0 if there were none.
285     if (res != seqan::ArgumentParser::PARSE_OK)
286         return res == seqan::ArgumentParser::PARSE_ERROR;
287 
288     // Open input file.
289     seqan::SeqFileIn seqFile;
290     if (!open(seqFile, toCString(options.inFilename)))
291     {
292         std::cerr << "ERROR: Could not open file " << options.inFilename << " for reading.\n";
293         return 1;
294     }
295 
296     // Read sequences and build result.
297     FastqStats stats;
298     seqan::CharString id;
299     seqan::Dna5String seq;
300     seqan::CharString quals;
301     while (!atEnd(seqFile))
302     {
303         readRecord(id, seq, quals, seqFile);
304         if (empty(quals))  // Fill with Q40 if there are no qualities.
305             resize(quals, length(seq), '!' + 40);
306 
307         // Update statistics.
308         stats.registerRead(seq, quals);
309     }
310 
311     // Finalize statistics and write to output.
312     stats.finalizeStats();
313 
314     std::ostream * out = &std::cout;
315     std::ofstream outStream;
316     if (options.outFilename != "-")
317     {
318         outStream.open(toCString(options.outFilename), std::ios::binary | std::ios::out);
319         if (!outStream.good())
320         {
321             std::cerr << "ERROR: Could not open file " << options.outFilename << " for writing.\n";
322             return 1;
323         }
324         out = &outStream;
325     }
326 
327     *out << "#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmedian\tQ3\tIQR\tlW\trW\tA_count\tC_count\tG_count\tT_count\tN_count\n";
328     for (unsigned i = 0; i < stats.maxLength; ++i)
329     {
330         *out << i << "\t"
331              << stats.numBases[i] << "\t"
332              << stats.minScores[i] << "\t"
333              << stats.maxScores[i] << "\t"
334              << stats.sumScores[i] << "\t"
335              << stats.meanScores[i] << "\t"
336              << stats.firstQuartiles[i] << "\t"
337              << stats.medianScores[i] << "\t"
338              << stats.thirdQuartiles[i] << "\t"
339              << stats.interQuartileRanges[i] << "\t"
340              << stats.leftWhiskers[i] << "\t"
341              << stats.rightWhiskers[i] << "\t"
342              << stats.nucleotideCounts[i][0] << "\t"
343              << stats.nucleotideCounts[i][1] << "\t"
344              << stats.nucleotideCounts[i][2] << "\t"
345              << stats.nucleotideCounts[i][3] << "\t"
346              << stats.nucleotideCounts[i][4] << "\n";
347     }
348 
349     return 0;
350 }
351