1 // ==========================================================================
2 // FX Tools
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34
35 #include <map>
36
37 #include <seqan/arg_parse.h>
38 #include <seqan/basic.h>
39 #include <seqan/seq_io.h>
40 #include <seqan/sequence.h>
41
42 // ==========================================================================
43 // Classes
44 // ==========================================================================
45
46 // --------------------------------------------------------------------------
47 // Class AppOptions
48 // --------------------------------------------------------------------------
49
50 // This struct stores the options from the command line.
51 //
52 // You might want to rename this to reflect the name of your app.
53
54 struct AppOptions
55 {
56 // Verbosity level. 0 -- quiet, 1 -- normal, 2 -- verbose, 3 -- very verbose.
57 int verbosity;
58
59 // The input file name is a string.
60 seqan::CharString inFilename;
61
62 // The out file name is an out file.
63 seqan::CharString outFilename;
64
AppOptionsAppOptions65 AppOptions() :
66 verbosity(1)
67 {}
68 };
69
70 // --------------------------------------------------------------------------
71 // Class FastqStats
72 // --------------------------------------------------------------------------
73
74 struct FastqStats
75 {
76 // -----------------------------------------------------------------------
77 // Members with Results
78 // -----------------------------------------------------------------------
79
80 unsigned maxLength;
81
82 // Number of bases in column i.
83 seqan::String<int64_t> numBases;
84 // Smallest score in column i.
85 seqan::String<int32_t> minScores;
86 // Largest score in column i.
87 seqan::String<int32_t> maxScores;
88 // Sum of scores in column i.
89 seqan::String<int64_t> sumScores;
90 // Mean of scores in column i.
91 seqan::String<double> meanScores;
92 // First quartile quality score (Q1) for column i.
93 seqan::String<double> firstQuartiles;
94 // Median quality score for column i.
95 seqan::String<double> medianScores;
96 // Third quartile quality score (Q3) for column i.
97 seqan::String<double> thirdQuartiles;
98 // Inter-quartile range (Q3-Q1) for column i.
99 seqan::String<double> interQuartileRanges;
100 // Left-whisker value for boxplotting for column i.
101 seqan::String<int32_t> leftWhiskers;
102 // Right-whisker value for boxplotting for column i.
103 seqan::String<int32_t> rightWhiskers;
104 // Number of nucleotides A, C, G, T, N for column i.
105 seqan::String<seqan::String<int64_t> > nucleotideCounts;
106
107 // -----------------------------------------------------------------------
108 // Histogram Members
109 // -----------------------------------------------------------------------
110
111 // Quality histogram.
112 seqan::String<std::map<int32_t, int32_t> > qualHistos;
113
114 // -----------------------------------------------------------------------
115 // Constructor
116 // -----------------------------------------------------------------------
117
FastqStatsFastqStats118 FastqStats() : maxLength(0)
119 {}
120
121 // -----------------------------------------------------------------------
122 // Member Functions
123 // -----------------------------------------------------------------------
124
125 // Resize members to read of length.
resizeToReadLengthFastqStats126 void resizeToReadLength(unsigned n)
127 {
128 if (maxLength == n)
129 return;
130 maxLength = n;
131
132 resize(numBases, n, 0);
133 resize(minScores, n, 80);
134 resize(maxScores, n, 0);
135 resize(sumScores, n, 0);
136 resize(meanScores, n, 0);
137 resize(firstQuartiles, n, 0);
138 resize(medianScores, n, 0);
139 resize(thirdQuartiles, n, 0);
140 resize(interQuartileRanges, n, 0);
141 resize(leftWhiskers, n, 0);
142 resize(rightWhiskers, n, 0);
143 resize(nucleotideCounts, n);
144 for (unsigned i = 0; i < n; ++i)
145 resize(nucleotideCounts[i], 5, 0);
146
147 resize(qualHistos, n);
148 }
149
150 // Update histogram and statistics for the given read.
registerReadFastqStats151 void registerRead(seqan::Dna5String const & seq, seqan::CharString const & quals)
152 {
153 resizeToReadLength(length(seq));
154
155 // Update nucleotide counts.
156 for (unsigned i = 0; i < length(seq); ++i)
157 {
158 numBases[i] += 1;
159 nucleotideCounts[i][ordValue(seq[i])] += 1;
160 }
161
162 // Update quality histograms and statistics.
163 for (unsigned i = 0; i < length(quals); ++i)
164 {
165 int qual = quals[i] - '!'; // PHRED scores.
166 if (numBases[i] == 0u || minScores[i] > qual)
167 minScores[i] = qual;
168 if (numBases[i] == 0u || maxScores[i] < qual)
169 maxScores[i] = qual;
170 qualHistos[i][qual] += 1;
171 sumScores[i] += qual;
172 }
173 }
174
175 // Compute statistics after updating for the last read.
finalizeStatsFastqStats176 void finalizeStats()
177 {
178 // Compute score means.
179 for (unsigned i = 0; i < length(meanScores); ++i)
180 meanScores[i] = (1.0 * sumScores[i]) / numBases[i];
181 // Compute score medians and quartiles.
182 for (unsigned i = 0; i < length(qualHistos); ++i)
183 {
184 if (qualHistos[i].size() == 0u)
185 continue; // Skip if empty.
186
187 // TODO(holtgrew): Quartile/median computation not mathematically correct yet.
188 unsigned n = numBases[i]; // Number of bases.
189 unsigned firstQN = n / 4;
190 unsigned medianN = n / 2;
191 unsigned thirdQN = (3 * n) / 4;
192 unsigned count = 0; // Number of bases up to here.
193 for (std::map<int32_t, int32_t>::const_iterator it = qualHistos[i].begin(); it != qualHistos[i].end(); ++it)
194 {
195 if (count < firstQN && count + it->second >= firstQN)
196 firstQuartiles[i] = it->first;
197 if (count < medianN && count + it->second >= medianN)
198 medianScores[i] = it->first;
199 if (count < thirdQN && count + it->second >= thirdQN)
200 thirdQuartiles[i] = it->first;
201
202 count += it->second;
203 }
204 interQuartileRanges[i] = (thirdQuartiles[i] - firstQuartiles[i]);
205 }
206
207 // Compute whiskers as the data point that is still within 1.5 IQR of the lower quartile.
208 for (unsigned i = 0; i < length(qualHistos); ++i)
209 {
210 if (qualHistos[i].size() == 0u)
211 continue; // Skip if empty.
212 leftWhiskers[i] = static_cast<int>(firstQuartiles[i]);
213 rightWhiskers[i] = static_cast<int>(thirdQuartiles[i]);
214 double leftWhiskerBound = ((double)firstQuartiles[i]) - 1.5 * interQuartileRanges[i];
215 double rightWhiskerBound = ((double)thirdQuartiles[i]) + 1.5 * interQuartileRanges[i];
216 for (std::map<int32_t, int32_t>::const_iterator it = qualHistos[i].begin(); it != qualHistos[i].end(); ++it)
217 if (leftWhiskers[i] > it->first && it->first >= leftWhiskerBound)
218 leftWhiskers[i] = it->first;
219 for (std::map<int32_t, int32_t>::const_iterator it = qualHistos[i].begin(); it != qualHistos[i].end(); ++it)
220 if (rightWhiskers[i] < it->first && it->first <= rightWhiskerBound)
221 rightWhiskers[i] = it->first;
222 }
223 }
224 };
225
226 // ==========================================================================
227 // Functions
228 // ==========================================================================
229
230 // --------------------------------------------------------------------------
231 // Function parseCommandLine()
232 // --------------------------------------------------------------------------
233
234 seqan::ArgumentParser::ParseResult
parseCommandLine(AppOptions & options,int argc,char const ** argv)235 parseCommandLine(AppOptions & options, int argc, char const ** argv)
236 {
237 // Setup ArgumentParser.
238 seqan::ArgumentParser parser("fx_fastq_stats");
239 // Set short description, version, and date.
240 setShortDescription(parser, "Compute FASTQ statistics.");
241 setCategory(parser, "NGS Quality Control");
242 setVersion(parser, SEQAN_APP_VERSION " [" SEQAN_REVISION "]");
243 setDate(parser, SEQAN_DATE);
244
245 // Define usage line and long description.
246 addUsageLine(parser, "\\fB-i\\fP \\fIINPUT.fq\\fP \\fB-o\\fP \\fIOUTPUT.fq.stats.tsv\\fP");
247 addDescription(parser,
248 "Read a FASTQ file with reads sequences of the same length. Writes out a TSV file with one record for "
249 "each column/position with statistics on the nucleotides and qualities.");
250
251 addSection(parser, "Input / Output");
252 addOption(parser, seqan::ArgParseOption("i", "input", "Input FASTQ file.", seqan::ArgParseOption::INPUT_FILE, "INPUT"));
253 setValidValues(parser, "input", "fastq fq");
254 setRequired(parser, "input");
255 addOption(parser, seqan::ArgParseOption("o", "output", "Output TSV file.", seqan::ArgParseOption::OUTPUT_FILE, "OUTPUT"));
256 setRequired(parser, "output");
257 setValidValues(parser, "output", "fq_stats_tsv");
258
259 // Parse command line.
260 seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv);
261
262 // Only extract options if the program will continue after parseCommandLine()
263 if (res != seqan::ArgumentParser::PARSE_OK)
264 return res;
265
266 seqan::getOptionValue(options.inFilename, parser, "input");
267 seqan::getOptionValue(options.outFilename, parser, "output");
268
269 return seqan::ArgumentParser::PARSE_OK;
270 }
271
272 // --------------------------------------------------------------------------
273 // Function main()
274 // --------------------------------------------------------------------------
275
main(int argc,char const ** argv)276 int main(int argc, char const ** argv)
277 {
278 // Parse the command line.
279 seqan::ArgumentParser parser;
280 AppOptions options;
281 seqan::ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv);
282 // If there was an error parsing or built-in argument parser functionality
283 // was triggered then we exit the program. The return code is 1 if there
284 // were errors and 0 if there were none.
285 if (res != seqan::ArgumentParser::PARSE_OK)
286 return res == seqan::ArgumentParser::PARSE_ERROR;
287
288 // Open input file.
289 seqan::SeqFileIn seqFile;
290 if (!open(seqFile, toCString(options.inFilename)))
291 {
292 std::cerr << "ERROR: Could not open file " << options.inFilename << " for reading.\n";
293 return 1;
294 }
295
296 // Read sequences and build result.
297 FastqStats stats;
298 seqan::CharString id;
299 seqan::Dna5String seq;
300 seqan::CharString quals;
301 while (!atEnd(seqFile))
302 {
303 readRecord(id, seq, quals, seqFile);
304 if (empty(quals)) // Fill with Q40 if there are no qualities.
305 resize(quals, length(seq), '!' + 40);
306
307 // Update statistics.
308 stats.registerRead(seq, quals);
309 }
310
311 // Finalize statistics and write to output.
312 stats.finalizeStats();
313
314 std::ostream * out = &std::cout;
315 std::ofstream outStream;
316 if (options.outFilename != "-")
317 {
318 outStream.open(toCString(options.outFilename), std::ios::binary | std::ios::out);
319 if (!outStream.good())
320 {
321 std::cerr << "ERROR: Could not open file " << options.outFilename << " for writing.\n";
322 return 1;
323 }
324 out = &outStream;
325 }
326
327 *out << "#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmedian\tQ3\tIQR\tlW\trW\tA_count\tC_count\tG_count\tT_count\tN_count\n";
328 for (unsigned i = 0; i < stats.maxLength; ++i)
329 {
330 *out << i << "\t"
331 << stats.numBases[i] << "\t"
332 << stats.minScores[i] << "\t"
333 << stats.maxScores[i] << "\t"
334 << stats.sumScores[i] << "\t"
335 << stats.meanScores[i] << "\t"
336 << stats.firstQuartiles[i] << "\t"
337 << stats.medianScores[i] << "\t"
338 << stats.thirdQuartiles[i] << "\t"
339 << stats.interQuartileRanges[i] << "\t"
340 << stats.leftWhiskers[i] << "\t"
341 << stats.rightWhiskers[i] << "\t"
342 << stats.nucleotideCounts[i][0] << "\t"
343 << stats.nucleotideCounts[i][1] << "\t"
344 << stats.nucleotideCounts[i][2] << "\t"
345 << stats.nucleotideCounts[i][3] << "\t"
346 << stats.nucleotideCounts[i][4] << "\n";
347 }
348
349 return 0;
350 }
351