1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // process_shortreads -- clean raw reads using a sliding window approach;
23 //   split reads by barcode if barcodes provided, correct barcodes
24 //   within one basepair, truncate reads on request.
25 //
26 
27 #include "process_shortreads.h"
28 
29 //
30 // Global variables to hold command-line options.
31 //
32 FileT  in_file_type  = FileT::unknown;
33 FileT  out_file_type = FileT::unknown;
34 string in_file;
35 string in_file_p1;
36 string in_file_p2;
37 string in_path_1;
38 string in_path_2;
39 string out_path;
40 string barcode_file;
41 char  *adapter_1;
42 char  *adapter_2;
43 barcodet barcode_type    = null_null;
44 bool     retain_header   = false;
45 bool     filter_adapter  = false;
46 bool     paired          = false;
47 bool     clean           = false;
48 bool     quality         = false;
49 bool     recover         = false;
50 bool     interleaved     = false;
51 bool     merge           = false;
52 bool     discards        = false;
53 bool     overhang        = true;
54 bool     matepair        = false;
55 bool     filter_illumina = false;
56 bool     trim_reads      = true;
57 uint     truncate_seq    = 0;
58 int      barcode_dist_1  = 1;
59 int      barcode_dist_2  = -1;
60 double   win_size        = 0.15;
61 int      score_limit     = 10;
62 int      len_limit       = 31;
63 int      num_threads     = 1;
64 
65 //
66 // How to shift FASTQ-encoded quality scores from ASCII down to raw scores
67 //     score = encoded letter - 64; Illumina version 1.3 - 1.5
68 //     score = encoded letter - 33; Sanger / Illumina version 1.6+
69 int qual_offset  = 33;
70 
71 //
72 // Handle variable-size barcodes.
73 //
74 uint min_bc_size_1 = 0;
75 uint max_bc_size_1 = 0;
76 uint min_bc_size_2 = 0;
77 uint max_bc_size_2 = 0;
78 
79 //
80 // Kmer data for adapter filtering.
81 //
82 int kmer_size = 5;
83 int distance  = 1;
84 int adp_1_len = 0;
85 int adp_2_len = 0;
86 AdapterHash adp_1_kmers, adp_2_kmers;
87 
main(int argc,char * argv[])88 int main (int argc, char* argv[]) {
89     IF_NDEBUG_TRY
90 
91     parse_command_line(argc, argv);
92 
93     //
94     // If input files are gzipped, output gziped files, unless the user chooses an output type.
95     //
96     if (out_file_type == FileT::unknown) {
97         if (in_file_type == FileT::gzfastq || in_file_type == FileT::bam)
98             out_file_type = FileT::gzfastq;
99         else
100             out_file_type = FileT::fastq;
101     }
102 
103     cerr << "Using Phred+" << qual_offset << " encoding for quality scores.\n"
104          << "Reads trimmed shorter than " << len_limit << " nucleotides will be discarded.\n";
105     if (truncate_seq > 0)
106         cerr << "Reads will be truncated to " << truncate_seq << "bp\n";
107     if (filter_illumina)
108         cerr << "Discarding reads marked as 'failed' by Illumina's chastity/purity filters.\n";
109     if (filter_adapter) {
110         cerr << "Filtering reads for adapter sequence:\n";
111         if (adapter_1 != NULL) {
112             cerr << "  " << adapter_1 << "\n";
113             init_adapter_seq(kmer_size, adapter_1, adp_1_len, adp_1_kmers);
114         }
115         if (adapter_2 != NULL) {
116             cerr << "  " << adapter_2 << "\n";
117             init_adapter_seq(kmer_size, adapter_2, adp_2_len, adp_2_kmers);
118         }
119         cerr << "    " << distance << " mismatches allowed to adapter sequence.\n";
120     }
121 
122     vector<pair<string, string> >        files;
123     vector<BarcodePair>                  barcodes;
124     set<string>                          se_bc, pe_bc;
125     map<BarcodePair, ofstream *>         pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs;
126     map<BarcodePair, gzFile *>           pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs;
127     map<string, map<string, long> >      counters;
128     map<BarcodePair, map<string, long> > barcode_log;
129 
130     build_file_list(files);
131     load_barcodes(barcode_file, barcodes, se_bc, pe_bc, min_bc_size_1, max_bc_size_1, min_bc_size_2, max_bc_size_2);
132     if (recover && barcode_type != null_null) {
133         if (barcode_type == index_null || barcode_type == inline_null)
134             cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " mismatches.\n";
135         else
136             cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " / " << barcode_dist_2 << " mismatches.\n";
137     }
138 
139     if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
140         open_files(files, barcodes, pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs, counters);
141     else
142         open_files(files, barcodes, pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs, counters);
143 
144     int result = 1;
145 
146     for (uint i = 0; i < files.size(); i++) {
147         cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].first.c_str() << "]\n";
148 
149         counters[files[i].first]["total"]        = 0;
150         counters[files[i].first]["ill_filtered"] = 0;
151         counters[files[i].first]["low_quality"]  = 0;
152         counters[files[i].first]["trimmed"]      = 0;
153         counters[files[i].first]["adapter"]      = 0;
154         counters[files[i].first]["ambiguous"]    = 0;
155         counters[files[i].first]["retained"]     = 0;
156         counters[files[i].first]["orphaned"]     = 0;
157         counters[files[i].first]["recovered"]    = 0;
158 
159         if (paired) {
160             if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
161                     result = process_paired_reads(files[i].first, files[i].second,
162                                               se_bc, pe_bc,
163                                               pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs,
164                                               counters[files[i].first], barcode_log);
165             else
166                 result = process_paired_reads(files[i].first, files[i].second,
167                                               se_bc, pe_bc,
168                                               pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs,
169                                               counters[files[i].first], barcode_log);
170         } else {
171             if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
172                     result = process_reads(files[i].first,
173                                        se_bc, pe_bc,
174                                        pair_1_gzfhs,
175                                        counters[files[i].first], barcode_log);
176             else
177                 result = process_reads(files[i].first,
178                                        se_bc, pe_bc,
179                                        pair_1_fhs,
180                                        counters[files[i].first], barcode_log);
181         }
182         cerr <<        "  "
183              << counters[files[i].first]["total"] << " total reads; ";
184         if (filter_illumina)
185             cerr << "-" << counters[files[i].first]["ill_filtered"] << " failed Illumina reads; ";
186         cerr
187              << "-" << counters[files[i].first]["ambiguous"]   << " ambiguous barcodes; "
188              << "+" << counters[files[i].first]["recovered"]   << " recovered; "
189              << "-" << counters[files[i].first]["low_quality"] << " low quality reads; "
190              << counters[files[i].first]["retained"] << " retained reads.\n"
191              << "    ";
192         if (filter_adapter)
193             cerr << counters[files[i].first]["adapter"] << " reads with adapter sequence; ";
194         cerr << counters[files[i].first]["trimmed"]     << " trimmed reads; "
195              << counters[files[i].first]["orphaned"]    << " orphaned paired-ends.\n";
196 
197         if (!result) {
198             cerr << "Error processing reads.\n";
199             break;
200         }
201     }
202 
203     cerr << "Closing files, flushing buffers...\n";
204     if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
205         close_file_handles(pair_1_gzfhs);
206         if (paired) {
207             close_file_handles(rem_1_gzfhs);
208             close_file_handles(rem_2_gzfhs);
209             close_file_handles(pair_2_gzfhs);
210         }
211     } else {
212         close_file_handles(pair_1_fhs);
213         if (paired) {
214             close_file_handles(rem_1_fhs);
215             close_file_handles(rem_2_fhs);
216             close_file_handles(pair_2_fhs);
217         }
218     }
219 
220     print_results(argc, argv, barcodes, counters, barcode_log);
221 
222     return 0;
223     IF_NDEBUG_CATCH_ALL_EXCEPTIONS
224 }
225 
226 template<typename fhType>
227 int
process_paired_reads(string prefix_1,string prefix_2,set<string> & se_bc,set<string> & pe_bc,map<BarcodePair,fhType * > & pair_1_fhs,map<BarcodePair,fhType * > & pair_2_fhs,map<BarcodePair,fhType * > & rem_1_fhs,map<BarcodePair,fhType * > & rem_2_fhs,map<string,long> & counter,map<BarcodePair,map<string,long>> & barcode_log)228 process_paired_reads(string prefix_1,
229                      string prefix_2,
230                      set<string> &se_bc, set<string> &pe_bc,
231                      map<BarcodePair, fhType *> &pair_1_fhs,
232                      map<BarcodePair, fhType *> &pair_2_fhs,
233                      map<BarcodePair, fhType *> &rem_1_fhs,
234                      map<BarcodePair, fhType *> &rem_2_fhs,
235                      map<string, long> &counter,
236                      map<BarcodePair, map<string, long> > &barcode_log) {
237     Input *fh_1=NULL, *fh_2=NULL;
238     RawRead  *r_1=NULL, *r_2=NULL;
239     ofstream *discard_fh_1=NULL, *discard_fh_2=NULL;
240 
241     int return_val = 1;
242 
243     string path_1 = in_path_1 + prefix_1;
244     string path_2 = in_path_2 + prefix_2;
245 
246     if (interleaved)
247         cerr << "  Reading data from:\n  " << path_1 << "\n";
248     else
249         cerr << "  Reading data from:\n  " << path_1 << " and\n  " << path_2 << "\n";
250 
251     if (in_file_type == FileT::fastq) {
252         fh_1 = new Fastq(path_1.c_str());
253         fh_2 = interleaved ? fh_1 : new Fastq(path_2.c_str());
254     } else if (in_file_type == FileT::gzfastq) {
255         fh_1 = new GzFastq(path_1.c_str());
256         fh_2 = interleaved ? fh_1 : new GzFastq(path_2.c_str());
257     } else if (in_file_type == FileT::bam) {
258         fh_1 = new BamUnAln(path_1.c_str());
259         fh_2 = fh_1;
260     } else if (in_file_type == FileT::bustard) {
261         fh_1 = new Bustard(path_1.c_str());
262         fh_2 = interleaved ? fh_1 : new Bustard(path_2.c_str());
263     }
264 
265     //
266     // Open a file for recording discarded reads
267     //
268     if (discards) {
269         path_1 = out_path + prefix_1 + ".discards";
270         discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
271 
272         if (discard_fh_1->fail()) {
273             cerr << "Error opening discard output file '" << path_1 << "'\n";
274             exit(1);
275         }
276 
277         path_2 = out_path + prefix_2 + ".discards";
278         discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
279 
280         if (discard_fh_1->fail()) {
281             cerr << "Error opening discard output file '" << path_2 << "'\n";
282             exit(1);
283         }
284     }
285 
286     //
287     // Read in the first record, initializing the Seq object s. Then
288     // initialize the Read object r, then loop, using the same objects.
289     //
290     Seq *s_1 = fh_1->next_seq();
291     Seq *s_2 = fh_2->next_seq();
292     if (s_1 == NULL || s_2 == NULL) {
293         cerr << "Attempting to read first pair of input records, unable to allocate "
294              << "Seq object (Was the correct input type specified?).\n";
295         exit(1);
296     }
297 
298     r_1 = new RawRead(strlen(s_1->seq), 1, min_bc_size_1, win_size);
299     r_2 = new RawRead(strlen(s_2->seq), 2, min_bc_size_2, win_size);
300 
301     BarcodePair bc;
302     //
303     // If no barcodes were specified, set the barcode object to be the input file names.
304     //
305     if (max_bc_size_1 == 0)
306         bc.set(prefix_1, prefix_2);
307 
308     long i = 1;
309 
310     do {
311         if (i % 10000 == 0) cerr << "  Processing short read " << i << "       \r";
312 
313         parse_input_record(s_1, r_1);
314         parse_input_record(s_2, r_2);
315         counter["total"] += 2;
316 
317         if (barcode_type != null_null &&
318             barcode_type != inline_null &&
319             barcode_type != index_null)
320             bc.set(r_1->se_bc, r_2->pe_bc);
321         else if (barcode_type != null_null)
322             bc.set(r_1->se_bc);
323 
324         process_barcode(r_1, r_2, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
325 
326         //
327         // Adjust the size of the read to accommodate truncating the sequence and variable
328         // barcode lengths. With standard Illumina data we want to output constant length
329         // reads even as the barcode size may change. Other technologies, like IonTorrent
330         // need to be truncated uniformly.
331         //
332         if (truncate_seq > 0) {
333             if (truncate_seq + r_1->inline_bc_len <= r_1->len)
334                 r_1->set_len(truncate_seq + r_1->inline_bc_len);
335             if (truncate_seq + r_2->inline_bc_len <= r_2->len)
336                 r_2->set_len(truncate_seq + r_2->inline_bc_len);
337         } else {
338             if (barcode_type == inline_null || barcode_type == inline_inline ||        barcode_type == inline_index)
339                 r_1->set_len(r_1->len - (max_bc_size_1 - r_1->inline_bc_len));
340             if (barcode_type == inline_index ||        barcode_type == index_index)
341                 r_2->set_len(r_2->len - (max_bc_size_2 - r_2->inline_bc_len));
342         }
343 
344         if (r_1->retain)
345             process_singlet(r_1, false, barcode_log[bc], counter);
346         if (r_2->retain)
347             process_singlet(r_2, true,  barcode_log[bc], counter);
348 
349         if (matepair) {
350             rev_complement(r_1->seq, r_1->inline_bc_len, overhang);
351             reverse_qual(r_1->phred, r_1->inline_bc_len, overhang);
352         }
353 
354         int result_1 = 1;
355         int result_2 = 1;
356 
357         if (r_1->retain && r_2->retain) {
358             if (retain_header) {
359                 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
360                     write_fastq(pair_1_fhs[bc], s_1, r_1) :
361                     write_fasta(pair_1_fhs[bc], s_1, r_1);
362                 result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
363                     write_fastq(pair_2_fhs[bc], s_2, r_2) :
364                     write_fasta(pair_2_fhs[bc], s_2, r_2);
365             } else {
366                 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
367                     write_fastq(pair_1_fhs[bc], r_1, overhang) :
368                     write_fasta(pair_1_fhs[bc], r_1, overhang);
369                 result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
370                     write_fastq(pair_2_fhs[bc], r_2, overhang) :
371                     write_fasta(pair_2_fhs[bc], r_2, overhang);
372             }
373 
374         } else if (r_1->retain && !r_2->retain) {
375             //
376             // Write to a remainder file.
377             //
378             if (retain_header)
379                 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
380                     write_fastq(rem_1_fhs[bc], s_1, r_1) :
381                     write_fasta(rem_1_fhs[bc], s_1, r_1);
382             else
383                 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
384                     write_fastq(rem_1_fhs[bc], r_1, overhang) :
385                     write_fasta(rem_1_fhs[bc], r_1, overhang);
386 
387         } else if (!r_1->retain && r_2->retain) {
388             // Write to a remainder file.
389             if (retain_header)
390                 result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
391                     write_fastq(rem_2_fhs[bc], s_2, r_2) :
392                     write_fasta(rem_2_fhs[bc], s_2, r_2);
393             else
394                 result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
395                     write_fastq(rem_2_fhs[bc], r_2, overhang) :
396                     write_fasta(rem_2_fhs[bc], r_2, overhang);
397         }
398 
399         if (!result_1 || !result_2) {
400             cerr << "Error writing to output file for '" << bc.str() << "'\n";
401             return_val = -1;
402             break;
403         }
404 
405         if (discards && !r_1->retain)
406             result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
407                 write_fastq(discard_fh_1, s_1) :
408                 write_fasta(discard_fh_1, s_1);
409         if (discards && !r_2->retain)
410             result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
411                 write_fastq(discard_fh_2, s_2) :
412                 write_fasta(discard_fh_2, s_2);
413 
414         delete s_1;
415         delete s_2;
416 
417         if (!result_1 || !result_2) {
418             cerr << "Error writing to discard file for '" << bc.str() << "'\n";
419             return_val = -1;
420             break;
421         }
422 
423         i++;
424     } while ((s_1 = fh_1->next_seq()) != NULL &&
425              (s_2 = fh_2->next_seq()) != NULL);
426 
427     if (discards) {
428         delete discard_fh_1;
429         delete discard_fh_2;
430     }
431 
432     delete fh_1;
433     if (interleaved == false) delete fh_2;
434 
435     return return_val;
436 }
437 
438 template<typename fhType>
439 int
process_reads(string prefix,set<string> & se_bc,set<string> & pe_bc,map<BarcodePair,fhType * > & pair_1_fhs,map<string,long> & counter,map<BarcodePair,map<string,long>> & barcode_log)440 process_reads(string prefix,
441               set<string> &se_bc, set<string> &pe_bc,
442               map<BarcodePair, fhType *> &pair_1_fhs,
443               map<string, long> &counter,
444               map<BarcodePair, map<string, long> > &barcode_log) {
445     Input *fh=NULL;
446     RawRead  *r;
447     ofstream *discard_fh=NULL;
448 
449     int return_val = 1;
450 
451     string path = in_path_1 + prefix;
452 
453     if (in_file_type == FileT::fastq)
454         fh = new Fastq(path.c_str());
455     else if (in_file_type == FileT::gzfastq)
456         fh = new GzFastq(path.c_str());
457     else if (in_file_type == FileT::bam)
458         fh = new BamUnAln(path.c_str());
459     else if (in_file_type == FileT::bustard)
460         fh = new Bustard(path.c_str());
461 
462     //
463     // Open a file for recording discarded reads
464     //
465     if (discards) {
466         path = path + ".discards";
467         discard_fh = new ofstream(path.c_str(), ifstream::out);
468 
469         if (discard_fh->fail()) {
470             cerr << "Error opening discard output file '" << path << "'\n";
471             exit(1);
472         }
473     }
474 
475     //
476     // Read in the first record, initializing the Seq object s. Then
477     // initialize the Read object r, then loop, using the same objects.
478     //
479     Seq *s = fh->next_seq();
480     if (s == NULL) {
481         cerr << "Attempting to read first input record, unable to allocate "
482              << "Seq object (Was the correct input type specified?).\n";
483         exit(1);
484     }
485 
486     r = new RawRead(strlen(s->seq), 1, min_bc_size_1, win_size);
487 
488     BarcodePair bc;
489     //
490     // If no barcodes were specified, set the barcode object to be the input file name so
491     // that reads are written to an output file of the same name as the input file.
492     //
493     if (max_bc_size_1 == 0)
494         bc.set(prefix);
495 
496     //cerr << "Length: " << r->len << "; Window length: " << r->win_len << "; Stop position: " << r->stop_pos << "\n";
497 
498     long i = 1;
499 
500     do {
501         if (i % 10000 == 0) cerr << "  Processing short read " << i << "       \r";
502         counter["total"]++;
503 
504         parse_input_record(s, r);
505 
506         if (barcode_type == inline_null ||
507             barcode_type == index_null)
508             bc.set(r->se_bc);
509         else if (barcode_type == index_inline ||
510                  barcode_type == inline_index)
511             bc.set(r->se_bc, r->pe_bc);
512 
513         process_barcode(r, NULL, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
514 
515         //
516         // Adjust the size of the read to accommodate truncating the sequence and variable
517         // barcode lengths. With standard Illumina data we want to output constant length
518         // reads even as the barcode size may change. Other technologies, like IonTorrent
519         // need to be truncated uniformly.
520         //
521         if (truncate_seq > 0) {
522             if (truncate_seq + r->inline_bc_len <= r->len)
523                 r->set_len(truncate_seq + r->inline_bc_len);
524         } else {
525             if (barcode_type == inline_null || barcode_type == inline_inline ||        barcode_type == inline_index)
526                 r->set_len(r->len - (max_bc_size_1 - r->inline_bc_len));
527         }
528 
529         if (r->retain)
530             process_singlet(r, false, barcode_log[bc], counter);
531 
532         int result = 1;
533 
534         if (r->retain) {
535             if (retain_header)
536                 result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
537                     write_fastq(pair_1_fhs[bc], s, r) :
538                     write_fasta(pair_1_fhs[bc], s, r);
539             else
540                 result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
541                     write_fastq(pair_1_fhs[bc], r, overhang) :
542                     write_fasta(pair_1_fhs[bc], r, overhang);
543         }
544 
545         if (!result) {
546             cerr << "Error writing to output file for '" << bc.str() << "'\n";
547             return_val = -1;
548             break;
549         }
550 
551         if (discards && !r->retain)
552             result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
553                 write_fastq(discard_fh, s) :
554                 write_fasta(discard_fh, s);
555 
556         if (!result) {
557             cerr << "Error writing to discard file for '" << bc.str() << "'\n";
558             return_val = -1;
559             break;
560         }
561 
562         delete s;
563         i++;
564     } while ((s = fh->next_seq()) != NULL);
565 
566     if (discards) delete discard_fh;
567 
568     //
569     // Close the file and delete the Input object.
570     //
571     delete fh;
572 
573     return return_val;
574 }
575 
576 inline int
process_singlet(RawRead * href,bool paired_end,map<string,long> & bc_log,map<string,long> & counter)577 process_singlet(RawRead *href,
578                 bool paired_end,
579                 map<string, long> &bc_log, map<string, long> &counter)
580 {
581     if (filter_illumina && href->filter) {
582         counter["ill_filtered"]++;
583         href->retain = 0;
584         return 0;
585     }
586 
587     //
588     // Drop this sequence if it has any uncalled nucleotides
589     //
590     if (clean) {
591         for (char *p = href->seq + href->inline_bc_len; *p != '\0'; p++)
592             if (*p == '.' || *p == 'N') {
593                 counter["low_quality"]++;
594                 href->retain = 0;
595                 return 0;
596             }
597     }
598 
599     bool adapter_trim = false;
600     bool quality_trim = false;
601 
602     //
603     // Drop or trim this sequence if it has low quality scores
604     //
605     if (quality) {
606         int res = check_quality_scores(href, qual_offset, score_limit, len_limit, href->inline_bc_len);
607 
608         if (trim_reads) {
609             if (res == 0) {
610                 counter["low_quality"]++;
611                 href->retain = 0;
612                 return 0;
613             } else if (res < 0) {
614                 quality_trim = true;
615             }
616         } else {
617             if (res <= 0) {
618                 counter["low_quality"]++;
619                 href->retain = 0;
620                 return 0;
621             }
622         }
623     }
624 
625     //
626     // Drop or trim this sequence if it contains adapter sequence.
627     //
628     if (filter_adapter) {
629         int res = 1;
630         if (paired_end == true  && adp_2_len > 0)
631             res = filter_adapter_seq(href, adapter_2, adp_2_len, adp_2_kmers,
632                                      kmer_size, distance, len_limit);
633         if (paired_end == false && adp_1_len > 0)
634             res = filter_adapter_seq(href, adapter_1, adp_1_len, adp_1_kmers,
635                                      kmer_size, distance, len_limit);
636         if (res == 0) {
637             counter["adapter"]++;
638             href->retain = 0;
639             return 0;
640 
641         } else if (res < 0) {
642             counter["adapter"]++;
643             adapter_trim = true;
644         }
645     }
646 
647     if (adapter_trim || quality_trim)
648         counter["trimmed"]++;
649 
650     if (barcode_type != null_null)
651         bc_log["retained"]++;
652     counter["retained"]++;
653 
654     return 0;
655 }
656 
dist(const char * res_enz,char * seq)657 int dist(const char *res_enz, char *seq) {
658     const char *p; char *q;
659 
660     int dist = 0;
661 
662     for (p = res_enz, q = seq; *p != '\0'; p++, q++)
663         if (*p != *q) dist++;
664 
665     return dist;
666 }
667 
668 int
print_results(int argc,char ** argv,vector<BarcodePair> & barcodes,map<string,map<string,long>> & counters,map<BarcodePair,map<string,long>> & barcode_log)669 print_results(int argc, char **argv,
670               vector<BarcodePair> &barcodes,
671               map<string, map<string, long> > &counters,
672               map<BarcodePair, map<string, long> > &barcode_log)
673 {
674     map<string, map<string, long> >::iterator it;
675 
676     string log_path = out_path + "process_shortreads.log";
677     ofstream log(log_path.c_str());
678 
679     if (log.fail()) {
680         cerr << "Unable to open log file '" << log_path << "'\n";
681         return 0;
682     }
683 
684     cerr << "Outputing details to log: '" << log_path << "'\n\n";
685 
686     init_log(log, argc, argv);
687 
688     log << "File\t"
689         << "Retained Reads\t";
690     if (filter_illumina)
691         log << "Illumina Filtered\t";
692     if (filter_adapter)
693         log << "Adapter Seq" << "\t";
694     log << "Low Quality\t"
695         << "Ambiguous Barcodes\t"
696         << "Trimmed Reads\t"
697         << "Orphaned paired-end reads\t"
698         << "Total\n";
699 
700     for (it = counters.begin(); it != counters.end(); it++) {
701         log << it->first                 << "\t"
702             << it->second["retained"]    << "\t";
703         if (filter_illumina)
704             log << it->second["ill_filtered"] << "\t";
705         if (filter_adapter)
706             log << it->second["adapter"] << "\t";
707         log << it->second["low_quality"] << "\t"
708             << it->second["ambiguous"]   << "\t"
709             << it->second["trimmed"]    << "\t"
710             << it->second["orphaned"]    << "\t"
711             << it->second["total"]       << "\n";
712     }
713 
714     map<string, long> c;
715     c["total"]        = 0;
716     c["low_quality"]  = 0;
717     c["adapter"]      = 0;
718     c["ill_filtered"] = 0;
719     c["ambiguous"]    = 0;
720     c["trimmed"]      = 0;
721     c["orphaned"]     = 0;
722 
723     //
724     // Total up the individual counters
725     //
726     for (it = counters.begin(); it != counters.end(); it++) {
727         c["total"]        += it->second["total"];
728         c["ill_filtered"] += it->second["ill_filtered"];
729         c["adapter"]      += it->second["adapter"];
730         c["low_quality"]  += it->second["low_quality"];
731         c["ambiguous"]    += it->second["ambiguous"];
732         c["trimmed"]      += it->second["trimmed"];
733         c["orphaned"]     += it->second["orphaned"];
734         c["retained"]     += it->second["retained"];
735     }
736 
737     cerr << c["total"] << " total sequences;\n";
738     if (filter_illumina)
739         cerr << "  " << c["ill_filtered"] << " failed Illumina filtered reads;\n";
740     if (filter_adapter)
741         cerr << "  " << c["adapter"] << " reads contained adapter sequence;\n";
742     cerr << "  " << c["ambiguous"]   << " ambiguous barcode drops;\n"
743          << "  " << c["low_quality"] << " low quality read drops;\n"
744          << "  " << c["trimmed"]     << " trimmed reads;\n"
745          << "  " << c["orphaned"]    << " orphaned paired-end reads;\n"
746          << c["retained"] << " retained reads.\n";
747 
748     log        << "\n"
749         << "Total Sequences\t"      << c["total"]       << "\n";
750     if (filter_illumina)
751         log << "Failed Illumina filtered reads\t" << c["ill_filtered"] << "\n";
752     if (filter_adapter)
753         log << "Reads containing adapter sequence\t" << c["adapter"] << "\n";
754     log
755         << "Ambiguous Barcodes\t"   << c["ambiguous"]   << "\n"
756         << "Low Quality\t"          << c["low_quality"] << "\n"
757         << "Trimmed Reads\t"        << c["trimmed"]     << "\n"
758         << "Orphaned Paired-ends\t" << c["orphaned"]    << "\n"
759         << "Retained Reads\t"       << c["retained"]      << "\n";
760 
761     if (max_bc_size_1 == 0) return 0;
762 
763     //
764     // Where barcode filenames specified?
765     //
766     bool bc_names = false;
767     for (uint i = 0; i < barcodes.size(); i++)
768         if (barcodes[i].name_exists()) {
769             bc_names = true;
770             break;
771         }
772 
773     //
774     // Print out barcode information.
775     //
776     log << "\n"
777         << "Barcode\t";
778     if (bc_names)
779         log << "Filename\t";
780     log << "Total\t"
781         << "Retained\n";
782 
783     set<BarcodePair> barcode_list;
784 
785     for (uint i = 0; i < barcodes.size(); i++) {
786         barcode_list.insert(barcodes[i]);
787 
788         log << barcodes[i] << "\t";
789         if (bc_names)
790             log << barcodes[i].name << "\t";
791         if (barcode_log.count(barcodes[i]) == 0)
792             log << "0\t" << "0\t" << "0\n";
793         else
794             log << barcode_log[barcodes[i]]["total"]    << "\t"
795                 << barcode_log[barcodes[i]]["retained"] << "\n";
796     }
797 
798     log << "\n"
799         << "Sequences not recorded\n"
800         << "Barcode\t"
801         << "Total\n";
802 
803     //
804     // Sort unused barcodes by number of occurances.
805     //
806     map<BarcodePair, map<string, long> >::iterator bit;
807     vector<pair<BarcodePair, int> > bcs;
808     for (bit = barcode_log.begin(); bit != barcode_log.end(); bit++)
809         bcs.push_back(make_pair(bit->first, bit->second["total"]));
810     sort(bcs.begin(), bcs.end(), compare_barcodes);
811 
812     for (uint i = 0; i < bcs.size(); i++) {
813         if (barcode_list.count(bcs[i].first)) continue;
814         if (bcs[i].second == 0) continue;
815 
816         log << bcs[i].first << "\t"
817             << bcs[i].second << "\n";
818     }
819 
820     log.close();
821 
822     return 0;
823 }
824 
compare_barcodes(pair<BarcodePair,int> a,pair<BarcodePair,int> b)825 int  compare_barcodes(pair<BarcodePair, int> a, pair<BarcodePair, int> b) {
826     return a.second > b.second;
827 }
828 
parse_command_line(int argc,char * argv[])829 int parse_command_line(int argc, char* argv[]) {
830     int c;
831 
832     while (1) {
833         static struct option long_options[] = {
834             {"help",                 no_argument, NULL, 'h'},
835             {"version",              no_argument, NULL, 'v'},
836             {"quality",              no_argument, NULL, 'q'},
837             {"clean",                no_argument, NULL, 'c'},
838             {"recover",              no_argument, NULL, 'r'},
839             {"discards",             no_argument, NULL, 'D'},
840             {"paired",               no_argument, NULL, 'P'},
841             {"interleaved",          no_argument, NULL, 'I'},
842             {"merge",                no_argument, NULL, 'm'},
843             {"mate-pair",            no_argument, NULL, 'M'},
844             {"no-overhang",          no_argument, NULL, 'O'}, {"no_overhang",          no_argument, NULL, 'O'},
845             {"filter-illumina",      no_argument, NULL, 'F'}, {"filter_illumina",      no_argument, NULL, 'F'},
846             {"retain-header",        no_argument, NULL, 'H'}, {"retain_header",        no_argument, NULL, 'H'},
847             {"no-read-trimming",     no_argument, NULL, 'N'}, {"no_read_trimming",     no_argument, NULL, 'N'},
848             {"null-index",           no_argument, NULL, 'U'}, {"null_index",           no_argument, NULL, 'U'},
849             {"index-null",           no_argument, NULL, 'u'}, {"index_null",           no_argument, NULL, 'u'},
850             {"inline-null",          no_argument, NULL, 'V'}, {"inline_null",          no_argument, NULL, 'V'},
851             {"index-index",          no_argument, NULL, 'W'}, {"index_index",          no_argument, NULL, 'W'},
852             {"inline-inline",        no_argument, NULL, 'x'}, {"inline_inline",        no_argument, NULL, 'x'},
853             {"index-inline",         no_argument, NULL, 'Y'}, {"index_inline",         no_argument, NULL, 'Y'},
854             {"inline-index",         no_argument, NULL, 'Z'}, {"inline_index",         no_argument, NULL, 'Z'},
855             {"barcode-dist-1", required_argument, NULL, 'B'}, {"barcode_dist_1", required_argument, NULL, 'B'},
856             {"barcode-dist-2", required_argument, NULL, 'C'}, {"barcode_dist_2", required_argument, NULL, 'C'},
857             {"infile-type",    required_argument, NULL, 'i'}, {"infile_type",    required_argument, NULL, 'i'},
858             {"outfile-type",   required_argument, NULL, 'y'}, {"outfile_type",   required_argument, NULL, 'y'},
859             {"file",           required_argument, NULL, 'f'},
860             {"file-p1",        required_argument, NULL, '1'}, {"file_p1",        required_argument, NULL, '1'},
861             {"file-p2",        required_argument, NULL, '2'}, {"file_p2",        required_argument, NULL, '2'},
862             {"path",           required_argument, NULL, 'p'},
863             {"outpath",        required_argument, NULL, 'o'},
864             {"truncate",       required_argument, NULL, 't'},
865             {"barcodes",       required_argument, NULL, 'b'},
866             {"window-size",    required_argument, NULL, 'w'}, {"window_size",    required_argument, NULL, 'w'},
867             {"score-limit",    required_argument, NULL, 's'}, {"score_limit",    required_argument, NULL, 's'},
868             {"encoding",       required_argument, NULL, 'E'},
869             {"len-limit",      required_argument, NULL, 'L'}, {"len_limit",      required_argument, NULL, 'L'},
870             {"adapter-1",      required_argument, NULL, 'A'}, {"adapter_1",      required_argument, NULL, 'A'},
871             {"adapter-2",      required_argument, NULL, 'G'}, {"adapter_2",      required_argument, NULL, 'G'},
872             {"adapter-mm",     required_argument, NULL, 'T'}, {"adapter_mm",     required_argument, NULL, 'T'},
873             {0, 0, 0, 0}
874         };
875 
876         // getopt_long stores the option index here.
877         int option_index = 0;
878 
879         c = getopt_long(argc, argv, "hHvcqrINFuVWxYZOPmDi:y:f:o:t:B:C:b:1:2:p:s:w:E:L:A:G:T:", long_options, &option_index);
880 
881         // Detect the end of the options.
882         if (c == -1)
883             break;
884 
885         switch (c) {
886         case 'h':
887             help();
888             break;
889         case 'i':
890             if (strcasecmp(optarg, "bustard") == 0)
891                 in_file_type = FileT::bustard;
892             else if (strcasecmp(optarg, "bam") == 0)
893                 in_file_type = FileT::bam;
894             else if (strcasecmp(optarg, "gzfastq") == 0)
895                 in_file_type = FileT::gzfastq;
896             else
897                 in_file_type = FileT::fastq;
898             break;
899         case 'y':
900             if (strcasecmp(optarg, "fastq") == 0)
901                 out_file_type = FileT::fastq;
902             else if (strcasecmp(optarg, "gzfastq") == 0)
903                 out_file_type = FileT::gzfastq;
904             else if (strcasecmp(optarg, "fasta") == 0)
905                 out_file_type = FileT::fasta;
906             else if (strcasecmp(optarg, "gzfasta") == 0)
907                 out_file_type = FileT::gzfasta;
908             break;
909         case 'E':
910             if (strcasecmp(optarg, "phred64") == 0)
911                 qual_offset = 64;
912             else if (strcasecmp(optarg, "phred33") == 0)
913                 qual_offset = 33;
914             break;
915         case 'f':
916             in_file = optarg;
917             break;
918         case 'p':
919             in_path_1 = optarg;
920             in_path_2 = in_path_1;
921             break;
922         case '1':
923             paired     = true;
924             in_file_p1 = optarg;
925             break;
926         case '2':
927             paired     = true;
928             in_file_p2 = optarg;
929             break;
930         case 'P':
931             paired = true;
932             break;
933         case 'I':
934             interleaved = true;
935             break;
936         case 'B':
937             barcode_dist_1 = is_integer(optarg);
938             break;
939         case 'C':
940             barcode_dist_2 = is_integer(optarg);
941             break;
942         case 'o':
943             out_path = optarg;
944             break;
945         case 'm':
946             merge = true;
947             break;
948         case 'M':
949             matepair = true;
950             break;
951         case 'D':
952             discards = true;
953             break;
954         case 'q':
955             quality = true;
956             break;
957         case 'c':
958             clean = true;
959             break;
960         case 'r':
961             recover = true;
962             break;
963         case 'O':
964             overhang = false;
965             break;
966         case 'F':
967             filter_illumina = true;
968             break;
969         case 'H':
970             retain_header = true;
971             break;
972         case 'N':
973             trim_reads = false;
974             break;
975         case 't':
976             truncate_seq = is_integer(optarg);
977             break;
978         case 'b':
979             barcode_file = optarg;
980             if (barcode_type == null_null)
981                 barcode_type = inline_null;
982             break;
983         case 'U':
984             barcode_type = null_index;
985             break;
986         case 'u':
987             barcode_type = index_null;
988             break;
989         case 'V':
990             barcode_type = inline_null;
991             break;
992         case 'W':
993             barcode_type = index_index;
994             break;
995         case 'x':
996             barcode_type = inline_inline;
997             break;
998         case 'Y':
999             barcode_type = index_inline;
1000             break;
1001         case 'Z':
1002             barcode_type = inline_index;
1003             break;
1004         case 'A':
1005             adapter_1 = new char[strlen(optarg) + 1];
1006             strcpy(adapter_1, optarg);
1007             filter_adapter = true;
1008             break;
1009         case 'G':
1010             adapter_2 = new char[strlen(optarg) + 1];
1011             strcpy(adapter_2, optarg);
1012             filter_adapter = true;
1013             break;
1014         case 'T':
1015             distance = is_integer(optarg);
1016             break;
1017         case 'L':
1018             len_limit = is_integer(optarg);
1019             break;
1020         case 'w':
1021             win_size = is_double(optarg);
1022             break;
1023         case 's':
1024             score_limit = is_integer(optarg);
1025             break;
1026         case 'v':
1027             version();
1028             break;
1029         case '?':
1030             // getopt_long already printed an error message.
1031             help();
1032             break;
1033 
1034         default:
1035             cerr << "Unknown command line option '" << (char) c << "'\n";
1036             help();
1037             exit(1);
1038         }
1039     }
1040 
1041     if (optind < argc) {
1042         cerr << "Error: Failed to parse command line: '" << argv[optind] << "' is seen as a positional argument. Expected no positional arguments.\n";
1043         help();
1044     }
1045 
1046     if (in_file.length() == 0 && in_path_1.length() == 0 && in_file_p1.length() == 0) {
1047         cerr << "You must specify an input file of a directory path to a set of input files.\n";
1048         help();
1049     }
1050 
1051     if (in_file.length() > 0 && in_path_1.length() > 0) {
1052         cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
1053         help();
1054     }
1055 
1056     if (in_file.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
1057         cerr << "You must specify either a single input file (-f) or a set of paired files (-1, -2), not both.\n";
1058         help();
1059     }
1060 
1061     if (in_path_1.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
1062         cerr << "You must specify either a file path (-p) or a set of paired files (-1, -2), not both.\n";
1063         help();
1064     }
1065 
1066     if (in_path_1.length() > 0 && in_path_1.at(in_path_1.length() - 1) != '/')
1067         in_path_1 += "/";
1068 
1069     if (in_path_2.length() > 0 && in_path_2.at(in_path_2.length() - 1) != '/')
1070         in_path_2 += "/";
1071 
1072     if (out_path.length() == 0)
1073         out_path = ".";
1074 
1075     if (out_path.at(out_path.length() - 1) != '/')
1076         out_path += "/";
1077 
1078     if (barcode_file.length() == 0) {
1079         overhang = false;
1080         cerr << "No barcodes specified, files will not be demultiplexed.\n";
1081     }
1082 
1083     if (barcode_file.length() > 0 && merge) {
1084         cerr << "You may specify a set of barcodes, or that all files should be merged, not both.\n";
1085         help();
1086     }
1087 
1088     if (in_file_type == FileT::unknown)
1089         in_file_type = FileT::gzfastq;
1090 
1091     if (in_file_type == FileT::bam && paired == true && interleaved == false) {
1092         cerr << "You may only specify a BAM input file for paired-end data if the read pairs are interleaved.\n";
1093         help();
1094     }
1095 
1096     if (in_file_type == FileT::bam && (barcode_type != inline_null && barcode_type != inline_inline && barcode_type != null_null)) {
1097         cerr << "For BAM input files only inline or unbarcoded data can be processed.\n";
1098         help();
1099     }
1100 
1101     if (score_limit < 0 || score_limit > 40) {
1102         cerr << "Score limit must be between 0 and 40.\n";
1103         help();
1104     }
1105 
1106     if (win_size < 0 || win_size >= 1) {
1107         cerr << "Window size is a fraction between 0 and 1.\n";
1108         help();
1109     }
1110 
1111     if (recover && barcode_type != null_null) {
1112         if (barcode_type != index_null && barcode_type != inline_null && barcode_dist_2 < 0)
1113             barcode_dist_2 = barcode_dist_1;
1114     }
1115 
1116     return 0;
1117 }
1118 
version()1119 void version() {
1120     cerr << "process_shortreads " << VERSION << "\n\n";
1121 
1122     exit(1);
1123 }
1124 
help()1125 void help() {
1126     cerr << "process_shortreads " << VERSION << "\n"
1127               << "process_shortreads [-f in_file | -p in_dir [-P] [-I] | -1 pair_1 -2 pair_2] -b barcode_file -o out_dir [-i type] [-y type] [-c] [-q] [-r] [-E encoding] [-t len] [-D] [-w size] [-s lim] [-h]\n"
1128               << "  f: path to the input file if processing single-end seqeunces.\n"
1129               << "  i: input file type, either 'bustard' for the Illumina BUSTARD format, 'bam', 'fastq' (default), or 'gzfastq' for gzipped FASTQ.\n"
1130               << "  p: path to a directory of single-end Illumina files.\n"
1131               << "  1: first input file in a set of paired-end sequences.\n"
1132               << "  2: second input file in a set of paired-end sequences.\n"
1133               << "  P: specify that input is paired (for use with '-p').\n"
1134               << "  I: specify that the paired-end reads are interleaved in single files.\n"
1135               << "  o: path to output the processed files.\n"
1136               << "  y: output type, either 'fastq' or 'fasta' (default gzfastq).\n"
1137               << "  b: a list of barcodes for this run.\n"
1138               << "  c: clean data, remove any read with an uncalled base.\n"
1139               << "  q: discard reads with low quality scores.\n"
1140               << "  r: rescue barcodes.\n"
1141               << "  t: truncate final read length to this value.\n"
1142               << "  E: specify how quality scores are encoded, 'phred33' (Illumina 1.8+/Sanger, default) or 'phred64' (Illumina 1.3-1.5).\n"
1143               << "  D: capture discarded reads to a file.\n"
1144               << "  w: set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15).\n"
1145               << "  s: set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).\n"
1146               << "  h: display this help messsage.\n\n"
1147               << "  Barcode options:\n"
1148               << "    --inline-null:   barcode is inline with sequence, occurs only on single-end read (default).\n"
1149               << "    --index-null:    barcode is provded in FASTQ header (Illumina i5 or i7 read).\n"
1150               << "    --null-index:    barcode is provded in FASTQ header (Illumina i7 read if both i5 and i7 read are provided).\n"
1151               << "    --inline-inline: barcode is inline with sequence, occurs on single and paired-end read.\n"
1152               << "    --index-index:   barcode is provded in FASTQ header (Illumina i5 and i7 reads).\n"
1153               << "    --inline-index:  barcode is inline with sequence on single-end read and occurs in FASTQ header (from either i5 or i7 read).\n"
1154               << "    --index-inline:  barcode occurs in FASTQ header (Illumina i5 or i7 read) and is inline with single-end sequence (for single-end data) on paired-end read (for paired-end data).\n\n"
1155               << "  Adapter options:\n"
1156               << "    --adapter-1 <sequence>: provide adaptor sequence that may occur on the first read for filtering.\n"
1157               << "    --adapter-2 <sequence>: provide adaptor sequence that may occur on the paired-read for filtering.\n"
1158               << "      --adapter-mm <mismatches>: number of mismatches allowed in the adapter sequence.\n\n"
1159               << "  Output options:\n"
1160               << "    --retain-header: retain unmodified FASTQ headers in the output.\n"
1161               << "    --merge: if no barcodes are specified, merge all input files into a single output file (or single pair of files).\n\n"
1162               << "  Advanced options:\n"
1163               << "    --no-read-trimming: do not trim low quality reads, just discard them.\n"
1164               << "    --len-limit <limit>: when trimming sequences, specify the minimum length a sequence must be to keep it (default 31bp).\n"
1165               << "    --filter-illumina: discard reads that have been marked by Illumina's chastity/purity filter as failing.\n"
1166               << "    --barcode-dist: provide the distace between barcodes to allow for barcode rescue (default 2)\n"
1167               << "    --mate-pair: raw reads are circularized mate-pair data, first read will be reverse complemented.\n"
1168               << "    --no-overhang: data does not contain an overhang nucleotide between barcode and seqeunce.\n";
1169 
1170     exit(1);
1171 }
1172