1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // process_shortreads -- clean raw reads using a sliding window approach;
23 // split reads by barcode if barcodes provided, correct barcodes
24 // within one basepair, truncate reads on request.
25 //
26
27 #include "process_shortreads.h"
28
29 //
30 // Global variables to hold command-line options.
31 //
32 FileT in_file_type = FileT::unknown;
33 FileT out_file_type = FileT::unknown;
34 string in_file;
35 string in_file_p1;
36 string in_file_p2;
37 string in_path_1;
38 string in_path_2;
39 string out_path;
40 string barcode_file;
41 char *adapter_1;
42 char *adapter_2;
43 barcodet barcode_type = null_null;
44 bool retain_header = false;
45 bool filter_adapter = false;
46 bool paired = false;
47 bool clean = false;
48 bool quality = false;
49 bool recover = false;
50 bool interleaved = false;
51 bool merge = false;
52 bool discards = false;
53 bool overhang = true;
54 bool matepair = false;
55 bool filter_illumina = false;
56 bool trim_reads = true;
57 uint truncate_seq = 0;
58 int barcode_dist_1 = 1;
59 int barcode_dist_2 = -1;
60 double win_size = 0.15;
61 int score_limit = 10;
62 int len_limit = 31;
63 int num_threads = 1;
64
65 //
66 // How to shift FASTQ-encoded quality scores from ASCII down to raw scores
67 // score = encoded letter - 64; Illumina version 1.3 - 1.5
68 // score = encoded letter - 33; Sanger / Illumina version 1.6+
69 int qual_offset = 33;
70
71 //
72 // Handle variable-size barcodes.
73 //
74 uint min_bc_size_1 = 0;
75 uint max_bc_size_1 = 0;
76 uint min_bc_size_2 = 0;
77 uint max_bc_size_2 = 0;
78
79 //
80 // Kmer data for adapter filtering.
81 //
82 int kmer_size = 5;
83 int distance = 1;
84 int adp_1_len = 0;
85 int adp_2_len = 0;
86 AdapterHash adp_1_kmers, adp_2_kmers;
87
main(int argc,char * argv[])88 int main (int argc, char* argv[]) {
89 IF_NDEBUG_TRY
90
91 parse_command_line(argc, argv);
92
93 //
94 // If input files are gzipped, output gziped files, unless the user chooses an output type.
95 //
96 if (out_file_type == FileT::unknown) {
97 if (in_file_type == FileT::gzfastq || in_file_type == FileT::bam)
98 out_file_type = FileT::gzfastq;
99 else
100 out_file_type = FileT::fastq;
101 }
102
103 cerr << "Using Phred+" << qual_offset << " encoding for quality scores.\n"
104 << "Reads trimmed shorter than " << len_limit << " nucleotides will be discarded.\n";
105 if (truncate_seq > 0)
106 cerr << "Reads will be truncated to " << truncate_seq << "bp\n";
107 if (filter_illumina)
108 cerr << "Discarding reads marked as 'failed' by Illumina's chastity/purity filters.\n";
109 if (filter_adapter) {
110 cerr << "Filtering reads for adapter sequence:\n";
111 if (adapter_1 != NULL) {
112 cerr << " " << adapter_1 << "\n";
113 init_adapter_seq(kmer_size, adapter_1, adp_1_len, adp_1_kmers);
114 }
115 if (adapter_2 != NULL) {
116 cerr << " " << adapter_2 << "\n";
117 init_adapter_seq(kmer_size, adapter_2, adp_2_len, adp_2_kmers);
118 }
119 cerr << " " << distance << " mismatches allowed to adapter sequence.\n";
120 }
121
122 vector<pair<string, string> > files;
123 vector<BarcodePair> barcodes;
124 set<string> se_bc, pe_bc;
125 map<BarcodePair, ofstream *> pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs;
126 map<BarcodePair, gzFile *> pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs;
127 map<string, map<string, long> > counters;
128 map<BarcodePair, map<string, long> > barcode_log;
129
130 build_file_list(files);
131 load_barcodes(barcode_file, barcodes, se_bc, pe_bc, min_bc_size_1, max_bc_size_1, min_bc_size_2, max_bc_size_2);
132 if (recover && barcode_type != null_null) {
133 if (barcode_type == index_null || barcode_type == inline_null)
134 cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " mismatches.\n";
135 else
136 cerr << "Will attempt to recover barcodes with at most " << barcode_dist_1 << " / " << barcode_dist_2 << " mismatches.\n";
137 }
138
139 if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
140 open_files(files, barcodes, pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs, counters);
141 else
142 open_files(files, barcodes, pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs, counters);
143
144 int result = 1;
145
146 for (uint i = 0; i < files.size(); i++) {
147 cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].first.c_str() << "]\n";
148
149 counters[files[i].first]["total"] = 0;
150 counters[files[i].first]["ill_filtered"] = 0;
151 counters[files[i].first]["low_quality"] = 0;
152 counters[files[i].first]["trimmed"] = 0;
153 counters[files[i].first]["adapter"] = 0;
154 counters[files[i].first]["ambiguous"] = 0;
155 counters[files[i].first]["retained"] = 0;
156 counters[files[i].first]["orphaned"] = 0;
157 counters[files[i].first]["recovered"] = 0;
158
159 if (paired) {
160 if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
161 result = process_paired_reads(files[i].first, files[i].second,
162 se_bc, pe_bc,
163 pair_1_gzfhs, pair_2_gzfhs, rem_1_gzfhs, rem_2_gzfhs,
164 counters[files[i].first], barcode_log);
165 else
166 result = process_paired_reads(files[i].first, files[i].second,
167 se_bc, pe_bc,
168 pair_1_fhs, pair_2_fhs, rem_1_fhs, rem_2_fhs,
169 counters[files[i].first], barcode_log);
170 } else {
171 if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta)
172 result = process_reads(files[i].first,
173 se_bc, pe_bc,
174 pair_1_gzfhs,
175 counters[files[i].first], barcode_log);
176 else
177 result = process_reads(files[i].first,
178 se_bc, pe_bc,
179 pair_1_fhs,
180 counters[files[i].first], barcode_log);
181 }
182 cerr << " "
183 << counters[files[i].first]["total"] << " total reads; ";
184 if (filter_illumina)
185 cerr << "-" << counters[files[i].first]["ill_filtered"] << " failed Illumina reads; ";
186 cerr
187 << "-" << counters[files[i].first]["ambiguous"] << " ambiguous barcodes; "
188 << "+" << counters[files[i].first]["recovered"] << " recovered; "
189 << "-" << counters[files[i].first]["low_quality"] << " low quality reads; "
190 << counters[files[i].first]["retained"] << " retained reads.\n"
191 << " ";
192 if (filter_adapter)
193 cerr << counters[files[i].first]["adapter"] << " reads with adapter sequence; ";
194 cerr << counters[files[i].first]["trimmed"] << " trimmed reads; "
195 << counters[files[i].first]["orphaned"] << " orphaned paired-ends.\n";
196
197 if (!result) {
198 cerr << "Error processing reads.\n";
199 break;
200 }
201 }
202
203 cerr << "Closing files, flushing buffers...\n";
204 if (out_file_type == FileT::gzfastq || out_file_type == FileT::gzfasta) {
205 close_file_handles(pair_1_gzfhs);
206 if (paired) {
207 close_file_handles(rem_1_gzfhs);
208 close_file_handles(rem_2_gzfhs);
209 close_file_handles(pair_2_gzfhs);
210 }
211 } else {
212 close_file_handles(pair_1_fhs);
213 if (paired) {
214 close_file_handles(rem_1_fhs);
215 close_file_handles(rem_2_fhs);
216 close_file_handles(pair_2_fhs);
217 }
218 }
219
220 print_results(argc, argv, barcodes, counters, barcode_log);
221
222 return 0;
223 IF_NDEBUG_CATCH_ALL_EXCEPTIONS
224 }
225
226 template<typename fhType>
227 int
process_paired_reads(string prefix_1,string prefix_2,set<string> & se_bc,set<string> & pe_bc,map<BarcodePair,fhType * > & pair_1_fhs,map<BarcodePair,fhType * > & pair_2_fhs,map<BarcodePair,fhType * > & rem_1_fhs,map<BarcodePair,fhType * > & rem_2_fhs,map<string,long> & counter,map<BarcodePair,map<string,long>> & barcode_log)228 process_paired_reads(string prefix_1,
229 string prefix_2,
230 set<string> &se_bc, set<string> &pe_bc,
231 map<BarcodePair, fhType *> &pair_1_fhs,
232 map<BarcodePair, fhType *> &pair_2_fhs,
233 map<BarcodePair, fhType *> &rem_1_fhs,
234 map<BarcodePair, fhType *> &rem_2_fhs,
235 map<string, long> &counter,
236 map<BarcodePair, map<string, long> > &barcode_log) {
237 Input *fh_1=NULL, *fh_2=NULL;
238 RawRead *r_1=NULL, *r_2=NULL;
239 ofstream *discard_fh_1=NULL, *discard_fh_2=NULL;
240
241 int return_val = 1;
242
243 string path_1 = in_path_1 + prefix_1;
244 string path_2 = in_path_2 + prefix_2;
245
246 if (interleaved)
247 cerr << " Reading data from:\n " << path_1 << "\n";
248 else
249 cerr << " Reading data from:\n " << path_1 << " and\n " << path_2 << "\n";
250
251 if (in_file_type == FileT::fastq) {
252 fh_1 = new Fastq(path_1.c_str());
253 fh_2 = interleaved ? fh_1 : new Fastq(path_2.c_str());
254 } else if (in_file_type == FileT::gzfastq) {
255 fh_1 = new GzFastq(path_1.c_str());
256 fh_2 = interleaved ? fh_1 : new GzFastq(path_2.c_str());
257 } else if (in_file_type == FileT::bam) {
258 fh_1 = new BamUnAln(path_1.c_str());
259 fh_2 = fh_1;
260 } else if (in_file_type == FileT::bustard) {
261 fh_1 = new Bustard(path_1.c_str());
262 fh_2 = interleaved ? fh_1 : new Bustard(path_2.c_str());
263 }
264
265 //
266 // Open a file for recording discarded reads
267 //
268 if (discards) {
269 path_1 = out_path + prefix_1 + ".discards";
270 discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
271
272 if (discard_fh_1->fail()) {
273 cerr << "Error opening discard output file '" << path_1 << "'\n";
274 exit(1);
275 }
276
277 path_2 = out_path + prefix_2 + ".discards";
278 discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
279
280 if (discard_fh_1->fail()) {
281 cerr << "Error opening discard output file '" << path_2 << "'\n";
282 exit(1);
283 }
284 }
285
286 //
287 // Read in the first record, initializing the Seq object s. Then
288 // initialize the Read object r, then loop, using the same objects.
289 //
290 Seq *s_1 = fh_1->next_seq();
291 Seq *s_2 = fh_2->next_seq();
292 if (s_1 == NULL || s_2 == NULL) {
293 cerr << "Attempting to read first pair of input records, unable to allocate "
294 << "Seq object (Was the correct input type specified?).\n";
295 exit(1);
296 }
297
298 r_1 = new RawRead(strlen(s_1->seq), 1, min_bc_size_1, win_size);
299 r_2 = new RawRead(strlen(s_2->seq), 2, min_bc_size_2, win_size);
300
301 BarcodePair bc;
302 //
303 // If no barcodes were specified, set the barcode object to be the input file names.
304 //
305 if (max_bc_size_1 == 0)
306 bc.set(prefix_1, prefix_2);
307
308 long i = 1;
309
310 do {
311 if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
312
313 parse_input_record(s_1, r_1);
314 parse_input_record(s_2, r_2);
315 counter["total"] += 2;
316
317 if (barcode_type != null_null &&
318 barcode_type != inline_null &&
319 barcode_type != index_null)
320 bc.set(r_1->se_bc, r_2->pe_bc);
321 else if (barcode_type != null_null)
322 bc.set(r_1->se_bc);
323
324 process_barcode(r_1, r_2, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
325
326 //
327 // Adjust the size of the read to accommodate truncating the sequence and variable
328 // barcode lengths. With standard Illumina data we want to output constant length
329 // reads even as the barcode size may change. Other technologies, like IonTorrent
330 // need to be truncated uniformly.
331 //
332 if (truncate_seq > 0) {
333 if (truncate_seq + r_1->inline_bc_len <= r_1->len)
334 r_1->set_len(truncate_seq + r_1->inline_bc_len);
335 if (truncate_seq + r_2->inline_bc_len <= r_2->len)
336 r_2->set_len(truncate_seq + r_2->inline_bc_len);
337 } else {
338 if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
339 r_1->set_len(r_1->len - (max_bc_size_1 - r_1->inline_bc_len));
340 if (barcode_type == inline_index || barcode_type == index_index)
341 r_2->set_len(r_2->len - (max_bc_size_2 - r_2->inline_bc_len));
342 }
343
344 if (r_1->retain)
345 process_singlet(r_1, false, barcode_log[bc], counter);
346 if (r_2->retain)
347 process_singlet(r_2, true, barcode_log[bc], counter);
348
349 if (matepair) {
350 rev_complement(r_1->seq, r_1->inline_bc_len, overhang);
351 reverse_qual(r_1->phred, r_1->inline_bc_len, overhang);
352 }
353
354 int result_1 = 1;
355 int result_2 = 1;
356
357 if (r_1->retain && r_2->retain) {
358 if (retain_header) {
359 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
360 write_fastq(pair_1_fhs[bc], s_1, r_1) :
361 write_fasta(pair_1_fhs[bc], s_1, r_1);
362 result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
363 write_fastq(pair_2_fhs[bc], s_2, r_2) :
364 write_fasta(pair_2_fhs[bc], s_2, r_2);
365 } else {
366 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
367 write_fastq(pair_1_fhs[bc], r_1, overhang) :
368 write_fasta(pair_1_fhs[bc], r_1, overhang);
369 result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
370 write_fastq(pair_2_fhs[bc], r_2, overhang) :
371 write_fasta(pair_2_fhs[bc], r_2, overhang);
372 }
373
374 } else if (r_1->retain && !r_2->retain) {
375 //
376 // Write to a remainder file.
377 //
378 if (retain_header)
379 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
380 write_fastq(rem_1_fhs[bc], s_1, r_1) :
381 write_fasta(rem_1_fhs[bc], s_1, r_1);
382 else
383 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
384 write_fastq(rem_1_fhs[bc], r_1, overhang) :
385 write_fasta(rem_1_fhs[bc], r_1, overhang);
386
387 } else if (!r_1->retain && r_2->retain) {
388 // Write to a remainder file.
389 if (retain_header)
390 result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
391 write_fastq(rem_2_fhs[bc], s_2, r_2) :
392 write_fasta(rem_2_fhs[bc], s_2, r_2);
393 else
394 result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
395 write_fastq(rem_2_fhs[bc], r_2, overhang) :
396 write_fasta(rem_2_fhs[bc], r_2, overhang);
397 }
398
399 if (!result_1 || !result_2) {
400 cerr << "Error writing to output file for '" << bc.str() << "'\n";
401 return_val = -1;
402 break;
403 }
404
405 if (discards && !r_1->retain)
406 result_1 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
407 write_fastq(discard_fh_1, s_1) :
408 write_fasta(discard_fh_1, s_1);
409 if (discards && !r_2->retain)
410 result_2 = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
411 write_fastq(discard_fh_2, s_2) :
412 write_fasta(discard_fh_2, s_2);
413
414 delete s_1;
415 delete s_2;
416
417 if (!result_1 || !result_2) {
418 cerr << "Error writing to discard file for '" << bc.str() << "'\n";
419 return_val = -1;
420 break;
421 }
422
423 i++;
424 } while ((s_1 = fh_1->next_seq()) != NULL &&
425 (s_2 = fh_2->next_seq()) != NULL);
426
427 if (discards) {
428 delete discard_fh_1;
429 delete discard_fh_2;
430 }
431
432 delete fh_1;
433 if (interleaved == false) delete fh_2;
434
435 return return_val;
436 }
437
438 template<typename fhType>
439 int
process_reads(string prefix,set<string> & se_bc,set<string> & pe_bc,map<BarcodePair,fhType * > & pair_1_fhs,map<string,long> & counter,map<BarcodePair,map<string,long>> & barcode_log)440 process_reads(string prefix,
441 set<string> &se_bc, set<string> &pe_bc,
442 map<BarcodePair, fhType *> &pair_1_fhs,
443 map<string, long> &counter,
444 map<BarcodePair, map<string, long> > &barcode_log) {
445 Input *fh=NULL;
446 RawRead *r;
447 ofstream *discard_fh=NULL;
448
449 int return_val = 1;
450
451 string path = in_path_1 + prefix;
452
453 if (in_file_type == FileT::fastq)
454 fh = new Fastq(path.c_str());
455 else if (in_file_type == FileT::gzfastq)
456 fh = new GzFastq(path.c_str());
457 else if (in_file_type == FileT::bam)
458 fh = new BamUnAln(path.c_str());
459 else if (in_file_type == FileT::bustard)
460 fh = new Bustard(path.c_str());
461
462 //
463 // Open a file for recording discarded reads
464 //
465 if (discards) {
466 path = path + ".discards";
467 discard_fh = new ofstream(path.c_str(), ifstream::out);
468
469 if (discard_fh->fail()) {
470 cerr << "Error opening discard output file '" << path << "'\n";
471 exit(1);
472 }
473 }
474
475 //
476 // Read in the first record, initializing the Seq object s. Then
477 // initialize the Read object r, then loop, using the same objects.
478 //
479 Seq *s = fh->next_seq();
480 if (s == NULL) {
481 cerr << "Attempting to read first input record, unable to allocate "
482 << "Seq object (Was the correct input type specified?).\n";
483 exit(1);
484 }
485
486 r = new RawRead(strlen(s->seq), 1, min_bc_size_1, win_size);
487
488 BarcodePair bc;
489 //
490 // If no barcodes were specified, set the barcode object to be the input file name so
491 // that reads are written to an output file of the same name as the input file.
492 //
493 if (max_bc_size_1 == 0)
494 bc.set(prefix);
495
496 //cerr << "Length: " << r->len << "; Window length: " << r->win_len << "; Stop position: " << r->stop_pos << "\n";
497
498 long i = 1;
499
500 do {
501 if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";
502 counter["total"]++;
503
504 parse_input_record(s, r);
505
506 if (barcode_type == inline_null ||
507 barcode_type == index_null)
508 bc.set(r->se_bc);
509 else if (barcode_type == index_inline ||
510 barcode_type == inline_index)
511 bc.set(r->se_bc, r->pe_bc);
512
513 process_barcode(r, NULL, bc, pair_1_fhs, se_bc, pe_bc, barcode_log, counter);
514
515 //
516 // Adjust the size of the read to accommodate truncating the sequence and variable
517 // barcode lengths. With standard Illumina data we want to output constant length
518 // reads even as the barcode size may change. Other technologies, like IonTorrent
519 // need to be truncated uniformly.
520 //
521 if (truncate_seq > 0) {
522 if (truncate_seq + r->inline_bc_len <= r->len)
523 r->set_len(truncate_seq + r->inline_bc_len);
524 } else {
525 if (barcode_type == inline_null || barcode_type == inline_inline || barcode_type == inline_index)
526 r->set_len(r->len - (max_bc_size_1 - r->inline_bc_len));
527 }
528
529 if (r->retain)
530 process_singlet(r, false, barcode_log[bc], counter);
531
532 int result = 1;
533
534 if (r->retain) {
535 if (retain_header)
536 result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
537 write_fastq(pair_1_fhs[bc], s, r) :
538 write_fasta(pair_1_fhs[bc], s, r);
539 else
540 result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
541 write_fastq(pair_1_fhs[bc], r, overhang) :
542 write_fasta(pair_1_fhs[bc], r, overhang);
543 }
544
545 if (!result) {
546 cerr << "Error writing to output file for '" << bc.str() << "'\n";
547 return_val = -1;
548 break;
549 }
550
551 if (discards && !r->retain)
552 result = (out_file_type == FileT::fastq || out_file_type == FileT::gzfastq) ?
553 write_fastq(discard_fh, s) :
554 write_fasta(discard_fh, s);
555
556 if (!result) {
557 cerr << "Error writing to discard file for '" << bc.str() << "'\n";
558 return_val = -1;
559 break;
560 }
561
562 delete s;
563 i++;
564 } while ((s = fh->next_seq()) != NULL);
565
566 if (discards) delete discard_fh;
567
568 //
569 // Close the file and delete the Input object.
570 //
571 delete fh;
572
573 return return_val;
574 }
575
576 inline int
process_singlet(RawRead * href,bool paired_end,map<string,long> & bc_log,map<string,long> & counter)577 process_singlet(RawRead *href,
578 bool paired_end,
579 map<string, long> &bc_log, map<string, long> &counter)
580 {
581 if (filter_illumina && href->filter) {
582 counter["ill_filtered"]++;
583 href->retain = 0;
584 return 0;
585 }
586
587 //
588 // Drop this sequence if it has any uncalled nucleotides
589 //
590 if (clean) {
591 for (char *p = href->seq + href->inline_bc_len; *p != '\0'; p++)
592 if (*p == '.' || *p == 'N') {
593 counter["low_quality"]++;
594 href->retain = 0;
595 return 0;
596 }
597 }
598
599 bool adapter_trim = false;
600 bool quality_trim = false;
601
602 //
603 // Drop or trim this sequence if it has low quality scores
604 //
605 if (quality) {
606 int res = check_quality_scores(href, qual_offset, score_limit, len_limit, href->inline_bc_len);
607
608 if (trim_reads) {
609 if (res == 0) {
610 counter["low_quality"]++;
611 href->retain = 0;
612 return 0;
613 } else if (res < 0) {
614 quality_trim = true;
615 }
616 } else {
617 if (res <= 0) {
618 counter["low_quality"]++;
619 href->retain = 0;
620 return 0;
621 }
622 }
623 }
624
625 //
626 // Drop or trim this sequence if it contains adapter sequence.
627 //
628 if (filter_adapter) {
629 int res = 1;
630 if (paired_end == true && adp_2_len > 0)
631 res = filter_adapter_seq(href, adapter_2, adp_2_len, adp_2_kmers,
632 kmer_size, distance, len_limit);
633 if (paired_end == false && adp_1_len > 0)
634 res = filter_adapter_seq(href, adapter_1, adp_1_len, adp_1_kmers,
635 kmer_size, distance, len_limit);
636 if (res == 0) {
637 counter["adapter"]++;
638 href->retain = 0;
639 return 0;
640
641 } else if (res < 0) {
642 counter["adapter"]++;
643 adapter_trim = true;
644 }
645 }
646
647 if (adapter_trim || quality_trim)
648 counter["trimmed"]++;
649
650 if (barcode_type != null_null)
651 bc_log["retained"]++;
652 counter["retained"]++;
653
654 return 0;
655 }
656
dist(const char * res_enz,char * seq)657 int dist(const char *res_enz, char *seq) {
658 const char *p; char *q;
659
660 int dist = 0;
661
662 for (p = res_enz, q = seq; *p != '\0'; p++, q++)
663 if (*p != *q) dist++;
664
665 return dist;
666 }
667
668 int
print_results(int argc,char ** argv,vector<BarcodePair> & barcodes,map<string,map<string,long>> & counters,map<BarcodePair,map<string,long>> & barcode_log)669 print_results(int argc, char **argv,
670 vector<BarcodePair> &barcodes,
671 map<string, map<string, long> > &counters,
672 map<BarcodePair, map<string, long> > &barcode_log)
673 {
674 map<string, map<string, long> >::iterator it;
675
676 string log_path = out_path + "process_shortreads.log";
677 ofstream log(log_path.c_str());
678
679 if (log.fail()) {
680 cerr << "Unable to open log file '" << log_path << "'\n";
681 return 0;
682 }
683
684 cerr << "Outputing details to log: '" << log_path << "'\n\n";
685
686 init_log(log, argc, argv);
687
688 log << "File\t"
689 << "Retained Reads\t";
690 if (filter_illumina)
691 log << "Illumina Filtered\t";
692 if (filter_adapter)
693 log << "Adapter Seq" << "\t";
694 log << "Low Quality\t"
695 << "Ambiguous Barcodes\t"
696 << "Trimmed Reads\t"
697 << "Orphaned paired-end reads\t"
698 << "Total\n";
699
700 for (it = counters.begin(); it != counters.end(); it++) {
701 log << it->first << "\t"
702 << it->second["retained"] << "\t";
703 if (filter_illumina)
704 log << it->second["ill_filtered"] << "\t";
705 if (filter_adapter)
706 log << it->second["adapter"] << "\t";
707 log << it->second["low_quality"] << "\t"
708 << it->second["ambiguous"] << "\t"
709 << it->second["trimmed"] << "\t"
710 << it->second["orphaned"] << "\t"
711 << it->second["total"] << "\n";
712 }
713
714 map<string, long> c;
715 c["total"] = 0;
716 c["low_quality"] = 0;
717 c["adapter"] = 0;
718 c["ill_filtered"] = 0;
719 c["ambiguous"] = 0;
720 c["trimmed"] = 0;
721 c["orphaned"] = 0;
722
723 //
724 // Total up the individual counters
725 //
726 for (it = counters.begin(); it != counters.end(); it++) {
727 c["total"] += it->second["total"];
728 c["ill_filtered"] += it->second["ill_filtered"];
729 c["adapter"] += it->second["adapter"];
730 c["low_quality"] += it->second["low_quality"];
731 c["ambiguous"] += it->second["ambiguous"];
732 c["trimmed"] += it->second["trimmed"];
733 c["orphaned"] += it->second["orphaned"];
734 c["retained"] += it->second["retained"];
735 }
736
737 cerr << c["total"] << " total sequences;\n";
738 if (filter_illumina)
739 cerr << " " << c["ill_filtered"] << " failed Illumina filtered reads;\n";
740 if (filter_adapter)
741 cerr << " " << c["adapter"] << " reads contained adapter sequence;\n";
742 cerr << " " << c["ambiguous"] << " ambiguous barcode drops;\n"
743 << " " << c["low_quality"] << " low quality read drops;\n"
744 << " " << c["trimmed"] << " trimmed reads;\n"
745 << " " << c["orphaned"] << " orphaned paired-end reads;\n"
746 << c["retained"] << " retained reads.\n";
747
748 log << "\n"
749 << "Total Sequences\t" << c["total"] << "\n";
750 if (filter_illumina)
751 log << "Failed Illumina filtered reads\t" << c["ill_filtered"] << "\n";
752 if (filter_adapter)
753 log << "Reads containing adapter sequence\t" << c["adapter"] << "\n";
754 log
755 << "Ambiguous Barcodes\t" << c["ambiguous"] << "\n"
756 << "Low Quality\t" << c["low_quality"] << "\n"
757 << "Trimmed Reads\t" << c["trimmed"] << "\n"
758 << "Orphaned Paired-ends\t" << c["orphaned"] << "\n"
759 << "Retained Reads\t" << c["retained"] << "\n";
760
761 if (max_bc_size_1 == 0) return 0;
762
763 //
764 // Where barcode filenames specified?
765 //
766 bool bc_names = false;
767 for (uint i = 0; i < barcodes.size(); i++)
768 if (barcodes[i].name_exists()) {
769 bc_names = true;
770 break;
771 }
772
773 //
774 // Print out barcode information.
775 //
776 log << "\n"
777 << "Barcode\t";
778 if (bc_names)
779 log << "Filename\t";
780 log << "Total\t"
781 << "Retained\n";
782
783 set<BarcodePair> barcode_list;
784
785 for (uint i = 0; i < barcodes.size(); i++) {
786 barcode_list.insert(barcodes[i]);
787
788 log << barcodes[i] << "\t";
789 if (bc_names)
790 log << barcodes[i].name << "\t";
791 if (barcode_log.count(barcodes[i]) == 0)
792 log << "0\t" << "0\t" << "0\n";
793 else
794 log << barcode_log[barcodes[i]]["total"] << "\t"
795 << barcode_log[barcodes[i]]["retained"] << "\n";
796 }
797
798 log << "\n"
799 << "Sequences not recorded\n"
800 << "Barcode\t"
801 << "Total\n";
802
803 //
804 // Sort unused barcodes by number of occurances.
805 //
806 map<BarcodePair, map<string, long> >::iterator bit;
807 vector<pair<BarcodePair, int> > bcs;
808 for (bit = barcode_log.begin(); bit != barcode_log.end(); bit++)
809 bcs.push_back(make_pair(bit->first, bit->second["total"]));
810 sort(bcs.begin(), bcs.end(), compare_barcodes);
811
812 for (uint i = 0; i < bcs.size(); i++) {
813 if (barcode_list.count(bcs[i].first)) continue;
814 if (bcs[i].second == 0) continue;
815
816 log << bcs[i].first << "\t"
817 << bcs[i].second << "\n";
818 }
819
820 log.close();
821
822 return 0;
823 }
824
compare_barcodes(pair<BarcodePair,int> a,pair<BarcodePair,int> b)825 int compare_barcodes(pair<BarcodePair, int> a, pair<BarcodePair, int> b) {
826 return a.second > b.second;
827 }
828
parse_command_line(int argc,char * argv[])829 int parse_command_line(int argc, char* argv[]) {
830 int c;
831
832 while (1) {
833 static struct option long_options[] = {
834 {"help", no_argument, NULL, 'h'},
835 {"version", no_argument, NULL, 'v'},
836 {"quality", no_argument, NULL, 'q'},
837 {"clean", no_argument, NULL, 'c'},
838 {"recover", no_argument, NULL, 'r'},
839 {"discards", no_argument, NULL, 'D'},
840 {"paired", no_argument, NULL, 'P'},
841 {"interleaved", no_argument, NULL, 'I'},
842 {"merge", no_argument, NULL, 'm'},
843 {"mate-pair", no_argument, NULL, 'M'},
844 {"no-overhang", no_argument, NULL, 'O'}, {"no_overhang", no_argument, NULL, 'O'},
845 {"filter-illumina", no_argument, NULL, 'F'}, {"filter_illumina", no_argument, NULL, 'F'},
846 {"retain-header", no_argument, NULL, 'H'}, {"retain_header", no_argument, NULL, 'H'},
847 {"no-read-trimming", no_argument, NULL, 'N'}, {"no_read_trimming", no_argument, NULL, 'N'},
848 {"null-index", no_argument, NULL, 'U'}, {"null_index", no_argument, NULL, 'U'},
849 {"index-null", no_argument, NULL, 'u'}, {"index_null", no_argument, NULL, 'u'},
850 {"inline-null", no_argument, NULL, 'V'}, {"inline_null", no_argument, NULL, 'V'},
851 {"index-index", no_argument, NULL, 'W'}, {"index_index", no_argument, NULL, 'W'},
852 {"inline-inline", no_argument, NULL, 'x'}, {"inline_inline", no_argument, NULL, 'x'},
853 {"index-inline", no_argument, NULL, 'Y'}, {"index_inline", no_argument, NULL, 'Y'},
854 {"inline-index", no_argument, NULL, 'Z'}, {"inline_index", no_argument, NULL, 'Z'},
855 {"barcode-dist-1", required_argument, NULL, 'B'}, {"barcode_dist_1", required_argument, NULL, 'B'},
856 {"barcode-dist-2", required_argument, NULL, 'C'}, {"barcode_dist_2", required_argument, NULL, 'C'},
857 {"infile-type", required_argument, NULL, 'i'}, {"infile_type", required_argument, NULL, 'i'},
858 {"outfile-type", required_argument, NULL, 'y'}, {"outfile_type", required_argument, NULL, 'y'},
859 {"file", required_argument, NULL, 'f'},
860 {"file-p1", required_argument, NULL, '1'}, {"file_p1", required_argument, NULL, '1'},
861 {"file-p2", required_argument, NULL, '2'}, {"file_p2", required_argument, NULL, '2'},
862 {"path", required_argument, NULL, 'p'},
863 {"outpath", required_argument, NULL, 'o'},
864 {"truncate", required_argument, NULL, 't'},
865 {"barcodes", required_argument, NULL, 'b'},
866 {"window-size", required_argument, NULL, 'w'}, {"window_size", required_argument, NULL, 'w'},
867 {"score-limit", required_argument, NULL, 's'}, {"score_limit", required_argument, NULL, 's'},
868 {"encoding", required_argument, NULL, 'E'},
869 {"len-limit", required_argument, NULL, 'L'}, {"len_limit", required_argument, NULL, 'L'},
870 {"adapter-1", required_argument, NULL, 'A'}, {"adapter_1", required_argument, NULL, 'A'},
871 {"adapter-2", required_argument, NULL, 'G'}, {"adapter_2", required_argument, NULL, 'G'},
872 {"adapter-mm", required_argument, NULL, 'T'}, {"adapter_mm", required_argument, NULL, 'T'},
873 {0, 0, 0, 0}
874 };
875
876 // getopt_long stores the option index here.
877 int option_index = 0;
878
879 c = getopt_long(argc, argv, "hHvcqrINFuVWxYZOPmDi:y:f:o:t:B:C:b:1:2:p:s:w:E:L:A:G:T:", long_options, &option_index);
880
881 // Detect the end of the options.
882 if (c == -1)
883 break;
884
885 switch (c) {
886 case 'h':
887 help();
888 break;
889 case 'i':
890 if (strcasecmp(optarg, "bustard") == 0)
891 in_file_type = FileT::bustard;
892 else if (strcasecmp(optarg, "bam") == 0)
893 in_file_type = FileT::bam;
894 else if (strcasecmp(optarg, "gzfastq") == 0)
895 in_file_type = FileT::gzfastq;
896 else
897 in_file_type = FileT::fastq;
898 break;
899 case 'y':
900 if (strcasecmp(optarg, "fastq") == 0)
901 out_file_type = FileT::fastq;
902 else if (strcasecmp(optarg, "gzfastq") == 0)
903 out_file_type = FileT::gzfastq;
904 else if (strcasecmp(optarg, "fasta") == 0)
905 out_file_type = FileT::fasta;
906 else if (strcasecmp(optarg, "gzfasta") == 0)
907 out_file_type = FileT::gzfasta;
908 break;
909 case 'E':
910 if (strcasecmp(optarg, "phred64") == 0)
911 qual_offset = 64;
912 else if (strcasecmp(optarg, "phred33") == 0)
913 qual_offset = 33;
914 break;
915 case 'f':
916 in_file = optarg;
917 break;
918 case 'p':
919 in_path_1 = optarg;
920 in_path_2 = in_path_1;
921 break;
922 case '1':
923 paired = true;
924 in_file_p1 = optarg;
925 break;
926 case '2':
927 paired = true;
928 in_file_p2 = optarg;
929 break;
930 case 'P':
931 paired = true;
932 break;
933 case 'I':
934 interleaved = true;
935 break;
936 case 'B':
937 barcode_dist_1 = is_integer(optarg);
938 break;
939 case 'C':
940 barcode_dist_2 = is_integer(optarg);
941 break;
942 case 'o':
943 out_path = optarg;
944 break;
945 case 'm':
946 merge = true;
947 break;
948 case 'M':
949 matepair = true;
950 break;
951 case 'D':
952 discards = true;
953 break;
954 case 'q':
955 quality = true;
956 break;
957 case 'c':
958 clean = true;
959 break;
960 case 'r':
961 recover = true;
962 break;
963 case 'O':
964 overhang = false;
965 break;
966 case 'F':
967 filter_illumina = true;
968 break;
969 case 'H':
970 retain_header = true;
971 break;
972 case 'N':
973 trim_reads = false;
974 break;
975 case 't':
976 truncate_seq = is_integer(optarg);
977 break;
978 case 'b':
979 barcode_file = optarg;
980 if (barcode_type == null_null)
981 barcode_type = inline_null;
982 break;
983 case 'U':
984 barcode_type = null_index;
985 break;
986 case 'u':
987 barcode_type = index_null;
988 break;
989 case 'V':
990 barcode_type = inline_null;
991 break;
992 case 'W':
993 barcode_type = index_index;
994 break;
995 case 'x':
996 barcode_type = inline_inline;
997 break;
998 case 'Y':
999 barcode_type = index_inline;
1000 break;
1001 case 'Z':
1002 barcode_type = inline_index;
1003 break;
1004 case 'A':
1005 adapter_1 = new char[strlen(optarg) + 1];
1006 strcpy(adapter_1, optarg);
1007 filter_adapter = true;
1008 break;
1009 case 'G':
1010 adapter_2 = new char[strlen(optarg) + 1];
1011 strcpy(adapter_2, optarg);
1012 filter_adapter = true;
1013 break;
1014 case 'T':
1015 distance = is_integer(optarg);
1016 break;
1017 case 'L':
1018 len_limit = is_integer(optarg);
1019 break;
1020 case 'w':
1021 win_size = is_double(optarg);
1022 break;
1023 case 's':
1024 score_limit = is_integer(optarg);
1025 break;
1026 case 'v':
1027 version();
1028 break;
1029 case '?':
1030 // getopt_long already printed an error message.
1031 help();
1032 break;
1033
1034 default:
1035 cerr << "Unknown command line option '" << (char) c << "'\n";
1036 help();
1037 exit(1);
1038 }
1039 }
1040
1041 if (optind < argc) {
1042 cerr << "Error: Failed to parse command line: '" << argv[optind] << "' is seen as a positional argument. Expected no positional arguments.\n";
1043 help();
1044 }
1045
1046 if (in_file.length() == 0 && in_path_1.length() == 0 && in_file_p1.length() == 0) {
1047 cerr << "You must specify an input file of a directory path to a set of input files.\n";
1048 help();
1049 }
1050
1051 if (in_file.length() > 0 && in_path_1.length() > 0) {
1052 cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
1053 help();
1054 }
1055
1056 if (in_file.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
1057 cerr << "You must specify either a single input file (-f) or a set of paired files (-1, -2), not both.\n";
1058 help();
1059 }
1060
1061 if (in_path_1.length() > 0 && (in_file_p1.length() > 0 || in_file_p2.length() > 0)) {
1062 cerr << "You must specify either a file path (-p) or a set of paired files (-1, -2), not both.\n";
1063 help();
1064 }
1065
1066 if (in_path_1.length() > 0 && in_path_1.at(in_path_1.length() - 1) != '/')
1067 in_path_1 += "/";
1068
1069 if (in_path_2.length() > 0 && in_path_2.at(in_path_2.length() - 1) != '/')
1070 in_path_2 += "/";
1071
1072 if (out_path.length() == 0)
1073 out_path = ".";
1074
1075 if (out_path.at(out_path.length() - 1) != '/')
1076 out_path += "/";
1077
1078 if (barcode_file.length() == 0) {
1079 overhang = false;
1080 cerr << "No barcodes specified, files will not be demultiplexed.\n";
1081 }
1082
1083 if (barcode_file.length() > 0 && merge) {
1084 cerr << "You may specify a set of barcodes, or that all files should be merged, not both.\n";
1085 help();
1086 }
1087
1088 if (in_file_type == FileT::unknown)
1089 in_file_type = FileT::gzfastq;
1090
1091 if (in_file_type == FileT::bam && paired == true && interleaved == false) {
1092 cerr << "You may only specify a BAM input file for paired-end data if the read pairs are interleaved.\n";
1093 help();
1094 }
1095
1096 if (in_file_type == FileT::bam && (barcode_type != inline_null && barcode_type != inline_inline && barcode_type != null_null)) {
1097 cerr << "For BAM input files only inline or unbarcoded data can be processed.\n";
1098 help();
1099 }
1100
1101 if (score_limit < 0 || score_limit > 40) {
1102 cerr << "Score limit must be between 0 and 40.\n";
1103 help();
1104 }
1105
1106 if (win_size < 0 || win_size >= 1) {
1107 cerr << "Window size is a fraction between 0 and 1.\n";
1108 help();
1109 }
1110
1111 if (recover && barcode_type != null_null) {
1112 if (barcode_type != index_null && barcode_type != inline_null && barcode_dist_2 < 0)
1113 barcode_dist_2 = barcode_dist_1;
1114 }
1115
1116 return 0;
1117 }
1118
version()1119 void version() {
1120 cerr << "process_shortreads " << VERSION << "\n\n";
1121
1122 exit(1);
1123 }
1124
help()1125 void help() {
1126 cerr << "process_shortreads " << VERSION << "\n"
1127 << "process_shortreads [-f in_file | -p in_dir [-P] [-I] | -1 pair_1 -2 pair_2] -b barcode_file -o out_dir [-i type] [-y type] [-c] [-q] [-r] [-E encoding] [-t len] [-D] [-w size] [-s lim] [-h]\n"
1128 << " f: path to the input file if processing single-end seqeunces.\n"
1129 << " i: input file type, either 'bustard' for the Illumina BUSTARD format, 'bam', 'fastq' (default), or 'gzfastq' for gzipped FASTQ.\n"
1130 << " p: path to a directory of single-end Illumina files.\n"
1131 << " 1: first input file in a set of paired-end sequences.\n"
1132 << " 2: second input file in a set of paired-end sequences.\n"
1133 << " P: specify that input is paired (for use with '-p').\n"
1134 << " I: specify that the paired-end reads are interleaved in single files.\n"
1135 << " o: path to output the processed files.\n"
1136 << " y: output type, either 'fastq' or 'fasta' (default gzfastq).\n"
1137 << " b: a list of barcodes for this run.\n"
1138 << " c: clean data, remove any read with an uncalled base.\n"
1139 << " q: discard reads with low quality scores.\n"
1140 << " r: rescue barcodes.\n"
1141 << " t: truncate final read length to this value.\n"
1142 << " E: specify how quality scores are encoded, 'phred33' (Illumina 1.8+/Sanger, default) or 'phred64' (Illumina 1.3-1.5).\n"
1143 << " D: capture discarded reads to a file.\n"
1144 << " w: set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15).\n"
1145 << " s: set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).\n"
1146 << " h: display this help messsage.\n\n"
1147 << " Barcode options:\n"
1148 << " --inline-null: barcode is inline with sequence, occurs only on single-end read (default).\n"
1149 << " --index-null: barcode is provded in FASTQ header (Illumina i5 or i7 read).\n"
1150 << " --null-index: barcode is provded in FASTQ header (Illumina i7 read if both i5 and i7 read are provided).\n"
1151 << " --inline-inline: barcode is inline with sequence, occurs on single and paired-end read.\n"
1152 << " --index-index: barcode is provded in FASTQ header (Illumina i5 and i7 reads).\n"
1153 << " --inline-index: barcode is inline with sequence on single-end read and occurs in FASTQ header (from either i5 or i7 read).\n"
1154 << " --index-inline: barcode occurs in FASTQ header (Illumina i5 or i7 read) and is inline with single-end sequence (for single-end data) on paired-end read (for paired-end data).\n\n"
1155 << " Adapter options:\n"
1156 << " --adapter-1 <sequence>: provide adaptor sequence that may occur on the first read for filtering.\n"
1157 << " --adapter-2 <sequence>: provide adaptor sequence that may occur on the paired-read for filtering.\n"
1158 << " --adapter-mm <mismatches>: number of mismatches allowed in the adapter sequence.\n\n"
1159 << " Output options:\n"
1160 << " --retain-header: retain unmodified FASTQ headers in the output.\n"
1161 << " --merge: if no barcodes are specified, merge all input files into a single output file (or single pair of files).\n\n"
1162 << " Advanced options:\n"
1163 << " --no-read-trimming: do not trim low quality reads, just discard them.\n"
1164 << " --len-limit <limit>: when trimming sequences, specify the minimum length a sequence must be to keep it (default 31bp).\n"
1165 << " --filter-illumina: discard reads that have been marked by Illumina's chastity/purity filter as failing.\n"
1166 << " --barcode-dist: provide the distace between barcodes to allow for barcode rescue (default 2)\n"
1167 << " --mate-pair: raw reads are circularized mate-pair data, first read will be reverse complemented.\n"
1168 << " --no-overhang: data does not contain an overhang nucleotide between barcode and seqeunce.\n";
1169
1170 exit(1);
1171 }
1172