1#!/usr/local/bin/perl
2use strict;
3use warnings;
4use Getopt::Long;
5use IPC::Open3;
6use File::Spec;
7use File::Basename;
8use Cwd;
9
10## This program is Copyright (C) 2012-19, Felix Krueger (felix.krueger@babraham.ac.uk)
11## Edited by Frankie James (github.com/fjames003) for multi-core support
12
13## This program is free software: you can redistribute it and/or modify
14## it under the terms of the GNU General Public License as published by
15## the Free Software Foundation, either version 3 of the License, or
16## (at your option) any later version.
17
18## This program is distributed in the hope that it will be useful,
19## but WITHOUT ANY WARRANTY; without even the implied warranty of
20## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21## GNU General Public License for more details.
22
23## You should have received a copy of the GNU General Public License
24## along with this program. If not, see <http://www.gnu.org/licenses/>.
25
26
27## This script is taking in FastQ sequences and trims them using Cutadapt
28
29## last modified on 07 11 2019
30
31my $DOWARN = 1; # print on screen warning and text by default
32BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
33
34my $trimmer_version = '0.6.4_dev';
35my $cutadapt_version;
36my $python_version;
37
38my ($compression_path,$cores,$cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2,$nextera,$small_rna,$path_to_cutadapt,$illumina,$max_length,$maxn,$trim_n,$hardtrim5,$clock,$polyA,$hardtrim3,$nextseq,$basename,$consider_already_trimmed) = process_commandline();
39my $report_message; # stores result of adapter auto-detection
40
41my @filenames = @ARGV;
42die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n
43USAGE:  'trim_galore [options] <filename(s)>'    or    'trim_galore --help'    for more options\n\n" unless (@filenames);
44file_sanity_check($filenames[0]);
45
46if (defined $hardtrim5){
47    warn "Hard-trimming from the 3'-end selected. File(s) will be trimmed to leave the leftmost $hardtrim5 bp on the 5'-end, and Trim Galore will then exit.\n\n";
48    foreach my $file(@filenames){
49		hardtrim_to_5prime_end($file);
50    }
51    exit;
52}
53
54if (defined $hardtrim3){
55    warn "Hard-trimming from 5'-end selected. File(s) will be trimmed to leave the rightmost $hardtrim3 bp on the 3'-end, and Trim Galore will then exit.\n\n";
56    foreach my $file(@filenames){
57		hardtrim_to_3prime_end($file);
58    }
59    exit;
60}
61
62if ($clock){
63    warn "\nIT'S TIME FOR CLOCK PROCESSING!!!\t\t\t\t\t\t\t\t\t[pun intended]\n\n";
64
65    while (@ARGV){
66		my $in1 = shift @ARGV;
67		my $in2 = shift @ARGV;
68		clockwork($in1,$in2);
69    }
70    warn "\nPre-processing finished...\n\nPlease run Trim Galore again to remove adapters, poor quality bases as well as UMI/fixed sequences from the 3'-end of the reads.\nA sample command for this is:\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\ntrim_galore --paired --three_prime_clip_R1 15 --three_prime_clip_R2 15 *.clock_UMI.R1.fq.gz *.clock_UMI.R2.fq.gz\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nTrim Galore Epigenetic Clock processing complete.\n\n";
71    exit;
72}
73
74sub clockwork{
75
76    ### FILEHANDLES
77    my ($in1_fh,$in2_fh); # input filehandles
78    my ($read1_fh,$read2_fh); # output filehandles
79
80    ### INPUT FILES
81    my ($in1,$in2) = @_;
82    if ($in1 =~ /\.gz$/){
83		open ($in1_fh,"$compression_path -d -c $in1 |") or die "Failed to read from file $in1: $!";
84    }
85    else{
86		open ($in1_fh,$in1) or die "Failed to read from file $in1: $!";
87    }
88    if ($in2 =~ /\.gz$/){
89		open ($in2_fh,"$compression_path -d -c $in2 |") or die "Failed to read from file $in2: $!";
90    }
91    else{
92		open ($in2_fh,$in2) or die "Failed to read from file $in2: $!";
93    }
94    # warn " Input file name 1: $in1\n";
95    # warn " Input file name 2: $in2\n";
96
97    ### OUTPUT FILES
98    my $out1 = (split (/\//,$in1))[-1];
99    my $out2 = (split (/\//,$in2))[-1];
100
101    $out1 =~ s/(\.fastq$|\.fastq\.gz$)//;
102    $out1 =~ s/(\.fq$|\.fq\.gz$)//;
103    $out1 .= '.clock_UMI.R1.fq'; # appending to the end
104
105    $out2 =~ s/(\.fastq$|\.fastq\.gz$)//;
106    $out2 =~ s/(\.fq$|\.fq\.gz$)//;
107    $out2 .= '.clock_UMI.R2.fq'; # appending to the end
108
109
110    ### READ 1
111    if ($gzip or $in1 =~ /\.gz$/){
112		if ($dont_gzip){
113			open ($read1_fh,'>',$output_dir.$out1) or die "Can't open '$out1': $!\n";
114		}
115		else{
116			$out1 .= '.gz';
117			open ($read1_fh,"| $compression_path -c - > ${output_dir}${out1}") or die "Can't write to '$out1': $!\n";
118		}
119    }
120    else{
121		open ($read1_fh,'>',$output_dir.$out1) or die "Can't open '$out1': $!\n";
122    }
123    warn "Writing dual trimmed version of the input file '$in1' to '$out1'\n";
124
125    ### READ 2
126    if ($gzip or $in2 =~ /\.gz$/){
127		if ($dont_gzip){
128			open ($read2_fh,'>',$output_dir.$out2) or die "Can't open '$out2': $!\n";
129		}
130		else{
131			$out2 .= '.gz';
132			open ($read2_fh,"| $compression_path -c - > ${output_dir}${out2}") or die "Can't write to '$out2': $!\n";
133		}
134    }
135    else{
136		open ($read2_fh,'>',$output_dir.$out2) or die "Can't open '$out2': $!\n";
137    }
138    warn "Writing dual trimmed version of the input file '$in2' to '$out2'\n                 ---\n";
139    # warn "Output file name 1: $out1\n";
140    # warn "Output file name 2: $out2\n";
141
142
143    my %freqs;
144    my $umi_1;
145    my $umi_2;
146
147    # open ($out2,"| $compression_path -c - > $out1") or die $!;
148    # open (OUT2,"| $compression_path -c - > $out2") or die $!;
149
150    # print "Processing files $in1 and $in2\n";
151    my %r1; # storing the barcodes for R1
152    my %r2; # storing the barcodes for R2
153
154    my %fix1; # storing the fixed sequence (CAGT + A from A-tailing) of R1
155    my %fix2; # storing the fixed sequence (CAGT + A from A-tailing) of R2
156
157    my $count = 0;
158    my $filtered_count = 0;
159    my $r1_contains_rc = 0;
160
161	ORANGE: while (1){
162
163		my $one1 = <$in1_fh>;
164		my $one2 = <$in1_fh>;
165		my $one3 = <$in1_fh>;
166		my $one4 = <$in1_fh>;
167
168		my $two1 = <$in2_fh>;
169		my $two2 = <$in2_fh>;
170		my $two3 = <$in2_fh>;
171		my $two4 = <$in2_fh>;
172
173		last unless  ($one4 and $two4);
174		chomp $one2; # sequence
175		chomp $two2; # sequence
176
177		chomp $one1; # read ID, need to append UMIs to the read ID
178		chomp $two1; # read ID, need to append UMIs to the read ID
179
180		chomp $one4; # quality
181		chomp $two4; # quality
182
183		++$count; # sequence count
184
185		if ($count % 1000000 ==0){
186			warn "Processed $count sequences so far...\n";
187		}
188		my $r1_barcode;
189		my $r2_barcode;
190		my $r1_fix;
191		my $r2_fix;
192
193
194		$r1_barcode = substr($one2,0,8);
195		$r2_barcode = substr($two2,0,8);
196		$r1_fix = substr($one2,8,4);
197		$r2_fix = substr($two2,8,4);
198		# warn "$one2\n$two2\n$r1_barcode\t$r1_fix\n$r2_barcode\t$r2_fix\n\n";sleep(1);
199
200		# this part of code simply counts the different types of sequence that occurred at the constant region where we expected
201		# to read CAGT in both reads. Both R1 and R2 are taken into account at the same time
202		unless ($r1_fix eq 'CAGT' and $r2_fix eq 'CAGT'){
203			$freqs{$r1_fix}++;
204			$freqs{$r2_fix}++;
205			$filtered_count++;
206		}
207
208		### BOTH READ1 AND READ2 should look like this:
209
210		# 0        8    12 13                                    INDEX position
211		# UUUUUUUU CAGT A  FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF   SEQUENCE
212
213		# where:
214		#    U = UMI base
215		# CAGT = fixed sequence
216		#    A = A-tail
217		# FFFF = RRBS-fragment to be aligned
218
219		### Capturing the sequence after the A from A-tailing (FFFFFFFFFFFFFF...)
220
221		my $seq1 = substr($one2,13); # truncated sequence without UMIs or fixed sequence
222		my $seq2 = substr($two2,13);
223
224		my $qual1 = substr($one4,13); # truncated quality string without barcode or fixed sequence
225		my $qual2 = substr($two4,13);
226
227		# warn "$one1\n";
228		$one1 .= ":R1:${r1_barcode}:R2:${r2_barcode}:F1:${r1_fix}:F2:${r2_fix}";
229		#warn "$one1\n";
230
231		#warn "$two1\n";
232		$two1 .= ":R1:${r1_barcode}:R2:${r2_barcode}:F1:${r1_fix}:F2:${r2_fix}";
233		#warn "$two1\n";
234		#   warn "$one2\n          $seq1\n$one4\n          $qual1\n~~\n$two2\n          $seq2\n$two4\n          $qual2\n\n";sleep(1);
235
236		print ${read1_fh} "$one1\n";
237		print ${read1_fh} "$seq1\n";
238		print ${read1_fh} "+\n";      # replacing this with a + for space and format reasons
239		print ${read1_fh} "$qual1\n";
240
241		print ${read2_fh} "$two1\n";
242		print ${read2_fh} "$seq2\n";
243		print ${read2_fh} "+\n";      # replacing this with a + for space and format reasons
244		print ${read2_fh} "$qual2\n";
245
246		# sleep(1);
247
248		$r1{$r1_barcode}++;
249		$r2{$r2_barcode}++;
250
251		$fix1{$r1_fix}++;
252		$fix2{$r2_fix}++;
253    }
254
255    my $perc;
256    if ($count){
257		$perc = sprintf("%.2f",$filtered_count/$count * 100);
258    }
259    else{
260		$perc = 'N/A';
261    }
262
263    warn "Sequences processed in total: $count\nthereof had fixed sequence CAGT in both R1 and R2:\t $filtered_count ($perc%)\n     ~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
264}
265
266sub rc{
267    my $string = shift;
268    $string = reverse($string);
269    $string =~ tr/GATC/CTAG/;
270    return $string;
271}
272
273
274########################################################################
275
276
277my $path_to_fastqc = 'fastqc';
278
279
280########################################################################
281
282
283
284
285### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED
286unless (defined $cutoff){
287	unless ($nextseq){
288       $cutoff = 20;
289    }
290}
291
292my $phred_score_cutoff = $cutoff; # only relevant for report
293my $adapter_name = '';
294unless (defined $adapter){
295    if ($nextera){
296		$adapter = 'CTGTCTCTTATA';
297		$adapter_name = 'Nextera Transposase sequence; user defined';
298    }
299    elsif($small_rna){
300		$adapter = 'TGGAATTCTCGG';
301		$adapter_name = 'Illumina small RNA adapter; user defined';
302    }
303    elsif($illumina){
304		$adapter = 'AGATCGGAAGAGC';
305		$adapter_name = 'Illumina TruSeq, Sanger iPCR; user defined';
306    }
307    else{ # default
308		### If other -a and/or -a2 were given
309		if ($polyA){ # specialised PolyA trimming
310			($adapter,$adapter_name) = autodetect_polyA_type();
311
312			if ($validate){ # we need to select -a2 as the reverse complement to -a
313				if ($adapter =~ /A+/){
314					$adapter =  extend_adapter_sequence("A",20);  # defaulting to 20
315					$a2 = extend_adapter_sequence("T",150); # defaulting to 150 bp
316				}
317				elsif($adapter =~ /T+/){
318					$adapter =  extend_adapter_sequence("T",20);  # defaulting to 20
319					$a2 = extend_adapter_sequence("A",150);
320				}
321				else{
322					die "Something unexpected happened with the Poly-A autodetection, please check\n";
323				}
324				# warn "Paired-end Poly-A detection, set the following parameters:\n";
325				# warn " -a: $adapter\n-a2: $a2\n   ~~~~~~~~~~ \n\n";sleep(1);
326			}
327			else{
328				# Single end
329				if ($adapter =~ /A+/){
330					$adapter =  extend_adapter_sequence("A",20);  # defaulting to 20
331				}
332				elsif($adapter =~ /T+/){
333					$adapter =  extend_adapter_sequence("T",20);  # defaulting to 20
334				}
335				else{
336					die "Something unexpected happened with the Poly-A autodetection, please check\n";
337				}
338			}
339		}
340		else{ # ADAPTER TRIMMING; DEFAULT
341			($adapter,$adapter_name,$report_message) = autodetect_adapter_type();
342		}
343    }
344}
345else{
346    $adapter_name = 'user defined';
347}
348
349### For smallRNA adapters we are reducing the sequence length cutoff before a sequences gets thrown out entirely to 18bp. We are doing this because some 20-23bp long smallRNAs
350### may be removed if T, TG, or TGG etc gets trimmed off the end (changed b ack up from 16 to 18bp to remove noise from alignment files, 18 11 2015)
351if ($adapter eq 'TGGAATTCTCGG'){
352    unless (defined $length_cutoff){ # user defined length cutoff wins over auto-detection
353		$length_cutoff = 18;
354		warn "Reducing length cutoff to 18bp for small RNA-Seq reads because a cutoff of 20bp may remove some short species of small RNAs if they had been trimmed by 1,2 or 3bp\n";
355    }
356
357    ### If the file is a smallRNA library and paired-end we set the Illumina 5' adapter as the $a2 sequence
358    if ($validate){
359		unless (defined $a2){
360			$a2 = 'GATCGTCGGACT';
361			warn "Setting the Illumina smallRNA 5' adapter as adapter 2: 'GATCGTCGGACT'\n";
362		}
363    }
364}
365
366unless (defined $length_cutoff){
367    $length_cutoff = 20; # non small RNA length cutoff
368}
369
370
371unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming
372	$a2 = '';
373}
374
375unless (defined $stringency){
376	$stringency = 1;
377}
378
379if ($phred_encoding == 64){
380    $cutoff += 31;
381}
382
383
384my $file_1;
385my $file_2;
386
387foreach my $filename (@ARGV){
388    trim ($filename);
389}
390
391
392sub trim{
393    my $filename = shift;
394
395    my $output_filename = (split (/\//,$filename))[-1];
396
397    my $report = $output_filename;
398    $report =~ s/$/_trimming_report.txt/;
399
400
401	if ($no_report_file) {
402		$report = File::Spec->devnull;
403		open (REPORT,'>',$report) or die "Failed to write to file '$report': $!\n";
404		# warn "Redirecting report output to /dev/null\n";
405	}
406    else{
407		open (REPORT,'>',$output_dir.$report) or die "Failed to write to file '$report': $!\n";
408		warn "Writing report to '$output_dir$report'\n";
409    }
410
411    warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
412    print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
413
414    if ($validate){ # paired-end mode
415		warn "Trimming mode: paired-end\n";
416		print REPORT "Trimming mode: paired-end\n";
417    }
418    else{
419		warn "Trimming mode: single-end\n";
420		print REPORT "Trimming mode: single-end\n";
421    }
422
423    warn "Trim Galore version: $trimmer_version\n";
424    print REPORT "Trim Galore version: $trimmer_version\n";
425
426    warn "Cutadapt version: $cutadapt_version\n";
427    print REPORT "Cutadapt version: $cutadapt_version\n";
428
429	if (defined $python_version){
430		warn "Python version: $python_version\n";
431		print REPORT "Python version: $python_version\n";
432	}
433
434    if (defined $cores){
435		my $temp = $cores;
436		$temp =~ s/-j //;
437        warn "Number of cores used for trimming: $temp\n";
438		print REPORT "Number of cores used for trimming: $temp\n";
439    }
440
441    if (defined $phred_score_cutoff){
442        warn "Quality Phred score cutoff: $phred_score_cutoff\n";
443		print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n";
444    }
445
446
447    warn "Quality encoding type selected: ASCII+$phred_encoding\n";
448    print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n";
449
450    if ($report_message){
451    	# warn "adding Auto-detection statement\n";
452    	print REPORT $report_message;
453    }
454
455    warn "Adapter sequence: '$adapter' ($adapter_name)\n";
456    print REPORT "Adapter sequence: '$adapter' ($adapter_name)\n";
457
458    if ($error_rate == 0.1){
459        warn "Maximum trimming error rate: $error_rate (default)\n";
460    }
461    else{
462        warn "Maximum trimming error rate: $error_rate\n";
463    }
464
465    if (defined $maxn){
466        warn "Maximum number of tolerated Ns: $maxn\n";
467    }
468
469    if ($nextseq){
470        warn "2-colour high quality G-trimming enabled, with quality cutoff: $nextseq\n";
471        print REPORT "2-colour high quality G-trimming enabled, with quality cutoff: $nextseq\n";
472    }
473
474    print REPORT "Maximum trimming error rate: $error_rate";
475	if ($error_rate == 0.1){
476		print REPORT " (default)\n";
477    }
478    else{
479		print REPORT "\n";
480    }
481
482    if ($a2){
483        warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
484        print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
485    }
486
487    warn "Minimum required adapter overlap (stringency): $stringency bp\n";
488    print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n";
489
490
491    if (defined $consider_already_trimmed){
492    	warn "During adapter auto-detection, files are considered already adapter-trimmed if the highest found adapter was found equal to or lower than: $consider_already_trimmed\n";
493	    print REPORT "During adapter auto-detection, files are considered already adapter-trimmed if the highest found adapter was found equal to or lower than: $consider_already_trimmed\n";
494    }
495
496    if ($validate){
497        warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
498        print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
499    }
500    else{
501        warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
502        print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
503    }
504
505    if ($max_length){
506        warn "Maxiumum tolerated read length after trimming (for smallRNA trimming): $max_length bp\n";
507        print REPORT "Maxiumum tolerated read length after trimming (for smallRNA trimming): $max_length bp\n";
508    }
509
510    if ($trim_n){
511        warn "Removing Ns from the start and end of reads\n";
512        print REPORT "Removing Ns from the start and end of reads\n";
513    }
514
515
516    if ($validate){ # only for paired-end files
517
518        if ($retain){ # keeping single-end reads if only one end is long enough
519
520            if ($length_read_1 == 35){
521                warn "Length cut-off for read 1: $length_read_1 bp (default)\n";
522                print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n";
523            }
524            else{
525                warn "Length cut-off for read 1: $length_read_1 bp\n";
526                print REPORT "Length cut-off for read 1: $length_read_1 bp\n";
527            }
528
529            if ($length_read_2 == 35){
530                warn "Length cut-off for read 2: $length_read_2 bb (default)\n";
531                print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n";
532            }
533            else{
534                warn "Length cut-off for read 2: $length_read_2 bp\n";
535                print REPORT "Length cut-off for read 2: $length_read_2 bp\n";
536            }
537        }
538    }
539
540    if ($rrbs){
541        warn "File was specified to be an MspI-digested RRBS sample. Read 1 sequences with adapter contamination will be trimmed a further 2 bp from their 3' end, and Read 2 sequences will be trimmed by 2 bp from their 5' end to remove potential methylation-biased bases from the end-repair reaction\n";
542        print REPORT "File was specified to be an MspI-digested RRBS sample. Read 1 sequences with adapter contamination will be trimmed a further 2 bp from their 3' end, and Read 2 sequences will be trimmed by 2 bp from their 5' end to remove potential methylation-biased bases from the end-repair reaction\n";
543    }
544
545    if ($non_directional){
546        warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
547        print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
548    }
549
550    if ($trim){
551        warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
552        print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
553    }
554
555    if ($clip_r1){
556    warn "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
557    print REPORT "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
558    }
559    if ($clip_r2){
560		warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
561		print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
562	}
563
564    if ($three_prime_clip_r1){
565		warn "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n";
566		print REPORT "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n";
567    }
568    if ($three_prime_clip_r2){
569		warn "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n";
570		print REPORT "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n";
571    }
572
573    if ($fastqc){
574        warn "Running FastQC on the data once trimming has completed\n";
575        print REPORT "Running FastQC on the data once trimming has completed\n";
576
577        if ($fastqc_args){
578            warn "Running FastQC with the following extra arguments: '$fastqc_args'\n";
579            print REPORT  "Running FastQC with the following extra arguments: $fastqc_args\n";
580        }
581    }
582
583    if ($keep and $rrbs){
584        warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
585        print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
586    }
587
588
589    if ($gzip or $filename =~ /\.gz$/){
590        $gzip = 1;
591        unless ($dont_gzip){
592            warn "Output file(s) will be GZIP compressed\n";
593            print REPORT "Output file will be GZIP compressed\n";
594        }
595    }
596
597
598    warn "\n";
599    print REPORT "\n";
600    # sleep (3);
601
602    my $temp;
603
604	### We need to make sure that Cutadapt still runs if users use a fairly old version of Cutadapt, as multi-core handling is only supported sincce
605	### Cutadapt version 1.15. Edited on 08 03 2019
606	# if the Cutadapt version has more . in the version, we discard the second one.
607	# warn "Version was: $cutadapt_version\n";
608	if ($cutadapt_version =~ /(\d+\.\d+)\.\d+/){
609		$cutadapt_version = $1;
610	}
611	# warn "Version now is: $cutadapt_version\n";
612	my ($major_version,$sub_version) = ($1,$2) if ($cutadapt_version =~ /^(\d+)\.(\d+)$/);
613	# warn "Major: $major_version\nSub version: $sub_version\n";
614	# sleep(3);
615	if ($major_version == 1){ # versions prior to 1.15 did not support the -j option
616		if ($sub_version < 15){
617			if ($cores > 1){
618				die "I'm sorry but your version of Cutadapt is too old to support multi-core trimming. Please update Cutadapt, and try again\n\n";
619			}
620			else{ #default single-core prcessing with old version of Cutadapt
621				warn "Your version of Cutadapt is fairly old (detected v$cutadapt_version), please consider updating Cutadapt!\n";
622				$cores = ""; # need to delete -j entirely as Cutadapt
623			}
624		}
625		else{
626			warn "Cutadapt seems to be reasonably up-to-date. Setting -j $cores\n";
627			unless ($cores =~ /^-j \d+$/){ # if it was set before, don't change it again
628				$cores = "-j $cores";
629			}
630		}
631	}
632	elsif ($major_version > 1){
633		warn "Cutadapt seems to be fairly up-to-date (version $cutadapt_version). Setting -j $cores\n";
634		unless ($cores =~ /^-j \d+$/){ # if it was set before, don't change it again
635			$cores = "-j $cores";
636		}
637	}
638	else{
639		die "Cutadapt major version was not 1 or higher. Simply too old...\n";
640	}
641
642    ### Proceeding differently for RRBS and other type of libraries
643    if ($rrbs){
644
645		### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified
646		if ($cutoff == 0){
647			warn "Quality cutoff selected was 0    -    Skipping quality trimming altogether\n\n";
648			# sleep (3);
649		}
650		else{
651
652			$temp = $filename;
653			$temp =~ s/^.*\///; # replacing optional file path information
654			$temp =~ s/$/_qual_trimmed.fastq/;
655			open (TEMP,'>',$output_dir.$temp) or die "Can't write to '$temp': $!";
656
657			warn "  >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n";
658			# sleep (1);
659
660			open (QUAL,"$path_to_cutadapt  $cores -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!";
661
662			my $qual_count = 0;
663
664			while (1){
665				my $l1 = <QUAL>;
666				my $seq = <QUAL>;
667				my $l3 = <QUAL>;
668				my $qual = <QUAL>;
669				last unless (defined $qual);
670
671				$qual_count++;
672				if ($qual_count%10000000 == 0){
673					warn "$qual_count sequences processed\n";
674				}
675				print TEMP "$l1$seq$l3$qual";
676			}
677
678			warn "\n  >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n";
679			close QUAL or die "Unable to close QUAL filehandle: $!\n";
680
681		}
682	}
683
684
685	if ($output_filename =~ /\.fastq$/){
686		$output_filename =~ s/\.fastq$/_trimmed.fq/;
687	}
688	elsif ($output_filename =~ /\.fastq\.gz$/){
689		$output_filename =~ s/\.fastq\.gz$/_trimmed.fq/;
690	}
691	elsif ($output_filename =~ /\.fq$/){
692		$output_filename =~ s/\.fq$/_trimmed.fq/;
693	}
694	elsif ($output_filename =~ /\.fq\.gz$/){
695		$output_filename =~ s/\.fq\.gz$/_trimmed.fq/;
696	}
697	else{
698		$output_filename =~ s/$/_trimmed.fq/;
699	}
700
701	if ($gzip or $filename =~ /\.gz$/){
702		if ($dont_gzip){
703			open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file
704		}
705		else{
706			### 6 Jan 2014: had a request to also gzip intermediate files to save disk space
707			#  if ($validate){
708			# open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file
709			# }
710			$output_filename .= '.gz';
711			open (OUT,"| $compression_path -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n";
712		}
713	}
714	else{
715		open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n";
716	}
717	warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
718
719	my $count = 0;
720	my $too_short = 0;
721	my $too_long = 0;
722	my $too_many_n = 0;
723	my $quality_trimmed = 0;
724	my $rrbs_trimmed = 0;
725	my $rrbs_trimmed_start = 0;
726	my $CAA = 0;
727	my $CGA = 0;
728
729	my $pid;
730
731	if ($rrbs and $cutoff != 0){
732
733		### optionally using 2 different adapters for read 1 and read 2
734		if ($validate and $a2){
735			### Figure out whether current file counts as read 1 or read 2 of paired-end files
736			if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
737				warn "\n  >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
738				#sleep (1);
739				$pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt  $cores -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
740			}
741			else{                            # this is read 2 of a pair
742				warn "\n  >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n";
743				#sleep (1);
744				$pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt  $cores -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
745			}
746		}
747		### Using the same adapter for both read 1 and read 2
748		else{
749			warn "\n  >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
750			# sleep (3);
751			$pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt  $cores -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
752		}
753
754		close WRITER or die $!; # not needed
755
756		open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file
757
758		if ($filename =~ /\.gz$/){
759			open (IN,"gunzip -c $filename |") or die $!; # original, untrimmed file
760		}
761		else{
762			open (IN,$filename) or die $!; # original, untrimmed file
763		}
764
765		while (1){
766
767			# we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not
768			my $l1 = <TRIM>;
769			my $seq = <TRIM>; # adapter trimmed sequence
770			my $l3 = <TRIM>;
771			my $qual = <TRIM>;
772
773			$_ = <IN>;   # irrelevant
774			my $original_seq = <IN>;
775			$_ = <IN>;   # irrelevant
776			$_ = <IN>;   # irrelevant
777
778			$_ = <QUAL>; # irrelevant
779			my $qual_trimmed_seq = <QUAL>;
780			$_ = <QUAL>; # irrelevant
781			my $qual_trimmed_qual = <QUAL>;
782
783			last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings
784
785			$count++;
786			if ($count%10000000 == 0){
787				warn "$count sequences processed\n";
788			}
789
790			chomp $seq;
791			chomp $qual;
792			chomp $qual_trimmed_seq;
793			chomp $original_seq;
794
795			my $quality_trimmed_seq_length = length $qual_trimmed_seq;
796
797			if (length $original_seq > length $qual_trimmed_seq){
798				++$quality_trimmed;
799			}
800
801			my $nd = 0;
802
803			### NON-DIRECTIONAL RRBS
804			if ($non_directional){
805				$nd = 1;
806				if (length$seq > 2){ # only doing something if the read is longer than 2bp
807
808					# only trimming Read 1 of a pair for a further 2bp from their 3' end
809					if ($seq =~ /^CAA/){ # this might be a non-directional sequence
810						++$CAA;
811						$seq = substr ($seq,2,length($seq)-2);
812						$qual = substr ($qual,2,length($qual)-2);
813						++$rrbs_trimmed_start;
814					}
815					elsif ($seq =~ /^CGA/){ # this might be a non-directional sequence
816						$seq = substr ($seq,2,length($seq)-2);
817						$qual = substr ($qual,2,length($qual)-2);
818						++$CGA;
819						++$rrbs_trimmed_start;
820					}
821					else{
822						# If the reads look like standard OT/OB sequences (CGG/TGG)
823						# we need to trim in the same way as for directional sequences
824						if (length$seq < $quality_trimmed_seq_length){
825							$seq = substr ($seq,0,length($seq)-2);
826							$qual = substr ($qual,0,length($qual)-2);
827							++$rrbs_trimmed;
828						}
829					}
830				}
831				else{
832					# read is too short anyway (so it will probably not survive the length filtering step
833				}
834			}
835
836			### directional read
837			unless ($nd == 1){
838				# only trimming Read 1 of a pair for a further 2bp from their 3' end
839				if ($validate){ # paired end
840					if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
841						if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){
842							$seq = substr ($seq,0,length($seq)-2);
843							$qual = substr ($qual,0,length($qual)-2);
844							++$rrbs_trimmed;
845						}
846					}
847					else{
848						# this is read 2 of a pair. We do not trim further from the 3' end but rather trim R2 from the 5' end later
849					}
850				}
851				else{ # single-end reads will be trimmed from their 3' end
852					if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){
853						$seq = substr ($seq,0,length($seq)-2);
854						$qual = substr ($qual,0,length($qual)-2);
855						++$rrbs_trimmed;
856					}
857				}
858			}
859
860			### Shortening all sequences by 1 bp on the 3' end
861			# 28 02 2019: This was really only required for Bowtie 1 paired-end alignments, maybe we should drop this option in soon
862			if ($trim){
863				$seq = substr($seq,0,length($seq)-1);
864				$qual = substr($qual,0,length($qual)-1);
865			}
866
867			### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
868			if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
869				print OUT "$l1$seq\n$l3$qual\n";
870			}
871			else{ # single end
872
873				if ($clip_r1){
874					if (length $seq > $clip_r1){  # sequences that are already too short won't be clipped again
875						$seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
876						$qual = substr($qual,$clip_r1);
877					}
878				}
879
880				if ($three_prime_clip_r1){
881					if (length $seq > $three_prime_clip_r1){  # sequences that are already too short won't be clipped again
882					# warn	 "seq/qual before/after trimming:\n$seq\n$qual\n";
883						$seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
884						$qual = substr($qual,0,(length($qual) - $three_prime_clip_r1 ));
885						# warn "$seq\n$qual\n";
886					}
887				}
888
889				if (defined $maxn){
890					my $n_count = Ncounter($seq);
891					# warn "Checking for Ns: Found $n_count\n";
892					if ($n_count > $maxn){
893						++$too_many_n;
894						next;
895					}
896				}
897
898				if (length $seq < $length_cutoff){
899					++$too_short;
900					next;
901				}
902				elsif($max_length and length$seq > $max_length){
903					++$too_long;
904					next; # sequence is too long
905				}
906				else{
907					print OUT "$l1$seq\n$l3$qual\n";
908				}
909			}
910		}
911
912		print REPORT "\n";
913		while (<ERROR>){
914			warn $_;
915			print REPORT $_;
916		}
917
918		close IN or die "Unable to close IN filehandle: $!";
919		close QUAL or die "Unable to close QUAL filehandle: $!";
920		close TRIM or die "Unable to close TRIM filehandle: $!";
921		close OUT or die  "Unable to close OUT filehandle: $!";
922
923	}
924    ############################################################################################################
925
926    elsif($polyA){ # PolyA trimming
927
928		warn "POLY-A TRIMMING MODE; EXPERIMENTAL!!\n";
929		my $isR2 = 0;
930
931		# For the moment we set the temp file name back to $filename
932		$temp = $filename;
933
934		### optionally using 2 different adapters for read 1 and read 2
935		if ($validate and $a2){
936			### Figure out whether current file counts as read 1 or read 2 of paired-end files
937			if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
938				warn "\n  >>> Now performing Poly-A trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
939				$pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt  $cores -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
940			}
941			else{                            # this is read 2 of a pair
942				$isR2 = 1;
943				warn "\n  >>> Now performing Poly-A trimming for the adapter sequence: '$a2' from file $temp <<< \n";
944				# For Read 2 we need to trim the PolyT (or PolyA) from the 5' end instead! Hence -g $a2 and not -a!
945				$pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt  $cores -e $error_rate -O $stringency -g $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
946			}
947		}
948		### Using the same adapter for both read 1 and read 2 - Single end will use the same adapters are Read 1s for paired-end files
949		else{
950			warn "\n  >>> Now performing single-end Poly-A trimming for with the sequence: '$adapter' from file $temp <<< \n";
951			# sleep (3);
952			$pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt  $cores -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
953		}
954
955		close WRITER or die $!; # not needed
956
957		# This is the Illumina adapter trimmed file
958		if ($temp =~ /\.gz$/){
959			open (QUAL,"gunzip -c $output_dir$temp |") or die $!; # quality trimmed file
960		}
961		else{
962			open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file
963		}
964
965		while(1){
966			# we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not
967			my $l1 = <TRIM>;
968			my $seq = <TRIM>; # adapter trimmed sequence
969			my $l3 = <TRIM>;
970			my $qual = <TRIM>;
971
972			#   $_ = <IN>;   # irrelevant
973			#   my $original_seq = <IN>;
974			#   $_ = <IN>;   # irrelevant
975			#   $_ = <IN>;   # irrelevant
976
977			$_ = <QUAL>; # irrelevant
978			my $qual_trimmed_seq = <QUAL>;
979			$_ = <QUAL>; # irrelevant
980			my $qual_trimmed_qual = <QUAL>;
981
982			last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings
983
984			$count++;
985			if ($count%10000000 == 0){
986				warn "$count sequences processed\n";
987			}
988			chomp $l1;
989			chomp $seq;
990			chomp $qual;
991			chomp $qual_trimmed_seq;
992			# chomp $original_seq;
993
994			my $quality_trimmed_seq_length = length $qual_trimmed_seq;
995
996			#    if (length $original_seq > length $qual_trimmed_seq){
997			#		++$quality_trimmed;
998			#	     }
999
1000			my $diff = length($qual_trimmed_seq) - length($seq);
1001
1002			### CHANGING THE readID to remove white spaces and adding PolyA trimming information
1003			$l1 =~ s/\s+/_/g; # removing white spaces from readID
1004			if ($diff > 0){ # only adding the PolyA removal tag if some A's were really removed, so we can filter these out later if desired
1005				# warn "$l1\n";
1006				$l1 =~ s/\@/\@${diff}:A:/; # also adding the PolyA trimmed reads to the start of the read in the format
1007				# "trimmed_bases:A:" as this is the current Way Thermo Fisher are handling it
1008				$l1 .= "_PolyA:$diff";
1009				# warn "$l1\n~~~~~~~~~~~~~\n"; sleep(1);
1010			}
1011
1012			### Shortening all sequences by 1 bp on the 3' end - This is probably no longer needed since we have stopped using Bowtie (1)
1013			if ($trim){
1014				$seq = substr($seq,0,length($seq)-1);
1015				$qual = substr($qual,0,length($qual)-1);
1016			}
1017
1018			### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
1019			if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
1020				if ($isR2){
1021					#print "$l1\n$qual_trimmed_seq\n$seq\n$l3$qual_trimmed_qual\n$qual\n~~~~~\n";
1022				   # print "length original: ",length($qual_trimmed_seq) , "\nlength trimmed:  ", length($seq) , "\nDifference: $diff bp\n   ~~~\n\n"; sleep(1);
1023				}
1024				print OUT "$l1\n$seq\n$l3$qual\n";
1025			}
1026			else{ # single end
1027
1028				if ($clip_r1){
1029					if (length $seq > $clip_r1){  # sequences that are already too short won't be clipped again
1030						$seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
1031						$qual = substr($qual,$clip_r1);
1032					}
1033				}
1034
1035				if ($three_prime_clip_r1){
1036					if (length $seq > $three_prime_clip_r1){  # sequences that are already too short won't be clipped again
1037						# warn "seq/qual before/after trimming:\n$seq\n$qual\n";
1038						$seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
1039						$qual = substr($qual,0,(length($qual) - $three_prime_clip_r1 ));
1040						# warn "$seq\n$qual\n";
1041					}
1042				}
1043
1044				if (length $seq < $length_cutoff){
1045					++$too_short;
1046					next;
1047				}
1048				elsif($max_length and length$seq > $max_length){
1049					++$too_long;
1050					next; # sequence is too long
1051				}
1052				else{
1053					print OUT "$l1\n$seq\n$l3$qual\n";
1054				}
1055			}
1056		}
1057
1058		print REPORT "\n";
1059		while (<ERROR>){
1060			warn $_;
1061			print REPORT $_;
1062		}
1063
1064		close QUAL or die "Unable to close QUAL filehandle: $!";
1065		close TRIM or die "Unable to close TRIM filehandle: $!";
1066		close OUT or die  "Unable to close OUT filehandle: $!";
1067    }
1068
1069    ############################################################################################################
1070    else{ # non-RRBS mode. default
1071
1072    	my $quality_cutoff;
1073
1074		if ($nextseq){
1075      		$quality_cutoff = $nextseq;
1076      	}
1077      	else{
1078      		$quality_cutoff = "-q $cutoff";
1079      	}
1080
1081		### optionally using 2 different adapters for read 1 and read 2
1082		if ($validate and $a2){
1083
1084			### Figure out whether current file counts as read 1 or read 2 of paired-end files
1085			if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
1086				warn "\n  >>> Now performing quality (cutoff '$quality_cutoff') and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
1087				#sleep (1);
1088				$pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt  $cores -e $error_rate $quality_cutoff -O $stringency $trim_n -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
1089			}
1090			else{                            # this is read 2 of a pair
1091				warn "\n  >>> Now performing quality (cutoff '$quality_cutoff') and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n";
1092				#sleep (1);
1093				$pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt  $cores -e $error_rate $quality_cutoff -O $stringency $trim_n -a $a2 $filename") or die "Failed to launch Cutadapt: $!";
1094			}
1095		}
1096		### Using the same adapter for both read 1 and read 2
1097		else{
1098			warn "\n  >>> Now performing quality (cutoff '$quality_cutoff') and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
1099			#sleep (1);
1100			$pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt  $cores -e $error_rate $quality_cutoff -O $stringency $trim_n -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
1101		}
1102
1103		close WRITER or die $!; # not needed
1104
1105		while (1){
1106
1107			my $l1 = <TRIM>;
1108			my $seq = <TRIM>; # quality and/or adapter trimmed sequence
1109			my $l3 = <TRIM>;
1110			my $qual = <TRIM>;
1111			# print "$l1$seq\n$l3$qual\n";
1112			last unless (defined $qual); # could be an empty string
1113
1114			$count++;
1115			if ($count%10000000 == 0){
1116				warn "$count sequences processed\n";
1117			}
1118
1119			chomp $seq;
1120			chomp $qual;
1121
1122			### Shortening all sequences by 1 bp on the 3' end
1123			if ($trim){
1124				$seq = substr($seq,0,length($seq)-1);
1125				$qual = substr($qual,0,length($qual)-1);
1126			}
1127
1128			### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
1129			if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
1130				print OUT "$l1$seq\n$l3$qual\n";
1131			}
1132			else{ # single end
1133
1134				if ($clip_r1){
1135					if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again
1136						$seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
1137						$qual = substr($qual,$clip_r1);
1138					}
1139				}
1140
1141				if ($three_prime_clip_r1){
1142					if (length $seq > $three_prime_clip_r1){  # sequences that are already too short won't be clipped again
1143						# warn "seq/qual before/after trimming:\n$seq\n$qual\n";
1144						$seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
1145						$qual = substr($qual,0,(length($qual) - $three_prime_clip_r1));
1146						# warn "$seq\n$qual\n";sleep(1);
1147					}
1148				}
1149
1150				if (defined $maxn){
1151					my $n_count = Ncounter($seq);
1152					# warn "Checking for Ns: Found $n_count\n";
1153					if ($n_count > $maxn){
1154						++$too_many_n;
1155						next;
1156					}
1157				}
1158
1159				if (length $seq < $length_cutoff){
1160					++$too_short;
1161					next;
1162				}
1163				elsif ($max_length and length$seq > $max_length){
1164					++$too_long;
1165					next; # sequence is too long
1166				}
1167				else{
1168					print OUT "$l1$seq\n$l3$qual\n";
1169				}
1170			}
1171		}
1172
1173		print REPORT "\n";
1174		while (<ERROR>){
1175			warn $_;
1176			print REPORT $_;
1177		}
1178
1179		close TRIM or die "Unable to close TRIM filehandle: $!\n";
1180		close ERROR or die "Unable to close ERROR filehandle: $!\n";
1181		close OUT or die  "Unable to close OUT filehandle: $!\n";
1182
1183	}
1184
1185
1186	if ($rrbs){
1187		unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files
1188
1189			# deleting temporary quality trimmed file
1190			my $deleted = unlink "$output_dir$temp";
1191
1192			if ($deleted){
1193				warn "Successfully deleted temporary file $temp\n\n";
1194			}
1195			else{
1196				warn "Could not delete temporary file $temp";
1197			}
1198		}
1199	}
1200
1201	### Wait and reap the child process (Cutadapt) so that it doesn't become a zombie process
1202	waitpid $pid, 0;
1203	unless ($? == 0){
1204		die "\n\nCutadapt terminated with exit signal: '$?'.\nTerminating Trim Galore run, please check error message(s) to get an idea what went wrong...\n\n";
1205	}
1206
1207	warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
1208	print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
1209
1210	warn "="x 45,"\n";
1211	print REPORT "="x 45,"\n";
1212
1213	warn "$count sequences processed in total\n";
1214	print REPORT "$count sequences processed in total\n";
1215
1216	###  only reporting this separately if quality and adapter trimming were performed separately
1217	if ($rrbs){
1218		my $percentage_shortened;
1219		if ($count){
1220			$percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100);
1221			warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
1222			print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
1223		}
1224		else{
1225			warn "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n";
1226			print REPORT "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n";
1227		}
1228	}
1229
1230	my $percentage_too_short;
1231	my $percentage_too_long;
1232	my $percentage_too_many_n;
1233	if ($count){
1234		$percentage_too_short  = sprintf ("%.1f",$too_short/$count*100);
1235		$percentage_too_long   = sprintf ("%.1f",$too_long/$count*100);
1236		$percentage_too_many_n = sprintf ("%.1f",$too_many_n/$count*100);
1237	}
1238	else{
1239		$percentage_too_short = 'N/A';
1240		$percentage_too_long = 'N/A';
1241		$percentage_too_many_n = 'N/A';
1242	}
1243
1244
1245
1246  if ($validate){ ### only for paired-end files
1247      warn "The length threshold of paired-end sequences gets evaluated later on (in the validation step)\n";
1248  }
1249  else{           ### Single-end file
1250      warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
1251      print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
1252      if (defined $maxn){
1253      warn "Sequences removed because they contained more Ns than the cutoff of $maxn:\t$too_many_n ($percentage_too_many_n%)\n";
1254      print REPORT "Sequences removed because they contained more Ns than the cutoff of $maxn:\t$too_many_n ($percentage_too_many_n%)\n";
1255      }
1256      if ($max_length){
1257        warn "Sequences removed because after trimming they were longer than the maximum length cutoff of $max_length bp:\t$too_long ($percentage_too_long%)\n";
1258      print REPORT "Sequences removed because after trimming they were longer than the maximum length cutoff of $max_length bp:\t$too_long ($percentage_too_long%)\n";
1259      }
1260  }
1261
1262  if ($rrbs){
1263      my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100);
1264      warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
1265      print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
1266  }
1267
1268  if ($non_directional){
1269      my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100);
1270      warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
1271      print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
1272  }
1273
1274  warn "\n";
1275  print REPORT "\n";
1276
1277    ### RUNNING FASTQC unless we are dealing with paired-end files
1278    unless($validate){
1279
1280        # File renaming requested in Issue #17 (https://github.com/FelixKrueger/TrimGalore/issues/17)
1281        ### FILE RENAMING
1282        if ($basename){
1283            warn "Now renaming the output file $output_filename\n\n";
1284            my $tempname = "${basename}$1" if ($output_filename =~ /(_trimmed.*)$/);
1285            warn "ORIGINAL FILE 1: >>$output_filename<<\tRENAMING TO:>>$tempname<<\n";
1286
1287            rename "${output_dir}$output_filename", "${output_dir}$tempname";
1288            $output_filename = $tempname;
1289        }
1290
1291        if ($fastqc){
1292            warn "\n  >>> Now running FastQC on the data <<<\n\n";
1293            # sleep (1);
1294            if ($fastqc_args){
1295                system ("$path_to_fastqc $fastqc_args $output_dir$output_filename");
1296            }
1297            else{
1298                system ("$path_to_fastqc $output_dir$output_filename");
1299            }
1300        }
1301    }
1302
1303    ### VALIDATE PAIRED-END FILES
1304    if ($validate){
1305        ### Figure out whether current file counts as read 1 or read 2 of paired-end files
1306
1307        if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
1308            $file_1 = $output_filename;
1309            shift @filenames;
1310            # warn "This is read 1: $file_1\n\n";
1311        }
1312        else{                            # this is read 2 of a pair
1313            $file_2 = $output_filename;
1314            shift @filenames;
1315            # warn "This is read 2: $file_2\n\n";
1316        }
1317
1318        if ($file_1 and $file_2){
1319            warn "Validate paired-end files $file_1 and $file_2\n";
1320
1321            # File renaming requested in Issue #17 (https://github.com/FelixKrueger/TrimGalore/issues/17)
1322            ### FILE RENAMING
1323            if ($basename){
1324                warn "Now renaming the output files\n\n";
1325                my $tempname_1 = "${basename}_R1$1" if ($file_1 =~ /(_trimmed.*)$/);
1326                my $tempname_2 = "${basename}_R2$1" if ($file_2 =~ /(_trimmed.*)$/);
1327                warn "ORIGINAL FILE 1: >>$file_1<<\tRENAMING TO:>>$tempname_1<<\n";
1328                warn "ORIGINAL FILE 2: >>$file_2<<\tRENAMING TO:>>$tempname_2<<\n";
1329
1330                rename "${output_dir}${file_1}","${output_dir}$tempname_1";
1331                rename "${output_dir}${file_2}","${output_dir}$tempname_2";
1332                $file_1 = $tempname_1;
1333                $file_2 = $tempname_2;
1334
1335            }
1336            # sleep (1);
1337
1338        my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2);
1339
1340        ### RUNNING FASTQC
1341        if ($fastqc){
1342
1343            warn "\n  >>> Now running FastQC on the validated data $val_1<<<\n\n";
1344
1345        if ($fastqc_args){
1346            system ("$path_to_fastqc $fastqc_args $output_dir$val_1");
1347        }
1348        else{
1349            system ("$path_to_fastqc $output_dir$val_1");
1350        }
1351
1352        warn "\n  >>> Now running FastQC on the validated data $val_2<<<\n\n";
1353        #sleep (3);
1354
1355        if ($fastqc_args){
1356        system ("$path_to_fastqc $fastqc_args $output_dir$val_2");
1357        }
1358        else{
1359        system ("$path_to_fastqc $output_dir$val_2");
1360        }
1361
1362      }
1363
1364      warn "Deleting both intermediate output files $file_1 and $file_2\n";
1365      unlink "$output_dir$file_1";
1366      unlink "$output_dir$file_2";
1367
1368      warn "\n",'='x100,"\n\n";
1369      # sleep (1);
1370
1371      $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed
1372      $file_2 = undef;
1373      }
1374  }
1375  close REPORT or warn "Failed to close filehandle REPORT: $!\n";
1376}
1377
1378sub hardtrim_to_5prime_end{
1379
1380    my $filename = shift;
1381    warn "Input file name:  $filename\n";
1382
1383    my $in; # filehandle
1384    my $hardtrim_5; # filehandle
1385
1386    if ($filename =~ /gz$/){
1387    open ($in,"$compression_path -d -c $filename |") or die "Couldn't read from file $!";
1388    }
1389    else{
1390    open ($in,$filename) or die "Couldn't read from file $!";
1391    }
1392
1393    ### OUTPUT FILE
1394    my $output_filename = (split (/\//,$filename))[-1];
1395    # warn "Output file name: $output_filename\n";
1396
1397    if ($output_filename =~ /\.fastq$/){
1398    $output_filename =~ s/\.fastq$/.${hardtrim5}bp_5prime.fq/;
1399    }
1400    elsif ($output_filename =~ /\.fastq\.gz$/){
1401    $output_filename =~ s/\.fastq\.gz$/.${hardtrim5}bp_5prime.fq/;
1402    }
1403    elsif ($output_filename =~ /\.fq$/){
1404    $output_filename =~ s/\.fq$/.${hardtrim5}bp_5prime.fq/;
1405    }
1406    elsif ($output_filename =~ /\.fq\.gz$/){
1407    $output_filename =~ s/\.fq\.gz$/.${hardtrim5}bp_5prime.fq/;
1408    }
1409    else{
1410    $output_filename =~ s/$/.${hardtrim5}bp_5prime.fq/;
1411    }
1412
1413    if ($gzip or $filename =~ /\.gz$/){
1414    if ($dont_gzip){
1415        open ($hardtrim_5,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file
1416    }
1417    else{
1418        $output_filename .= '.gz';
1419        open ($hardtrim_5,"| $compression_path -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n";
1420    }
1421    }
1422    else{
1423    open ($hardtrim_5,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n";
1424    }
1425    warn "Writing trimmed version (using the first $hardtrim5 bp only) of the input file '$filename' to '$output_filename'\n";
1426
1427    my $count = 0;
1428
1429    while (1){
1430    my $identifier = <$in>;
1431    my $sequence = <$in>;
1432    my $identifier2 = <$in>;
1433    my $quality_score = <$in>;
1434
1435    last unless ($identifier and $sequence and $identifier2 and $quality_score);
1436    ++$count;
1437
1438    $sequence      =~ s/\r|\n//g; # cross-platform line ending trimming
1439    $quality_score =~ s/\r|\n//g;
1440           $identifier    =~ s/\r|\n//g;
1441    $identifier2   =~ s/\r|\n//g;
1442
1443    my $trimmed_sequence = substr($sequence,0,$hardtrim5);           # from the start to the $hardtrim5
1444    my $trimmed_quality_score = substr($quality_score,0,$hardtrim5); # from the start to the $hardtrim5
1445
1446    print {$hardtrim_5} join ("\n",$identifier,$trimmed_sequence,$identifier2,$trimmed_quality_score),"\n";
1447
1448    }
1449    close $in or warn "Failed to close filehandle for $filename";
1450    close $hardtrim_5 or die "Failed to close out-filehandle hardtrim_5: $!";
1451
1452    warn "\nFinished writing out converted version of the FastQ file $filename ($count sequences in total)\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
1453
1454}
1455
1456sub hardtrim_to_3prime_end{
1457
1458    my $filename = shift;
1459    warn "Input file name:  $filename\n";
1460
1461    my $in;         # filehandle
1462    my $hardtrim_3; # filehandle
1463
1464    if ($filename =~ /gz$/){
1465    open ($in,"$compression_path -d -c $filename |") or die "Couldn't read from file $!";
1466    }
1467    else{
1468    open ($in,$filename) or die "Couldn't read from file $!";
1469    }
1470
1471    ### OUTPUT FILE
1472    my $output_filename = (split (/\//,$filename))[-1];
1473    # warn "Output file name: $output_filename\n";
1474
1475    if ($output_filename =~ /\.fastq$/){
1476    	$output_filename =~ s/\.fastq$/.${hardtrim3}bp_3prime.fq/;
1477    }
1478    elsif ($output_filename =~ /\.fastq\.gz$/){
1479    	$output_filename =~ s/\.fastq\.gz$/.${hardtrim3}bp_3prime.fq/;
1480    }
1481    elsif ($output_filename =~ /\.fq$/){
1482   		$output_filename =~ s/\.fq$/.${hardtrim3}bp_3prime.fq/;
1483    }
1484    elsif ($output_filename =~ /\.fq\.gz$/){
1485    	$output_filename =~ s/\.fq\.gz$/.${hardtrim3}bp_3prime.fq/;
1486    }
1487    else{
1488    	$output_filename =~ s/$/.${hardtrim3}bp_3prime.fq/;
1489    }
1490
1491    if ($gzip or $filename =~ /\.gz$/){
1492    if ($dont_gzip){
1493        open ($hardtrim_3,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file
1494    }
1495    else{
1496        $output_filename .= '.gz';
1497        open ($hardtrim_3,"| $compression_path -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n";
1498    }
1499    }
1500    else{
1501    open ($hardtrim_3,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n";
1502    }
1503    warn "Writing trimmed version (using the last $hardtrim3 bp only) of the input file '$filename' to '$output_filename'\n";
1504
1505    my $count = 0;
1506
1507    while (1){
1508    my $identifier = <$in>;
1509    my $sequence = <$in>;
1510    my $identifier2 = <$in>;
1511    my $quality_score = <$in>;
1512
1513    last unless ($identifier and $sequence and $identifier2 and $quality_score);
1514    ++$count;
1515
1516    $sequence      =~ s/\r|\n//g; # cross-platform line ending trimming
1517    $quality_score =~ s/\r|\n//g;
1518    $identifier    =~ s/\r|\n//g;
1519    $identifier2   =~ s/\r|\n//g;
1520
1521    my $trimmed_sequence = substr($sequence,-$hardtrim3,$hardtrim3);           # $hardtrim3 bp from the end
1522    my $trimmed_quality_score = substr($quality_score,-$hardtrim3,$hardtrim3); # $hardtrim3 bp from the end
1523
1524    print {$hardtrim_3} join ("\n",$identifier,$trimmed_sequence,$identifier2,$trimmed_quality_score),"\n";
1525
1526    }
1527    close $in or warn "Failed to close filehandle for $filename";
1528    close $hardtrim_3 or die "Failed to close out-filehandle hardtrim_3: $!";
1529
1530    warn "\nFinished writing out converted version of the FastQ file $filename ($count sequences in total)\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
1531
1532}
1533
1534
1535sub validate_paired_end_files{
1536
1537    my $file_1 = shift;
1538    my $file_2 = shift;
1539
1540    warn "file_1: $file_1, file_2: $file_2\n\n";
1541
1542
1543
1544  if ($file_1 =~ /\.gz$/){
1545      open (IN1,"$compression_path -d -c $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n";
1546  }
1547  else{
1548      open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n";
1549  }
1550
1551  if ($file_2 =~ /\.gz$/){
1552      open (IN2,"$compression_path -d -c $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n";
1553  }
1554  else{
1555      open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n";
1556  }
1557
1558  warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n";
1559
1560    my $out_1 = $file_1;
1561    my $out_2 = $file_2;
1562
1563    if ($out_1 =~ /gz$/){
1564        $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/;
1565    }
1566    else{
1567        $out_1 =~ s/trimmed\.fq$/val_1.fq/;
1568    }
1569
1570    if ($out_2 =~ /gz$/){
1571        $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/;
1572    }
1573    else{
1574        $out_2 =~ s/trimmed\.fq$/val_2.fq/;
1575    }
1576
1577    if ($basename){
1578        ### FILE RENAMING
1579        warn "Renaming the output files (AGAIN).\n";
1580        # warn "ORIGINAL FILE 1: >>$out_1<<\n";
1581        $out_1 =~ s/_R1_val_1.fq$/_val_1.fq/;
1582        # warn "ORIGINAL FILE 1: >>$out_1<<\n";
1583
1584        # warn "ORIGINAL FILE 2: >>$out_2<<\n";
1585        $out_2 =~ s/_R2_val_2.fq$/_val_2.fq/;
1586        # warn "ORIGINAL FILE 2: >>$out_2<<\n";
1587    }
1588
1589  if ($gzip){
1590      if ($dont_gzip){
1591      open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
1592      }
1593      else{
1594      $out_1 .= '.gz';
1595      open (R1,"| $compression_path -c - > ${output_dir}${out_1}") or die "Can't write to $out_1: $!\n";
1596      }
1597  }
1598  else{
1599      open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
1600  }
1601
1602  if ($gzip){
1603      if ($dont_gzip){
1604      open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
1605    }
1606      else{
1607      $out_2 .= '.gz';
1608      open (R2,"| $compression_path -c - > ${output_dir}${out_2}") or die "Can't write to $out_2: $!\n";
1609      }
1610  }
1611  else{
1612      open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
1613  }
1614
1615  warn "Writing validated paired-end Read 1 reads to $out_1\n";
1616  warn "Writing validated paired-end Read 2 reads to $out_2\n\n";
1617
1618  my $unpaired_1;
1619  my $unpaired_2;
1620
1621  if ($retain){
1622
1623      $unpaired_1 = $file_1;
1624      $unpaired_2 = $file_2;
1625
1626      if ($unpaired_1 =~ /gz$/){
1627      $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/;
1628      }
1629      else{
1630      $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/;
1631      }
1632
1633      if ($unpaired_2 =~ /gz$/){
1634      $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/;
1635      }
1636      else{
1637      $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/;
1638      }
1639
1640      if ($gzip){
1641      if ($dont_gzip){
1642          open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
1643      }
1644      else{
1645          $unpaired_1 .= '.gz';
1646          open (UNPAIRED1,"| $compression_path -c - > ${output_dir}${unpaired_1}") or die "Can't write to $unpaired_1: $!\n";
1647      }
1648      }
1649      else{
1650      open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
1651      }
1652
1653      if ($gzip){
1654      if ($dont_gzip){
1655          open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
1656      }
1657      else{
1658          $unpaired_2 .= '.gz';
1659          open (UNPAIRED2,"| $compression_path -c - > ${output_dir}${unpaired_2}") or die "Can't write to $unpaired_2: $!\n";
1660      }
1661      }
1662      else{
1663      open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
1664      }
1665
1666      warn "Writing unpaired read 1 reads to $unpaired_1\n";
1667      warn "Writing unpaired read 2 reads to $unpaired_2\n\n";
1668  }
1669
1670  my $sequence_pairs_removed = 0;
1671  my $too_many_N_pairs = 0;
1672  my $read_1_printed = 0;
1673  my $read_2_printed = 0;
1674
1675  my $count = 0;
1676
1677  while (1){
1678      my $id_1   = <IN1>;
1679      my $seq_1  = <IN1>;
1680      my $l3_1   = <IN1>;
1681      my $qual_1 = <IN1>;
1682
1683      my $id_2   = <IN2>;
1684      my $seq_2  = <IN2>;
1685      my $l3_2   = <IN2>;
1686      my $qual_2 = <IN2>;
1687
1688      if ($id_1 and $seq_1 and $l3_1 and $qual_1 and $id_2 and $seq_2 and $l3_2 and $qual_2){
1689      # all good, got a read from both files
1690      }
1691      elsif( !($id_1 and $seq_1 and $l3_1 and $qual_1) and ($id_2 and $seq_2 and $l3_2 and $qual_2)){
1692      die "Read 1 output is truncated at sequence count: $count, please check your paired-end input files! Terminating...\n\n";
1693      }
1694      elsif( !($id_2 and $seq_2 and $l3_2 and $qual_2) and ($id_1 and $seq_1 and $l3_1 and $qual_1)){
1695      die "Read 2 output is truncated at sequence count: $count, please check your paired-end input files! Terminating...\n\n";
1696      }
1697      else{
1698      last unless ($id_1 and $seq_1 and $l3_1 and $qual_1);
1699      last unless ($id_2 and $seq_2 and $l3_2 and $qual_2);
1700      }
1701      ++$count;
1702
1703      ## small check if the sequence files appear to be FastQ files
1704      if ($count == 1){ # performed just once
1705      if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){
1706          die "Input file doesn't seem to be in FastQ format at sequence $count\n";
1707      }
1708      if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){
1709          die "Input file doesn't seem to be in FastQ format at sequence $count\n";
1710      }
1711      }
1712
1713      chomp $seq_1;
1714      chomp $seq_2;
1715      chomp $qual_1;
1716      chomp $qual_2;
1717
1718      if ($clip_r1){
1719      if (length $seq_1 > $clip_r1){ # sequences that are already too short won't be trimmed again
1720          $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
1721          $qual_1 = substr($qual_1,$clip_r1);
1722      }
1723      }
1724      if ($clip_r2){
1725      if (length $seq_2 > $clip_r2){ # sequences that are already too short won't be trimmed again
1726          $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence
1727          $qual_2 = substr($qual_2,$clip_r2);
1728      }
1729      }
1730
1731      if ($three_prime_clip_r1){
1732      if (length $seq_1 > $three_prime_clip_r1){  # sequences that are already too short won't be clipped again
1733          $seq_1 = substr($seq_1,0,(length($seq_1) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
1734          $qual_1 = substr($qual_1,0,(length($qual_1) - $three_prime_clip_r1));
1735      }
1736      }
1737      if ($three_prime_clip_r2){
1738      if (length $seq_2 > $three_prime_clip_r2){  # sequences that are already too short won't be clipped again
1739          $seq_2 = substr($seq_2,0,(length($seq_2) - $three_prime_clip_r2)); # starting after the sequences to be trimmed until the end of the sequence
1740          $qual_2 = substr($qual_2,0,(length($qual_2) - $three_prime_clip_r2));
1741      }
1742      }
1743
1744      if (defined $maxn){
1745
1746      # Read 1
1747      my $n_count = Ncounter($seq_1);
1748      # warn "Checking Read 1 for Ns: Found $n_count\n";
1749      if ($n_count > $maxn){
1750          ++$too_many_N_pairs;
1751          ++$sequence_pairs_removed;
1752          next; # bailing straight away
1753      }
1754
1755      # Read 2
1756      $n_count = Ncounter($seq_2);
1757      # warn "Checking Read 2 for Ns: Found $n_count\n";
1758      if ($n_count > $maxn){
1759          ++$too_many_N_pairs;
1760          ++$sequence_pairs_removed;
1761          next;
1762      }
1763      }
1764
1765      ### making sure that the reads do have a sensible length
1766      if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
1767      ++$sequence_pairs_removed;
1768      if ($retain){ # writing out single-end reads if they are longer than the cutoff
1769
1770          if ( length($seq_1) >= $length_read_1){ # read 1 is long enough
1771          print UNPAIRED1 $id_1;
1772          print UNPAIRED1 "$seq_1\n";
1773          print UNPAIRED1 $l3_1;
1774          print UNPAIRED1 "$qual_1\n";
1775          ++$read_1_printed;
1776          }
1777
1778          if ( length($seq_2) >= $length_read_2){ # read 2 is long enough
1779          print UNPAIRED2 $id_2;
1780          print UNPAIRED2 "$seq_2\n";
1781          print UNPAIRED2 $l3_2;
1782          print UNPAIRED2 "$qual_2\n";
1783          ++$read_2_printed;
1784          }
1785
1786      }
1787      }
1788      else{
1789      print R1 $id_1;
1790      print R1 "$seq_1\n";
1791      print R1 $l3_1;
1792      print R1 "$qual_1\n";
1793
1794      print R2 $id_2;
1795      print R2 "$seq_2\n";
1796      print R2 $l3_2;
1797      print R2 "$qual_2\n";
1798      }
1799
1800  }
1801
1802
1803  my $percentage;
1804  my $percentage_Ns;
1805
1806  if ($count){
1807    $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100);
1808    $percentage_Ns = sprintf("%.2f",$too_many_N_pairs/$count*100);
1809  }
1810  else{
1811      $percentage = 'N/A';
1812      $percentage_Ns = 'N/A';
1813  }
1814
1815  warn "Total number of sequences analysed: $count\n\n";
1816  warn "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
1817  if (defined $maxn){
1818      warn "Number of sequence pairs removed because at least one read contained more N(s) than the specified limit of $maxn: $too_many_N_pairs ($percentage_Ns%)\n";
1819      print REPORT "Number of sequence pairs removed because at least one read contained more N(s) than the specified limit of $maxn: $too_many_N_pairs ($percentage_Ns%)\n";
1820  }
1821
1822  print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n";
1823  print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
1824
1825  if ($keep){
1826      warn "Number of unpaired read 1 reads printed: $read_1_printed\n";
1827      warn "Number of unpaired read 2 reads printed: $read_2_printed\n";
1828  }
1829
1830  close R1 or die $!;
1831  close R2 or die $!;
1832
1833  if ($retain){
1834      close UNPAIRED1 or die $!;
1835      close UNPAIRED2 or die $!;
1836  }
1837
1838  warn "\n";
1839  if ($retain){
1840      return ($out_1,$out_2,$unpaired_1,$unpaired_2);
1841  }
1842  else{
1843      return ($out_1,$out_2);
1844  }
1845}
1846
1847
1848sub file_sanity_check{
1849
1850  my $file = shift;
1851  if ($file =~ /gz$/){
1852      open (SANITY,"$compression_path -d -c $file |") or die "Failed to read from file '$file' to perform sanity check: $!\n";
1853  }
1854  else{
1855      open (SANITY,$file) or die "Failed to read from file '$file' to perform sanity check: $!\n";
1856  }
1857
1858  # just processing a single FastQ entry
1859  my $id    = <SANITY>;
1860  my $seq   = <SANITY>;
1861  my $three = <SANITY>;
1862  my $qual  = <SANITY>;
1863
1864  unless ($id and $seq and $three and $qual){
1865      die "Input file '$file' seems to be completely empty. Consider respecifying!\n\n";
1866  }
1867
1868  chomp $seq;
1869
1870  # testing if the file is a colorspace file in which case we bail
1871  if ($seq =~ /\d+/){
1872      die "File seems to be in SOLiD colorspace format which is not supported by Trim Galore (sequence is: '$seq')! Please use Cutadapt on colorspace files separately and check its documentation!\n\n";
1873  }
1874
1875  close SANITY;
1876
1877}
1878
1879
1880
1881### ADAPTER AUTO-DETECTION
1882
1883sub autodetect_adapter_type{
1884  warn "\n\nAUTO-DETECTING ADAPTER TYPE\n===========================\n";
1885  warn "Attempting to auto-detect adapter type from the first 1 million sequences of the first file (>> $ARGV[0] <<)\n\n";
1886
1887  if ($ARGV[0] =~ /gz$/){
1888    open (AUTODETECT,"$compression_path -d -c $ARGV[0] |") or die "Failed to read from file $ARGV[0]\n";
1889  }
1890  else{
1891    open (AUTODETECT,$ARGV[0]) or die "Failed to read from file $ARGV[0]\n";
1892  }
1893
1894  my %adapters;
1895
1896  $adapters{'Illumina'} -> {seq}  = 'AGATCGGAAGAGC';
1897  $adapters{'Illumina'} -> {count}= 0;
1898  $adapters{'Illumina'} -> {name}= 'Illumina TruSeq, Sanger iPCR; auto-detected';
1899
1900  $adapters{'Nextera'}  -> {seq}  = 'CTGTCTCTTATA';
1901  $adapters{'Nextera'}  -> {count}= 0;
1902  $adapters{'Nextera'}  -> {name}= 'Nextera Transposase sequence; auto-detected';
1903
1904  $adapters{'smallRNA'} -> {seq}  = 'TGGAATTCTCGG';
1905  $adapters{'smallRNA'} -> {count}= 0;
1906  $adapters{'smallRNA'} -> {name}= 'Illumina small RNA adapter; auto-detected';
1907
1908
1909  # we will read the first 1 million sequences, or until the end of the file whatever comes first, and then use the adapter that for trimming which was found to occcur most often
1910  my $count = 0;
1911  while (1){
1912
1913    my $line1 = <AUTODETECT>;
1914    my $line2 = <AUTODETECT>;
1915    my $line3 = <AUTODETECT>;
1916    my $line4 = <AUTODETECT>;
1917    last unless ($line4);
1918    $count++;
1919    last if ($count == 1000000);
1920
1921    chomp $line2;
1922    $adapters{'Illumina'}->{count}++ unless (index($line2,'AGATCGGAAGAGC')== -1);
1923    $adapters{'Nextera'} ->{count}++ unless (index($line2,'CTGTCTCTTATA') == -1);
1924    $adapters{'smallRNA'}->{count}++ unless (index($line2,'TGGAATTCTCGG') == -1);
1925
1926  }
1927
1928  my $highest;
1929  my $third;
1930  my $second;
1931  my $seq;
1932  my $adapter_name;
1933  my $report_message;
1934
1935  	warn "Found perfect matches for the following adapter sequences:\nAdapter type\tCount\tSequence\tSequences analysed\tPercentage\n";
1936  	foreach my $adapter (sort {$adapters{$b}->{count}<=>$adapters{$a}->{count}} keys %adapters){
1937
1938    	my $percentage = sprintf("%.2f",$adapters{$adapter}->{count}/$count*100);
1939
1940    	warn "$adapter\t$adapters{$adapter}->{count}\t$adapters{$adapter}->{seq}\t$count\t$percentage\n";
1941
1942	    unless (defined $highest){
1943		    $highest = $adapter;
1944		    $seq = $adapters{$adapter}->{seq};
1945	      	$adapter_name = $adapters{$adapter}->{name};
1946	      	next;
1947	    }
1948	    unless (defined $second){
1949	      	$second = $adapter;
1950	      	next;
1951	    }
1952 		unless (defined $third){
1953	      	$third = $adapter;
1954	      	next;
1955	    }
1956  }
1957
1958
1959  	if ($adapters{$highest}->{count} == $adapters{$second}->{count} and $adapters{$highest}->{count} == $adapters{$third}->{count}){
1960    	warn "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count}, count $third: $adapters{$third}->{count})\n";
1961    	$report_message .= "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count}, count $third: $adapters{$third}->{count})\n";
1962
1963  		if (defined $consider_already_trimmed){
1964  			if ($adapters{$highest}->{count} <= $consider_already_trimmed ){
1965  				warn "No auto-detected adapter sequence exceeded the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n";
1966  				$report_message .= "No auto-detected adapter sequence exceeded the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n";
1967  				$adapter_name = 'No adapter trimming [suppressed by user]';
1968  				$seq = 'X';
1969			}
1970			else{
1971  				warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n";
1972  				$report_message .= "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n";
1973  				$adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)';
1974  				$seq = 'AGATCGGAAGAGC';
1975    		}
1976	  	}
1977  		else{
1978  			warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n";
1979  			$report_message .= "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n";
1980  			$adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)';
1981  			$seq = 'AGATCGGAAGAGC';
1982    	}
1983    }
1984    elsif ($adapters{$highest}->{count} == $adapters{$second}->{count} ){
1985		warn "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count}, count $third: $adapters{$third}->{count})\n";
1986    	$report_message .= "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count}, count $third: $adapters{$third}->{count})\n";
1987
1988    	if (defined $consider_already_trimmed){
1989  			if ($adapters{$highest}->{count} <= $consider_already_trimmed ){
1990  				warn "The highest auto-detected adapter sequence did not exceed the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n";
1991  				$report_message .= "The highest auto-detected adapter sequence did not exceed the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n";
1992  				$adapter_name = 'No adapter trimming [suppressed by user]';
1993  				$seq = 'X';
1994			}
1995			else{
1996				# If one of the highest contaminants was the Illumina adapter, we set that one and print a warning
1997		    	if ( ($highest eq 'Illumina') or ($second eq 'Illumina')) {
1998		    		warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n";
1999		      		$report_message .= "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior.\n";
2000		      		$adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)';
2001		      		$seq = 'AGATCGGAAGAGC';
2002		    	}
2003		    	else{
2004		    		warn "Defaulting to Nextera adapter as next best option ( CTGTCTCTTATA ). Specify -a SEQUENCE to avoid this behavior).\n";
2005		      		$report_message .= "Defaulting to Nextera adapter as next best option ( CTGTCTCTTATA ). Specify -a SEQUENCE to avoid this behavior.\n";
2006		      		$adapter_name = 'Nextera; (assigned because of inconclusive auto-detection)';
2007		      		$seq = 'CTGTCTCTTATA';
2008		      	}
2009		    }
2010		}
2011		else{
2012			# If one of the highest contaminants was the Illumina adapter, we set that one and print a warning
2013	    	if ( ($highest eq 'Illumina') or ($second eq 'Illumina')) {
2014	    		warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n";
2015	      		$report_message .= "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior.\n";
2016	      		$adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)';
2017	      		$seq = 'AGATCGGAAGAGC';
2018	    	}
2019	    	else{
2020	    		warn "Defaulting to Nextera adapter as next best option ( CTGTCTCTTATA ). Specify -a SEQUENCE to avoid this behavior).\n";
2021	      		$report_message .= "Defaulting to Nextera adapter as next best option ( CTGTCTCTTATA ). Specify -a SEQUENCE to avoid this behavior.\n";
2022	      		$adapter_name = 'Nextera; (assigned because of inconclusive auto-detection)';
2023	      		$seq = 'CTGTCTCTTATA';
2024	      	}
2025	    }
2026  	}
2027  	else{
2028  		if (defined $consider_already_trimmed){
2029  			if ($adapters{$highest}->{count} <= $consider_already_trimmed ){
2030  				warn "The highest auto-detected adapter sequence did not exceed the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n";
2031  				$report_message .= "The highest auto-detected adapter sequence did not exceed the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n";
2032  				$adapter_name = 'No adapter trimming [suppressed by user]';
2033  				$seq = 'X';
2034			}
2035			else{
2036				# using the highest occurrence as adapter to look out for
2037    			$report_message .= "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n";
2038    			warn "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n";
2039  			}
2040  		}
2041  		else{
2042			# using the highest occurrence as adapter to look out for
2043    		$report_message .= "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n";
2044    		warn "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n";
2045  		}
2046  	}
2047
2048  close AUTODETECT;
2049
2050  return ($seq,$adapter_name,$report_message);
2051
2052}
2053
2054sub autodetect_polyA_type{
2055
2056    warn "\n\nAUTO-DETECTING POLY-A TYPE\n===========================\n";
2057    warn "Attempting to auto-detect PolyA type from the first 1 million sequences of the first file (>> $ARGV[0] <<)\n\n";
2058
2059    if ($ARGV[0] =~ /gz$/){
2060    open (AUTODETECT,"$compression_path -d -c $ARGV[0] |") or die "Failed to read from file $ARGV[0]\n";
2061    }
2062    else{
2063    open (AUTODETECT,$ARGV[0]) or die "Failed to read from file $ARGV[0]\n";
2064    }
2065
2066  my %adapters;
2067
2068    $adapters{'PolyA'} -> {seq}  = 'AAAAAAAAAA';
2069    $adapters{'PolyA'} -> {count}= 0;
2070    $adapters{'PolyA'} -> {name}= 'Poly-A Read 1; auto-detected';
2071
2072    $adapters{'PolyT'}  -> {seq}  = 'TTTTTTTTTT';
2073    $adapters{'PolyT'}  -> {count}= 0;
2074    $adapters{'PolyT'}  -> {name}= 'Poly-T Read 1; auto-detected';
2075
2076
2077    # we will read the first 1 million sequences, or until the end of the file whatever comes first, and then use the adapter that for trimming which was found to occcur most often
2078    my $count = 0;
2079    while (1){
2080
2081    my $line1 = <AUTODETECT>;
2082    my $line2 = <AUTODETECT>;
2083    my $line3 = <AUTODETECT>;
2084    my $line4 = <AUTODETECT>;
2085    last unless ($line4);
2086    $count++;
2087    last if ($count == 1000000);
2088
2089    chomp $line2;
2090    $adapters{'PolyA'}->{count}++ unless (index($line2,'AAAAAAAAAA')== -1);
2091    $adapters{'PolyT'} ->{count}++ unless (index($line2,'TTTTTTTTTT') == -1);
2092
2093    }
2094
2095    my $highest;
2096    my $second;
2097    my $seq;
2098    my $adapter_name;
2099
2100    warn "Found perfect matches for the following mono-polymer sequences:\nPoly-nucleotide type\tCount\tSequence\tSequences analysed\tPercentage\n";
2101    foreach my $adapter (sort {$adapters{$b}->{count}<=>$adapters{$a}->{count}} keys %adapters){
2102
2103    my $percentage = sprintf("%.2f",$adapters{$adapter}->{count}/$count*100);
2104
2105    warn "$adapter\t$adapters{$adapter}->{count}\t$adapters{$adapter}->{seq}\t$count\t$percentage\n";
2106
2107    unless (defined $highest){
2108        $highest = $adapter;
2109        $seq = $adapters{$adapter}->{seq};
2110        $adapter_name = $adapters{$adapter}->{name};
2111        next;
2112    }
2113    unless (defined $second){
2114        $second = $adapter;
2115    }
2116    }
2117
2118
2119    # using the highest occurrence as adapter to look out for
2120    if ($adapters{$highest}->{count} == $adapters{$second}->{count}){
2121    warn "Unable to auto-detect most prominent mono-polymer from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count})\n";
2122
2123    if ($adapters{$highest}->{count} == 0){
2124        warn "Defaulting to Poly-A. Specify -a SEQUENCE to avoid this behavior).\n\n";
2125        $adapter_name = 'Poly-A (inconclusive auto-detection)';
2126        $seq = extend_adapter_sequence ('A',20);
2127    }
2128    else{
2129        warn "Using $highest poly-monomer for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n";
2130    }
2131    }
2132    else{
2133    warn "Using $highest Polymer for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n";
2134    }
2135
2136    close AUTODETECT;
2137
2138    return ($seq,$adapter_name);
2139
2140}
2141
2142
2143###########################################################################
2144
2145sub process_commandline{
2146  my $help;
2147  my $quality;
2148  my $adapter;
2149  my $adapter2;
2150  my $stringency;
2151  my $report;
2152  my $version;
2153  my $rrbs;
2154  my $length_cutoff;
2155  my $keep;
2156  my $fastqc;
2157  my $non_directional;
2158  my $phred33;
2159  my $phred64;
2160  my $fastqc_args;
2161  my $trim;
2162  my $gzip;
2163  my $validate;
2164  my $retain;
2165  my $length_read_1;
2166  my $length_read_2;
2167  my $error_rate;
2168  my $output_dir;
2169  my $no_report_file;
2170  my $suppress_warn;
2171  my $dont_gzip;
2172  my $clip_r1;
2173  my $clip_r2;
2174  my $three_prime_clip_r1;
2175  my $three_prime_clip_r2;
2176  my $nextera;
2177  my $small_rna;
2178  my $illumina;
2179  my $path_to_cutadapt;
2180  my $max_length;
2181  my $maxn;
2182    my $trimn;
2183    my $hardtrim5;
2184    my $hardtrim3;
2185    my $clock;
2186    my $polyA;
2187    my $nextseq;
2188    my $basename;
2189    my $cores;
2190    my $compression_path;
2191    my $consider_already_trimmed;
2192
2193    my $command_line = GetOptions ('help|man' => \$help,
2194                 'q|quality=i' => \$quality,
2195                 'a|adapter=s' => \$adapter,
2196                 'a2|adapter2=s' => \$adapter2,
2197                 'report' => \$report,
2198                 'version' => \$version,
2199                 'stringency=i' => \$stringency,
2200                 'fastqc' => \$fastqc,
2201                 'RRBS' => \$rrbs,
2202                 'keep' => \$keep,
2203                 'length=i' => \$length_cutoff,
2204                 'non_directional' => \$non_directional,
2205                 'phred33' => \$phred33,
2206                 'phred64' => \$phred64,
2207                 'fastqc_args=s' => \$fastqc_args,
2208                 'trim1' => \$trim,
2209                 'gzip' => \$gzip,
2210                 'paired_end' => \$validate,
2211                 'retain_unpaired' => \$retain,
2212                 'length_1|r1=i' => \$length_read_1,
2213                 'length_2|r2=i' => \$length_read_2,
2214                 'e|error_rate=s' => \$error_rate,
2215                 'o|output_dir=s' => \$output_dir,
2216                 'no_report_file' => \$no_report_file,
2217                 'suppress_warn' => \$suppress_warn,
2218                 'dont_gzip' => \$dont_gzip,
2219                 'clip_R1=i' => \$clip_r1,
2220                 'clip_R2=i' => \$clip_r2,
2221                 'three_prime_clip_R1=i' => \$three_prime_clip_r1,
2222                 'three_prime_clip_R2=i' => \$three_prime_clip_r2,
2223                 'illumina' => \$illumina,
2224                 'nextera' => \$nextera,
2225                 'small_rna' => \$small_rna,
2226                 'path_to_cutadapt=s' => \$path_to_cutadapt,
2227                 'max_length=i' => \$max_length,
2228                 'max_n=i'      => \$maxn,
2229                 'trim-n'      => \$trimn,
2230                 'hardtrim5=i'      => \$hardtrim5,
2231                 'hardtrim3=i'      => \$hardtrim3,
2232                 'clock|casio|breitling' => \$clock,
2233                 'polyA' => \$polyA,
2234                 '2colour|nextseq=i' => \$nextseq,
2235                 'basename=s' => \$basename,
2236                 'j|cores=i' => \$cores,
2237                 'consider_already_trimmed=i' => \$consider_already_trimmed,
2238      );
2239
2240	### EXIT ON ERROR if there were errors with any of the supplied options
2241	unless ($command_line){
2242		die "Please respecify command line options\n";
2243	}
2244
2245	### HELPFILE
2246	if ($help){
2247		print_helpfile();
2248		exit;
2249	}
2250
2251
2252
2253
2254
2255  if ($version){
2256    print << "VERSION";
2257
2258                        Quality-/Adapter-/RRBS-/Speciality-Trimming
2259                                [powered by Cutadapt]
2260                                  version $trimmer_version
2261
2262                               Last update: 24 09 2019
2263
2264VERSION
2265    exit;
2266  }
2267
2268    # testing whether the filenames contain white space. This can only ever be the case if passed within "quotes"
2269    foreach my $filename (@ARGV){
2270        if ($filename =~ /\s+/){
2271            die "\n[FATAL ERROR]: Input file names ('$filename') supplied with whitespace(s). Please move to directory containing the input file(s),
2272               and/or avoid using whitespace(s) at all costs (e.g. consider using '_' instead), and try again.\n\n";
2273        }
2274    }
2275
2276	# NUMBER OF CORES
2277	if (defined $cores){
2278		if ($cores < 1){
2279			die "Please a use a positive integer for the number of cores to be used, and re-specify!\n\n";
2280		}
2281		elsif($cores == 1){
2282			warn "Proceeding with single-core trimming (user-defined)\n";
2283		}
2284		elsif($cores >= 8){
2285			warn "Using an excessive number of cores has a diminishing return! It is recommended not to exceed 8 cores per trimming process (you asked for $cores cores). Please consider re-specifying\n"; sleep(2);
2286		}
2287	}
2288	else {
2289		warn "Multicore support not enabled. Proceeding with single-core trimming.\n";
2290		$compression_path = "gzip";
2291		$cores = 1;
2292	}
2293
2294
2295	# Before we start let's have quick look if Cutadapt seems to be working with the path information provided
2296	# To change the path to Cutadapt use --path_to_cutadapt /full/path/to/the/Cutadapt/executable
2297
2298	if(defined $path_to_cutadapt){
2299		warn "Path to Cutadapt set as: '$path_to_cutadapt' (user defined)\n";
2300		# we'll simply use this
2301	}
2302	else{
2303		$path_to_cutadapt = 'cutadapt'; # default, assuming it is in the PATH
2304		warn "Path to Cutadapt set as: '$path_to_cutadapt' (default)\n";
2305	}
2306
2307
2308	my $return = system "$path_to_cutadapt --version 2>&1 > /dev/null ";
2309	if ($return == 0){
2310		warn "Cutadapt seems to be working fine (tested command '$path_to_cutadapt --version')\n";
2311		$cutadapt_version = `$path_to_cutadapt --version`;
2312		chomp $cutadapt_version;
2313		warn "Cutadapt version: $cutadapt_version\n";
2314	}
2315	else{
2316		die "Failed to execute Cutadapt porperly. Please install Cutadapt first and make sure it is in the PATH, or specify the path to the Cutadapt executable using --path_to_cutadapt /path/to/cutadapt\n\n";
2317	}
2318
2319
2320
2321
2322	# We only need to test for pigz if the user asked for multi-core processing
2323	if ($cores > 1){
2324
2325		## Check Python  Version
2326		# warn "Let's also find out the Python version used. $path_to_cutadapt\n";
2327		my $location_of_cutadapt = `which $path_to_cutadapt`;
2328		#warn "Location is: $location_of_cutadapt\n";
2329
2330		# Reading the first line of the cutadapt executable, since this tends to contain the python version it is using
2331		open (my $first_line,$location_of_cutadapt) or die "Failed to read the first line of the Cutadapt executable: $!";
2332		my $shebang = <$first_line>;
2333		chomp $shebang;
2334		# warn "This is the first line:\n>>>$shebang<<<\n";
2335		close $first_line;
2336
2337		# the shebang line seems to contain a path to Python
2338		if ($shebang =~ /python/i){
2339			if ($shebang =~ /\#!/){
2340				# warn "Found a shebangline: >>>$_<<<\n";
2341				$shebang =~ s/\#!//;
2342				# warn "Truncated shebangline: >>>$_<<<\n";
2343			}
2344
2345			my $python_return = `$shebang --version 2>&1`;
2346			chomp $python_return;
2347			# warn "Python return: $python_return\n";
2348
2349			if ($python_return =~ /Python 3.*/){
2350				warn "Cutadapt seems to be using Python 3! Proceeding with multi-core enabled Cutadapt using $cores cores\n";
2351			}
2352			elsif ($python_return =~ /Python 2.*/){
2353				warn "Python 2 found, multi-core not supported. Proceeding with Cutadapt in single-core mode\n";
2354				$cores = 1;
2355			}
2356			else{
2357				die "No Python detected. Python required to run Cutadapt!\n\n";
2358			}
2359
2360			$python_version = $python_return;
2361			$python_version =~ s/^Python //;
2362		}
2363		else{
2364			# the shebang line doesn't seem to contain a path to Python. Instead, someone edited the Cutadapt executable to look differently.
2365			# An example for this is the latest version of Miniconda:
2366			### #!/bin/sh
2367			### '''exec' /long/path/to/conda/envs/deepsv/bin/python "$0" "$@"
2368			### ' '''
2369			### # -*- coding: utf-8 -*-
2370			### ...
2371
2372			### interestingly, Anacondo does not seem to do this
2373			warn "Could not detect version of Python used by Cutadapt from the first line of Cutadapt (but found this: >>>$shebang<<<)\n";
2374			$python_version = 'could not detect';
2375			warn "Letting the (modified) Cutadapt deal with the Python version instead\n";
2376		}
2377
2378		### only proceeding if $cores is still > 1, i.e. if Python 3 was found
2379		if ($cores > 1){
2380
2381			### Test if pigz is installed
2382			my $pigz_return = system ("pigz --version 2> /dev/null");
2383			# warn "PIGZ returned: $pigz_return\n";
2384			if ($pigz_return == 0) {
2385				warn "Parallel gzip (pigz) detected. Proceeding with multicore (de)compression using $cores cores\n\n";
2386				$compression_path = "pigz -p $cores";
2387			}
2388			else {
2389				warn "Proceeding with 'gzip' for compression. PLEASE NOTE: Using multi-cores for trimming with 'gzip' only has only very limited effect! (see here: https://github.com/FelixKrueger/TrimGalore/issues/16#issuecomment-458557103)\n";
2390				warn "To increase performance, please install 'pigz' and run again\n\n";
2391				$compression_path = "gzip";
2392			}
2393		}
2394	}
2395	else{
2396		warn "single-core operation.\n"
2397	}
2398	unless (defined $compression_path){
2399		$compression_path = "gzip"; # fall-back option
2400	}
2401
2402	### SUPRESS WARNINGS
2403	if (defined $suppress_warn){
2404		$DOWARN = 0;
2405	}
2406
2407	### QUALITY SCORES
2408	my $phred_encoding;
2409	if ($phred33){
2410		if ($phred64){
2411			die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n";
2412		}
2413		$phred_encoding = 33;
2414	}
2415	elsif ($phred64){
2416		$phred_encoding = 64;
2417	}
2418  unless ($phred33 or $phred64){
2419      unless (defined $hardtrim5 or $hardtrim3 or $clock){   # we don't need warnings if we simply hard-trim or Clock-trim a file
2420      warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n";
2421      }
2422      $phred_encoding = 33;
2423  }
2424
2425      ### ILLUMINA 2-COLOUR CHEMISTRY SPEICIFC HIGH QUALITY G-TRIMMING
2426      if (defined $nextseq){
2427          if (defined $quality){
2428              die "The options '-quality INT' and '--nextseq INT' are mutually exclusive. Please decide which trimming mode you would like to apply!\n\n";
2429          }
2430          #	warn "OK, let's deal with 2-colour issues\n\n";
2431        unless ( ($nextseq > 0) and $nextseq < 200){
2432            die "Please select a sensible value for 2-colour specific trimming (currently allowed range is between 1 and 200 Gs). Please respecify!\n";
2433        }
2434        $nextseq = "--nextseq-trim=$nextseq";
2435      }
2436      else{
2437          $nextseq = '';
2438      }
2439
2440	### NON-DIRECTIONAL RRBS
2441	if ($non_directional){
2442		unless ($rrbs){
2443			die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n";
2444		}
2445	}
2446	else{
2447		$non_directional = 0;
2448	}
2449
2450  if ($fastqc_args){
2451    $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed
2452  }
2453  else{
2454    $fastqc_args = 0;
2455  }
2456
2457  ### CUSTOM ERROR RATE
2458  if (defined $error_rate){
2459    # make sure that the error rate is between 0 and 1
2460    unless ($error_rate >= 0 and $error_rate <= 1){
2461      die "Please specify an error rate between 0 and 1 (the default is 0.1)\n";
2462    }
2463  }
2464  else{
2465    $error_rate = 0.1; # (default)
2466  }
2467
2468
2469  if ($nextera and $small_rna or $nextera and $illumina or $illumina and $small_rna ){
2470    die "You can't use several different adapter types at the same time. Make your choice or consider using -a and -a2\n\n";
2471  }
2472
2473  ### ENSURE USERS ARE USING THE POLY-A AUTODETECTION, OR SPECIFY SEQUENCES FOR BOTH -a AND -a2
2474  if ($polyA){
2475      if (defined $adapter){
2476      if ($validate){# paired-end
2477          unless (defined $adapter2){
2478          die "Please use either the PolyA auto-detection (defaults to -a \"A{20}\" -a2 \"T{150}\" (or the other way round)), or specify both -a and -a2. Now try again...\n\n";
2479          }
2480      }
2481      }
2482      if (defined $adapter2){
2483      if ($validate){# paired-end
2484          unless (defined $adapter){
2485          die "Please use either the PolyA auto-detection (defaults to -a \"A{20}\" -a2 \"T{150}\" (or the other way round)), or specify both -a and -a2. Now try again...\n\n";
2486          }
2487      }
2488      }
2489  }
2490
2491
2492  if (defined $adapter){
2493
2494      # The adapter may be given as a single base that occurs a number of times
2495      # in the form BASE{number of times}, e.g. "-a A{10}"
2496      if ($adapter =~ /^([ACTGN]){(\d+)}$/){
2497      # warn "Base: $1\n# repeats: $2\n";
2498
2499
2500      my $tmp_adapter = extend_adapter_sequence(uc$1,$2);
2501      warn "Adapter sequence given as >$adapter< expanded to: >$tmp_adapter<\n";
2502      $adapter = $tmp_adapter;
2503      }
2504      else{
2505      unless ($adapter =~ /^[ACTGNXactgnx]+$/){
2506          die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n";
2507      }
2508      }
2509      $adapter = uc$adapter;
2510
2511      if ($illumina){
2512      die "You can't supply an adapter sequence AND use the Illumina universal adapter sequence. Make your choice.\n\n";
2513      }
2514      if ($nextera){
2515      die "You can't supply an adapter sequence AND use the Nextera transposase adapter sequence. Make your choice.\n\n";
2516      }
2517      if ($small_rna){
2518      die "You can't supply an adapter sequence AND use the Illumina small RNA adapter sequence. Make your choice.\n\n";
2519      }
2520  }
2521
2522  if (defined $adapter2){
2523      unless ($validate){
2524      die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n";
2525      }
2526
2527      # The adapter may be given as a single base that occurs a number of times
2528      # in the form BASE{number of times}, e.g. "-a2 A{10}"
2529      if ($adapter2 =~ /^([ACTGN]){(\d+)}$/){
2530      # warn "Base: $1\n# repeats: $2\n";
2531      my $tmp_adapter2 = extend_adapter_sequence(uc$1,$2);
2532
2533      warn "Adapter2 sequence given as >$adapter2< expanded to: >$tmp_adapter2<\n";
2534      $adapter2 = $tmp_adapter2;
2535      }
2536      else{
2537      unless ($adapter2 =~ /^[ACTGNactgn]+$/){
2538          die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n";
2539      }
2540      }
2541      $adapter2 = uc$adapter2;
2542  }
2543
2544  ### LENGTH CUTOFF
2545  # this gets set right at the start after the adapter auto-detection has been concluded
2546
2547  ### MAXIMUM LENGTH CUTOFF - this is intended for smallRNA-libraries to remove sequences that are longer than a certain cutoff and thus problably not small RNA species
2548  if (defined $max_length){
2549    if ($validate){
2550      die "Maximum length filtering works currently only in single-end mode (which is more sensible for smallRNA-sequencing anyway...)\n\n";
2551    }
2552    warn "Maximum length cutoff set to >> $max_length bp <<; sequences longer than this threshold will be removed (only advised for smallRNA-trimming!)\n\n";
2553  }
2554  else{
2555    $max_length = 0;
2556  }
2557
2558  ### files are supposed to be paired-end files
2559  if ($validate){
2560
2561    # making sure that an even number of reads has been supplied
2562    unless ((scalar@ARGV)%2 == 0){
2563      die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n";
2564    }
2565
2566    ### Ensuring pairs of R1 and R2 are not the very same file
2567    my $index = 0;
2568    while ($index <= $#ARGV){
2569        # warn "File 1: $ARGV[$index]\n";
2570        # warn "File 2: $ARGV[$index+1]\n~~~~~~~~~~\n\n"; sleep(1);
2571        if ($ARGV[$index] eq $ARGV[$index+1]){
2572            die "[FATAL:] Read 1 ($ARGV[$index]) and Read 2 ($ARGV[$index+1]) files appear to be the very same file. This probably happened inadvertently, so please re-specify!\nExiting...\n\n";
2573        }
2574        $index += 2;
2575    }
2576
2577    ## CUTOFF FOR VALIDATED READ-PAIRS
2578    if (defined $length_read_1 or defined $length_read_2){
2579
2580      unless ($retain){
2581    die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n";
2582      }
2583
2584      if (defined $length_read_1){
2585    unless ($length_read_1 >= 15 and $length_read_1 <= 100){
2586      die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
2587    }
2588    unless ($length_read_1 > $length_cutoff){
2589      die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n";
2590    }
2591      }
2592
2593      if (defined $length_read_2){
2594    unless ($length_read_2 >= 15 and $length_read_2 <= 100){
2595      die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
2596    }
2597    unless ($length_read_2 > $length_cutoff){
2598      die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n";
2599    }
2600      }
2601    }
2602
2603    if ($retain){
2604      $length_read_1 = 35 unless (defined $length_read_1);
2605      $length_read_2 = 35 unless (defined $length_read_2);
2606    }
2607  }
2608
2609
2610    unless ($no_report_file){
2611        $no_report_file = 0;
2612    }
2613
2614    ### PARENT DIRECTORY
2615    my $parent_dir = getcwd();
2616    unless ($parent_dir =~ /\/$/){
2617        $parent_dir =~ s/$/\//;
2618    }
2619
2620    ### OUTPUT DIR PATH
2621    if (defined $output_dir){
2622
2623        if ($output_dir =~ /\s+/){
2624            die "\n[FATAL OPTION]: Output directory name (>>$output_dir<<) contained whitespace(s). Please replace whitespace(s) with '_' and try again.\n\n";
2625        }
2626
2627        unless ($output_dir eq ''){
2628            unless ($output_dir =~ /\/$/){
2629                $output_dir =~ s/$/\//;
2630            }
2631
2632            if (chdir $output_dir){
2633                $output_dir = getcwd(); #  making the path absolute
2634                unless ($output_dir =~ /\/$/){
2635                    $output_dir =~ s/$/\//;
2636                }
2637            }
2638            else{
2639                mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
2640                warn "Output directory $output_dir doesn't exist, creating it for you...\n\n"; sleep(1);
2641                chdir $output_dir or die "Failed to move to $output_dir\n";
2642                $output_dir = getcwd(); #  making the path absolute
2643                unless ($output_dir =~ /\/$/){
2644                        $output_dir =~ s/$/\//;
2645                }
2646            }
2647            warn "Output will be written into the directory: $output_dir\n";
2648        }
2649    }
2650    else{
2651        $output_dir = '';
2652    }
2653    # Changing back to parent directory
2654    chdir $parent_dir or die "Failed to move to $parent_dir\n";
2655
2656
2657  ### Trimming at the 5' end
2658  if (defined $clip_r2){ # trimming 5' bases of read 2
2659      die "Clipping the 5' end of read 2 is only allowed for paired-end files (--paired)\n" unless ($validate);
2660  }
2661
2662  if (defined $clip_r1){ # trimming 5' bases of read 1
2663      unless ($clip_r1 > 0 and $clip_r1 < 100){
2664      die "The 5' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n";
2665      }
2666  }
2667
2668  if (defined $clip_r2){ # trimming 5' bases of read 2
2669      unless ($clip_r2 > 0 and $clip_r2 < 100){
2670      die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n";
2671      }
2672  }
2673
2674  ### Trimming at the 3' end
2675  if (defined $three_prime_clip_r1){ # trimming 3' bases of read 1
2676      unless ($three_prime_clip_r1 > 0 and $three_prime_clip_r1 < 100){
2677      die "The 3' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n";
2678      }
2679  }
2680
2681  if (defined $three_prime_clip_r2){ # trimming 3' bases of read 2
2682      unless ($three_prime_clip_r2 > 0 and $three_prime_clip_r2 < 100){
2683      die "The 3' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n";
2684      }
2685  }
2686
2687
2688  if (defined $maxn){
2689      unless ($maxn >= 0 and $maxn <= 100){
2690      die "--max_n needs to be an integer between 0 and 100\nPlease respecify...\n\n";
2691      }
2692  }
2693
2694  my $trim_n;
2695  if ($trimn){
2696      $trim_n = '--trim-n';
2697  }
2698  else{
2699      $trim_n = '';
2700  }
2701
2702	### RRBS
2703	if ($rrbs){
2704		unless ($non_directional){ # only setting this for directional mode
2705			if ($validate){
2706				unless (defined $clip_r2){ # user specified R2 clipping overrides the default setting of --clip_r2 2  # added 07 Dec 2016
2707					warn "Setting the option '--clip_r2 2' (to remove methylation bias from the start of Read 2)\n"; # sleep(1);
2708					$clip_r2 = 2;
2709				}
2710			}
2711		}
2712	}
2713	else{
2714		$rrbs = 0;
2715	}
2716
2717  if (defined $hardtrim5){
2718      unless ($hardtrim5 > 0 and $hardtrim5 < 1000){
2719      die "Hard-trim from 3'-end selected: >$hardtrim5< bp. Please ensure a hard-trimming range between 1 and 999 bp. Please try again...\n\n~~~~~~~~~~~~~~~~~~~~~~\n"
2720      }
2721  }
2722  if (defined $hardtrim3){
2723      unless ($hardtrim3 > 0 and $hardtrim3 < 1000){
2724      die "Hard-trim from 5'-end selected: >$hardtrim3< bp. Please ensure a hard-trimming range between 1 and 999 bp. Please try again...\n\n~~~~~~~~~~~~~~~~~~~~~~\n"
2725      }
2726  }
2727
2728  ### The
2729  if ($clock){
2730      if ($validate){ # already selected as paired-end mode
2731      # Fine. Currently, the Clock protocol requires paired-end reads with dual UMIs
2732      }
2733      else{
2734      die "\nOption --clock selected, but the processing is still set to single-end mode. The current clock protocol requires paired-end sequencing with dual unique molecular identifiers (UMIs) though. Please respecify...\n\n";
2735      }
2736  }
2737
2738    if (defined $basename){
2739        warn "Using user-specified basename (>>$basename<<) instead of deriving the filename from the input file(s)\n";
2740
2741        if ($basename =~ /\//){
2742            die "Please make sure the name specified with --basename does not contain file path information! ($basename)";
2743        }
2744        $basename =~ s/[ !%\$\*&£]/_/g; # replacing weird symbols or spaces
2745
2746        if (scalar @ARGV > 2){
2747            die "[FATAL ERROR]: Number of files supplied can be 1 (single-end mode), or 2 (paired-end mode), but was: ",scalar @ARGV,". Please respecify!\n\n";
2748        }
2749        else{
2750            if (scalar @ARGV == 2){
2751                if ($validate){
2752                    # fine, this is paired-end
2753                }
2754                else{
2755                    die "[FATAL ERROR]: Number of files supplied was 2, but single-end mode was selected as well. Please respecify!\n\n";
2756                }
2757            }
2758        }
2759    }
2760
2761    if (defined $consider_already_trimmed){
2762    	# making sure the range is sensible
2763    	unless ($consider_already_trimmed >= 0 and $consider_already_trimmed <= 10000){
2764    		die "Please select a threshold for when a sample should not be adapter trimmed at all (--consider_already_trimmed) in the range of [0-1000] (inclusive)\n";
2765    	}
2766    	if ($nextera or $small_rna or $illumina){
2767    		die "The threshold for --consider_already_trimmed [INT] only works in conjunction with adapter auto-detection. Make your choice and try again\n\n";
2768  		}
2769  		warn "During the adapter auto-detection, any counts equal to or lower than >$consider_already_trimmed< will be considered as 'file was already adapter-trimmed'. Only quality trimming will be carried out (setting -a X)\n"; sleep(1);
2770    }
2771
2772    return ($compression_path,$cores,$quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2,$nextera,$small_rna,$path_to_cutadapt,$illumina,$max_length,$maxn,$trim_n,$hardtrim5,$clock,$polyA,$hardtrim3,$nextseq,$basename,$consider_already_trimmed);
2773}
2774
2775sub Ncounter{
2776  my $seq = shift;
2777  my $ncount = 0;
2778  while($seq =~ /N/g){
2779    ++$ncount;
2780  }
2781  return $ncount;
2782}
2783
2784sub extend_adapter_sequence{
2785    my ($letter,$number) = @_;
2786    return (${letter}x$number);
2787}
2788
2789
2790
2791sub print_helpfile{
2792  print << "HELP";
2793
2794 USAGE:
2795
2796trim_galore [options] <filename(s)>
2797
2798
2799-h/--help               Print this help message and exits.
2800
2801-v/--version            Print the version information and exits.
2802
2803-q/--quality <INT>      Trim low-quality ends from reads in addition to adapter removal. For
2804                        RRBS samples, quality trimming will be performed first, and adapter
2805                        trimming is carried in a second round. Other files are quality and adapter
2806                        trimmed in a single pass. The algorithm is the same as the one used by BWA
2807                        (Subtract INT from all qualities; compute partial sums from all indices
2808                        to the end of the sequence; cut sequence at the index at which the sum is
2809                        minimal). Default Phred score: 20.
2810
2811--phred33               Instructs Cutadapt to use ASCII+33 quality scores as Phred scores
2812                        (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON.
2813
2814--phred64               Instructs Cutadapt to use ASCII+64 quality scores as Phred scores
2815                        (Illumina 1.5 encoding) for quality trimming.
2816
2817--fastqc                Run FastQC in the default mode on the FastQ file once trimming is complete.
2818
2819--fastqc_args "<ARGS>"  Passes extra arguments to FastQC. If more than one argument is to be passed
2820                        to FastQC they must be in the form "arg1 arg2 etc.". An example would be:
2821                        --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will
2822                        automatically invoke FastQC, so --fastqc does not have to be specified
2823                        separately.
2824
2825-a/--adapter <STRING>   Adapter sequence to be trimmed. If not specified explicitly, Trim Galore will
2826                        try to auto-detect whether the Illumina universal, Nextera transposase or Illumina
2827                        small RNA adapter sequence was used. Also see '--illumina', '--nextera' and
2828                        '--small_rna'. If no adapter can be detected within the first 1 million sequences
2829                        of the first file specified or if there is a tie between several adapter sequences,
2830                        Trim Galore defaults to '--illumina' (as long as the Illumina adapter was one of the
2831                        options, else '--nextera' is the default). A single base
2832                        may also be given as e.g. -a A{10}, to be expanded to -a AAAAAAAAAA.
2833
2834-a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This
2835                        option requires '--paired' to be specified as well. If the libraries to be trimmed
2836                        are smallRNA then a2 will be set to the Illumina small RNA 5' adapter automatically
2837                        (GATCGTCGGACT). A single base may also be given as e.g. -a2 A{10}, to be expanded
2838                        to -a2 AAAAAAAAAA.
2839
2840--illumina              Adapter sequence to be trimmed is the first 13bp of the Illumina universal adapter
2841                        'AGATCGGAAGAGC' instead of the default auto-detection of adapter sequence.
2842
2843--nextera               Adapter sequence to be trimmed is the first 12bp of the Nextera adapter
2844                        'CTGTCTCTTATA' instead of the default auto-detection of adapter sequence.
2845
2846--small_rna             Adapter sequence to be trimmed is the first 12bp of the Illumina Small RNA 3' Adapter
2847                        'TGGAATTCTCGG' instead of the default auto-detection of adapter sequence. Selecting
2848                        to trim smallRNA adapters will also lower the --length value to 18bp. If the smallRNA
2849                        libraries are paired-end then a2 will be set to the Illumina small RNA 5' adapter
2850                        automatically (GATCGTCGGACT) unless -a 2 had been defined explicitly.
2851
2852--consider_already_trimmed <INT>     During adapter auto-detection, the limit set by <INT> allows the user to
2853                        set a threshold up to which the file is considered already adapter-trimmed. If no adapter
2854                        sequence exceeds this threshold, no additional adapter trimming will be performed (technically,
2855                        the adapter is set to '-a X'). Quality trimming is still performed as usual.
2856                        Default: NOT SELECTED (i.e. normal auto-detection precedence rules apply).
2857
2858--max_length <INT>      Discard reads that are longer than <INT> bp after trimming. This is only advised for
2859                        smallRNA sequencing to remove non-small RNA sequences.
2860
2861
2862--stringency <INT>      Overlap with adapter sequence required to trim a sequence. Defaults to a
2863                        very stringent setting of 1, i.e. even a single bp of overlapping sequence
2864                        will be trimmed off from the 3' end of any read.
2865
2866-e <ERROR RATE>         Maximum allowed error rate (no. of errors divided by the length of the matching
2867                        region) (default: 0.1)
2868
2869--gzip                  Compress the output file with GZIP. If the input files are GZIP-compressed
2870                        the output files will automatically be GZIP compressed as well. As of v0.2.8 the
2871                        compression will take place on the fly.
2872
2873--dont_gzip             Output files won't be compressed with GZIP. This option overrides --gzip.
2874
2875--length <INT>          Discard reads that became shorter than length INT because of either
2876                        quality or adapter trimming. A value of '0' effectively disables
2877                        this behaviour. Default: 20 bp.
2878
2879                        For paired-end files, both reads of a read-pair need to be longer than
2880                        <INT> bp to be printed out to validated paired-end files (see option --paired).
2881                        If only one read became too short there is the possibility of keeping such
2882                        unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp.
2883
2884--max_n COUNT           The total number of Ns (as integer) a read may contain before it will be removed altogether.
2885                        In a paired-end setting, either read exceeding this limit will result in the entire
2886                        pair being removed from the trimmed output files.
2887
2888--trim-n                Removes Ns from either side of the read. This option does currently not work in RRBS mode.
2889
2890-o/--output_dir <DIR>   If specified all output will be written to this directory instead of the current
2891                        directory. If the directory doesn't exist it will be created for you.
2892
2893--no_report_file        If specified no report file will be generated.
2894
2895--suppress_warn         If specified any output to STDOUT or STDERR will be suppressed.
2896
2897--clip_R1 <int>         Instructs Trim Galore to remove <int> bp from the 5' end of read 1 (or single-end
2898                        reads). This may be useful if the qualities were very poor, or if there is some
2899                        sort of unwanted bias at the 5' end. Default: OFF.
2900
2901--clip_R2 <int>         Instructs Trim Galore to remove <int> bp from the 5' end of read 2 (paired-end reads
2902                        only). This may be useful if the qualities were very poor, or if there is some sort
2903                        of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove
2904                        the first few bp because the end-repair reaction may introduce a bias towards low
2905                        methylation. Please refer to the M-bias plot section in the Bismark User Guide for
2906                        some examples. Default: OFF.
2907
2908--three_prime_clip_R1 <int>     Instructs Trim Galore to remove <int> bp from the 3' end of read 1 (or single-end
2909                        reads) AFTER adapter/quality trimming has been performed. This may remove some unwanted
2910                        bias from the 3' end that is not directly related to adapter sequence or basecall quality.
2911                        Default: OFF.
2912
2913--three_prime_clip_R2 <int>     Instructs Trim Galore to remove <int> bp from the 3' end of read 2 AFTER
2914                        adapter/quality trimming has been performed. This may remove some unwanted bias from
2915                        the 3' end that is not directly related to adapter sequence or basecall quality.
2916                        Default: OFF.
2917
2918--2colour/--nextseq INT This enables the option '--nextseq-trim=3'CUTOFF' within Cutadapt, which will set a quality
2919                        cutoff (that is normally given with -q instead), but qualities of G bases are ignored.
2920                        This trimming is in common for the NextSeq- and NovaSeq-platforms, where basecalls without
2921                        any signal are called as high-quality G bases. This is mutually exlusive with '-q INT'.
2922
2923
2924--path_to_cutadapt </path/to/cutadapt>     You may use this option to specify a path to the Cutadapt executable,
2925                        e.g. /my/home/cutadapt-1.7.1/bin/cutadapt. Else it is assumed that Cutadapt is in
2926                        the PATH.
2927
2928--basename <PREFERRED_NAME>	Use PREFERRED_NAME as the basename for output files, instead of deriving the filenames from
2929                        the input files. Single-end data would be called PREFERRED_NAME_trimmed.fq(.gz), or
2930                        PREFERRED_NAME_val_1.fq(.gz) and PREFERRED_NAME_val_2.fq(.gz) for paired-end data. --basename
2931                        only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists.
2932
2933-j/--cores INT          Number of cores to be used for trimming [default: 1]. For Cutadapt to work with multiple cores, it
2934                        requires Python 3 as well as parallel gzip (pigz) installed on the system. The version of Python used
2935                        is detected from the shebang line of the Cutadapt executable (either 'cutadapt', or a specified path).
2936                        If Python 2 is detected, --cores is set to 1.
2937                        If pigz cannot be detected on your system, Trim Galore reverts to using gzip compression. Please note
2938                        that gzip compression will slow down multi-core processes so much that it is hardly worthwhile, please
2939                        see: https://github.com/FelixKrueger/TrimGalore/issues/16#issuecomment-458557103 for more info).
2940
2941                        Actual core usage: It should be mentioned that the actual number of cores used is a little convoluted.
2942                        Assuming that Python 3 is used and pigz is installed, --cores 2 would use 2 cores to read the input
2943                        (probably not at a high usage though), 2 cores to write to the output (at moderately high usage), and
2944                        2 cores for Cutadapt itself + 2 additional cores for Cutadapt (not sure what they are used for) + 1 core
2945                        for Trim Galore itself. So this can be up to 9 cores, even though most of them won't be used at 100% for
2946                        most of the time. Paired-end processing uses twice as many cores for the validation (= writing out) step.
2947                        --cores 4 would then be: 4 (read) + 4 (write) + 4 (Cutadapt) + 2 (extra Cutadapt) +	1 (Trim Galore) = 15.
2948
2949                        It seems that --cores 4 could be a sweet spot, anything above has diminishing returns.
2950
2951
2952
2953SPECIFIC TRIMMING - without adapter/quality trimming
2954
2955--hardtrim5 <int>       Instead of performing adapter-/quality trimming, this option will simply hard-trim sequences
2956                        to <int> bp at the 5'-end. Once hard-trimming of files is complete, Trim Galore will exit.
2957                        Hard-trimmed output files will end in .<int>_5prime.fq(.gz). Here is an example:
2958
2959                        before:         CCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAAT
2960                        --hardtrim5 20: CCTAAGGAAACAAGTACACT
2961
2962--hardtrim3 <int>       Instead of performing adapter-/quality trimming, this option will simply hard-trim sequences
2963                        to <int> bp at the 3'-end. Once hard-trimming of files is complete, Trim Galore will exit.
2964                        Hard-trimmed output files will end in .<int>_3prime.fq(.gz). Here is an example:
2965
2966                        before:         CCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAAT
2967                        --hardtrim3 20:                                                   TTTTTAAGAAAATGGAAAAT
2968
2969--clock                 In this mode, reads are trimmed in a specific way that is currently used for the Mouse
2970                        Epigenetic Clock (see here: Multi-tissue DNA methylation age predictor in mouse, Stubbs et al.,
2971                        Genome Biology, 2017 18:68 https://doi.org/10.1186/s13059-017-1203-5). Following this, Trim Galore
2972                        will exit.
2973
2974                        In it's current implementation, the dual-UMI RRBS reads come in the following format:
2975
2976                        Read 1  5' UUUUUUUU CAGTA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF TACTG UUUUUUUU 3'
2977                        Read 2  3' UUUUUUUU GTCAT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF ATGAC UUUUUUUU 5'
2978
2979                        Where UUUUUUUU is a random 8-mer unique molecular identifier (UMI), CAGTA is a constant region,
2980                        and FFFFFFF... is the actual RRBS-Fragment to be sequenced. The UMIs for Read 1 (R1) and
2981                        Read 2 (R2), as well as the fixed sequences (F1 or F2), are written into the read ID and
2982                        removed from the actual sequence. Here is an example:
2983
2984                        R1: \@HWI-D00436:407:CCAETANXX:1:1101:4105:1905 1:N:0: CGATGTTT
2985                            ATCTAGTTCAGTACGGTGTTTTCGAATTAGAAAAATATGTATAGAGGAAATAGATATAAAGGCGTATTCGTTATTG
2986                        R2: \@HWI-D00436:407:CCAETANXX:1:1101:4105:1905 3:N:0: CGATGTTT
2987                            CAATTTTGCAGTACAAAAATAATACCTCCTCTATTTATCCAAAATCACAAAAAACCACCCACTTAACTTTCCCTAA
2988
2989                        R1: \@HWI-D00436:407:CCAETANXX:1:1101:4105:1905 1:N:0: CGATGTTT:R1:ATCTAGTT:R2:CAATTTTG:F1:CAGT:F2:CAGT
2990                                         CGGTGTTTTCGAATTAGAAAAATATGTATAGAGGAAATAGATATAAAGGCGTATTCGTTATTG
2991                        R2: \@HWI-D00436:407:CCAETANXX:1:1101:4105:1905 3:N:0: CGATGTTT:R1:ATCTAGTT:R2:CAATTTTG:F1:CAGT:F2:CAGT
2992                                         CAAAAATAATACCTCCTCTATTTATCCAAAATCACAAAAAACCACCCACTTAACTTTCCCTAA
2993
2994                        Following clock trimming, the resulting files (.clock_UMI.R1.fq(.gz) and .clock_UMI.R2.fq(.gz))
2995                        should be adapter- and quality trimmed with Trim Galore as usual. In addition, reads need to be trimmed
2996                        by 15bp from their 3' end to get rid of potential UMI and fixed sequences. The command is:
2997
2998                        trim_galore --paired --three_prime_clip_R1 15 --three_prime_clip_R2 15 *.clock_UMI.R1.fq.gz *.clock_UMI.R2.fq.gz
2999
3000                        Following this, reads should be aligned with Bismark and deduplicated with UmiBam
3001                        in '--dual_index' mode (see here: https://github.com/FelixKrueger/Umi-Grinder). UmiBam recognises
3002                        the UMIs within this pattern: R1:(ATCTAGTT):R2:(CAATTTTG): as (UMI R1) and (UMI R2).
3003
3004--polyA                 This is a new, still experimental, trimming mode to identify and remove poly-A tails from sequences.
3005                        When --polyA is selected, Trim Galore attempts to identify from the first supplied sample whether
3006                        sequences contain more often a stretch of either 'AAAAAAAAAA' or 'TTTTTTTTTT'. This determines
3007                        if Read 1 of a paired-end end file, or single-end files, are trimmed for PolyA or PolyT. In case of
3008                        paired-end sequencing, Read2 is trimmed for the complementary base from the start of the reads. The
3009                        auto-detection uses a default of A{20} for Read1 (3'-end trimming) and T{150} for Read2 (5'-end trimming).
3010                        These values may be changed manually using the options -a and -a2.
3011
3012                        In addition to trimming the sequences, white spaces are replaced with _ and it records in the read ID
3013                        how many bases were trimmed so it can later be used to identify PolyA trimmed sequences. This is currently done
3014                        by writing tags to both the start ("32:A:") and end ("_PolyA:32") of the reads in the following example:
3015
3016                        \@READ-ID:1:1102:22039:36996 1:N:0:CCTAATCC
3017                        GCCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAATAAAAACTTTATAAACACCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
3018
3019                        \@32:A:READ-ID:1:1102:22039:36996_1:N:0:CCTAATCC_PolyA:32
3020                        GCCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAATAAAAACTTTATAAACACC
3021
3022                        PLEASE NOTE: The poly-A trimming mode expects that sequences were both adapter and quality trimmed
3023                        before looking for Poly-A tails, and it is the user's responsibility to carry out an initial round of
3024                        trimming. The following sequence:
3025
3026                        1) trim_galore file.fastq.gz
3027                        2) trim_galore --polyA file_trimmed.fq.gz
3028                        3) zcat file_trimmed_trimmed.fq.gz | grep -A 3 PolyA | grep -v ^-- > PolyA_trimmed.fastq
3029
3030                        Will 1) trim qualities and Illumina adapter contamination, 2) find and remove PolyA contamination.
3031                        Finally, if desired, 3) will specifically find PolyA trimmed sequences to a specific FastQ file of your choice.
3032
3033
3034RRBS-specific options (MspI digested material):
3035
3036--rrbs                  Specifies that the input file was an MspI digested RRBS sample (recognition
3037                        site: CCGG). Single-end or Read 1 sequences (paired-end) which were adapter-trimmed
3038                        will have a further 2 bp removed from their 3' end. Sequences which were merely
3039                        trimmed because of poor quality will not be shortened further. Read 2 of paired-end
3040                        libraries will in addition have the first 2 bp removed from the 5' end (by setting
3041                        '--clip_r2 2'). This is to avoid using artificial methylation calls from the filled-in
3042                        cytosine positions close to the 3' MspI site in sequenced fragments.
3043                        This option is not recommended for users of the NuGEN ovation RRBS System 1-16
3044                        kit (see below).
3045
3046--non_directional       Selecting this option for non-directional RRBS libraries will screen
3047                        quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read
3048                        and, if found, removes the first two basepairs. Like with the option
3049                        '--rrbs' this avoids using cytosine positions that were filled-in
3050                        during the end-repair step. '--non_directional' requires '--rrbs' to
3051                        be specified as well. Note that this option does not set '--clip_r2 2' in
3052                        paired-end mode.
3053
3054--keep                  Keep the quality trimmed intermediate file. Default: off, which means
3055                        the temporary file is being deleted after adapter trimming. Only has
3056                        an effect for RRBS samples since other FastQ files are not trimmed
3057                        for poor qualities separately.
3058
3059
3060Note for RRBS using the NuGEN Ovation RRBS System 1-16 kit:
3061
3062Owing to the fact that the NuGEN Ovation kit attaches a varying number of nucleotides (0-3) after each MspI
3063site Trim Galore should be run WITHOUT the option --rrbs. This trimming is accomplished in a subsequent
3064diversity trimming step afterwards (see their manual).
3065
3066
3067
3068Note for RRBS using MseI:
3069
3070If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary
3071to specify --rrbs or --non_directional since virtually all reads should start with the sequence
3072'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA'
3073restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply
3074run Trim Galore! in the standard (i.e. non-RRBS) mode.
3075
3076
3077
3078
3079Paired-end specific options:
3080
3081--paired                This option performs length trimming of quality/adapter/RRBS trimmed reads for
3082                        paired-end files. To pass the validation test, both sequences of a sequence pair
3083                        are required to have a certain minimum length which is governed by the option
3084                        --length (see above). If only one read passes this length threshold the
3085                        other read can be rescued (see option --retain_unpaired). Using this option lets
3086                        you discard too short read pairs without disturbing the sequence-by-sequence order
3087                        of FastQ files which is required by many aligners.
3088
3089                        Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g.
3090                        file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... .
3091
3092-t/--trim1              Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that
3093                        are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards
3094                        alignments like this:
3095
3096                          R1 --------------------------->     or this:    ----------------------->  R1
3097                          R2 <---------------------------                       <-----------------  R2
3098
3099                        as invalid (whenever a start/end coordinate is contained within the other read).
3100                        NOTE: If you are planning to use Bowtie2, BWA etc. you don't need to specify this option.
3101
3102--retain_unpaired       If only one of the two paired-end reads became too short, the longer
3103                        read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq'
3104                        output files. The length cutoff for unpaired single-end reads is
3105                        governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF.
3106
3107-r1/--length_1 <INT>    Unpaired single-end read length cutoff needed for read 1 to be written to
3108                        '.unpaired_1.fq' output file. These reads may be mapped in single-end mode.
3109                        Default: 35 bp.
3110
3111-r2/--length_2 <INT>    Unpaired single-end read length cutoff needed for read 2 to be written to
3112                        '.unpaired_2.fq' output file. These reads may be mapped in single-end mode.
3113                        Default: 35 bp.
3114
3115Last modified on 07 November 2019.
3116
3117HELP
3118  exit;
3119}
3120