1#!/usr/local/bin/perl 2use strict; 3use warnings; 4use Getopt::Long; 5use IPC::Open3; 6use File::Spec; 7use File::Basename; 8use Cwd; 9 10## This program is Copyright (C) 2012-19, Felix Krueger (felix.krueger@babraham.ac.uk) 11## Edited by Frankie James (github.com/fjames003) for multi-core support 12 13## This program is free software: you can redistribute it and/or modify 14## it under the terms of the GNU General Public License as published by 15## the Free Software Foundation, either version 3 of the License, or 16## (at your option) any later version. 17 18## This program is distributed in the hope that it will be useful, 19## but WITHOUT ANY WARRANTY; without even the implied warranty of 20## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21## GNU General Public License for more details. 22 23## You should have received a copy of the GNU General Public License 24## along with this program. If not, see <http://www.gnu.org/licenses/>. 25 26 27## This script is taking in FastQ sequences and trims them using Cutadapt 28 29## last modified on 07 11 2019 30 31my $DOWARN = 1; # print on screen warning and text by default 32BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } }; 33 34my $trimmer_version = '0.6.4_dev'; 35my $cutadapt_version; 36my $python_version; 37 38my ($compression_path,$cores,$cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2,$nextera,$small_rna,$path_to_cutadapt,$illumina,$max_length,$maxn,$trim_n,$hardtrim5,$clock,$polyA,$hardtrim3,$nextseq,$basename,$consider_already_trimmed) = process_commandline(); 39my $report_message; # stores result of adapter auto-detection 40 41my @filenames = @ARGV; 42die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n 43USAGE: 'trim_galore [options] <filename(s)>' or 'trim_galore --help' for more options\n\n" unless (@filenames); 44file_sanity_check($filenames[0]); 45 46if (defined $hardtrim5){ 47 warn "Hard-trimming from the 3'-end selected. File(s) will be trimmed to leave the leftmost $hardtrim5 bp on the 5'-end, and Trim Galore will then exit.\n\n"; 48 foreach my $file(@filenames){ 49 hardtrim_to_5prime_end($file); 50 } 51 exit; 52} 53 54if (defined $hardtrim3){ 55 warn "Hard-trimming from 5'-end selected. File(s) will be trimmed to leave the rightmost $hardtrim3 bp on the 3'-end, and Trim Galore will then exit.\n\n"; 56 foreach my $file(@filenames){ 57 hardtrim_to_3prime_end($file); 58 } 59 exit; 60} 61 62if ($clock){ 63 warn "\nIT'S TIME FOR CLOCK PROCESSING!!!\t\t\t\t\t\t\t\t\t[pun intended]\n\n"; 64 65 while (@ARGV){ 66 my $in1 = shift @ARGV; 67 my $in2 = shift @ARGV; 68 clockwork($in1,$in2); 69 } 70 warn "\nPre-processing finished...\n\nPlease run Trim Galore again to remove adapters, poor quality bases as well as UMI/fixed sequences from the 3'-end of the reads.\nA sample command for this is:\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\ntrim_galore --paired --three_prime_clip_R1 15 --three_prime_clip_R2 15 *.clock_UMI.R1.fq.gz *.clock_UMI.R2.fq.gz\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nTrim Galore Epigenetic Clock processing complete.\n\n"; 71 exit; 72} 73 74sub clockwork{ 75 76 ### FILEHANDLES 77 my ($in1_fh,$in2_fh); # input filehandles 78 my ($read1_fh,$read2_fh); # output filehandles 79 80 ### INPUT FILES 81 my ($in1,$in2) = @_; 82 if ($in1 =~ /\.gz$/){ 83 open ($in1_fh,"$compression_path -d -c $in1 |") or die "Failed to read from file $in1: $!"; 84 } 85 else{ 86 open ($in1_fh,$in1) or die "Failed to read from file $in1: $!"; 87 } 88 if ($in2 =~ /\.gz$/){ 89 open ($in2_fh,"$compression_path -d -c $in2 |") or die "Failed to read from file $in2: $!"; 90 } 91 else{ 92 open ($in2_fh,$in2) or die "Failed to read from file $in2: $!"; 93 } 94 # warn " Input file name 1: $in1\n"; 95 # warn " Input file name 2: $in2\n"; 96 97 ### OUTPUT FILES 98 my $out1 = (split (/\//,$in1))[-1]; 99 my $out2 = (split (/\//,$in2))[-1]; 100 101 $out1 =~ s/(\.fastq$|\.fastq\.gz$)//; 102 $out1 =~ s/(\.fq$|\.fq\.gz$)//; 103 $out1 .= '.clock_UMI.R1.fq'; # appending to the end 104 105 $out2 =~ s/(\.fastq$|\.fastq\.gz$)//; 106 $out2 =~ s/(\.fq$|\.fq\.gz$)//; 107 $out2 .= '.clock_UMI.R2.fq'; # appending to the end 108 109 110 ### READ 1 111 if ($gzip or $in1 =~ /\.gz$/){ 112 if ($dont_gzip){ 113 open ($read1_fh,'>',$output_dir.$out1) or die "Can't open '$out1': $!\n"; 114 } 115 else{ 116 $out1 .= '.gz'; 117 open ($read1_fh,"| $compression_path -c - > ${output_dir}${out1}") or die "Can't write to '$out1': $!\n"; 118 } 119 } 120 else{ 121 open ($read1_fh,'>',$output_dir.$out1) or die "Can't open '$out1': $!\n"; 122 } 123 warn "Writing dual trimmed version of the input file '$in1' to '$out1'\n"; 124 125 ### READ 2 126 if ($gzip or $in2 =~ /\.gz$/){ 127 if ($dont_gzip){ 128 open ($read2_fh,'>',$output_dir.$out2) or die "Can't open '$out2': $!\n"; 129 } 130 else{ 131 $out2 .= '.gz'; 132 open ($read2_fh,"| $compression_path -c - > ${output_dir}${out2}") or die "Can't write to '$out2': $!\n"; 133 } 134 } 135 else{ 136 open ($read2_fh,'>',$output_dir.$out2) or die "Can't open '$out2': $!\n"; 137 } 138 warn "Writing dual trimmed version of the input file '$in2' to '$out2'\n ---\n"; 139 # warn "Output file name 1: $out1\n"; 140 # warn "Output file name 2: $out2\n"; 141 142 143 my %freqs; 144 my $umi_1; 145 my $umi_2; 146 147 # open ($out2,"| $compression_path -c - > $out1") or die $!; 148 # open (OUT2,"| $compression_path -c - > $out2") or die $!; 149 150 # print "Processing files $in1 and $in2\n"; 151 my %r1; # storing the barcodes for R1 152 my %r2; # storing the barcodes for R2 153 154 my %fix1; # storing the fixed sequence (CAGT + A from A-tailing) of R1 155 my %fix2; # storing the fixed sequence (CAGT + A from A-tailing) of R2 156 157 my $count = 0; 158 my $filtered_count = 0; 159 my $r1_contains_rc = 0; 160 161 ORANGE: while (1){ 162 163 my $one1 = <$in1_fh>; 164 my $one2 = <$in1_fh>; 165 my $one3 = <$in1_fh>; 166 my $one4 = <$in1_fh>; 167 168 my $two1 = <$in2_fh>; 169 my $two2 = <$in2_fh>; 170 my $two3 = <$in2_fh>; 171 my $two4 = <$in2_fh>; 172 173 last unless ($one4 and $two4); 174 chomp $one2; # sequence 175 chomp $two2; # sequence 176 177 chomp $one1; # read ID, need to append UMIs to the read ID 178 chomp $two1; # read ID, need to append UMIs to the read ID 179 180 chomp $one4; # quality 181 chomp $two4; # quality 182 183 ++$count; # sequence count 184 185 if ($count % 1000000 ==0){ 186 warn "Processed $count sequences so far...\n"; 187 } 188 my $r1_barcode; 189 my $r2_barcode; 190 my $r1_fix; 191 my $r2_fix; 192 193 194 $r1_barcode = substr($one2,0,8); 195 $r2_barcode = substr($two2,0,8); 196 $r1_fix = substr($one2,8,4); 197 $r2_fix = substr($two2,8,4); 198 # warn "$one2\n$two2\n$r1_barcode\t$r1_fix\n$r2_barcode\t$r2_fix\n\n";sleep(1); 199 200 # this part of code simply counts the different types of sequence that occurred at the constant region where we expected 201 # to read CAGT in both reads. Both R1 and R2 are taken into account at the same time 202 unless ($r1_fix eq 'CAGT' and $r2_fix eq 'CAGT'){ 203 $freqs{$r1_fix}++; 204 $freqs{$r2_fix}++; 205 $filtered_count++; 206 } 207 208 ### BOTH READ1 AND READ2 should look like this: 209 210 # 0 8 12 13 INDEX position 211 # UUUUUUUU CAGT A FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF SEQUENCE 212 213 # where: 214 # U = UMI base 215 # CAGT = fixed sequence 216 # A = A-tail 217 # FFFF = RRBS-fragment to be aligned 218 219 ### Capturing the sequence after the A from A-tailing (FFFFFFFFFFFFFF...) 220 221 my $seq1 = substr($one2,13); # truncated sequence without UMIs or fixed sequence 222 my $seq2 = substr($two2,13); 223 224 my $qual1 = substr($one4,13); # truncated quality string without barcode or fixed sequence 225 my $qual2 = substr($two4,13); 226 227 # warn "$one1\n"; 228 $one1 .= ":R1:${r1_barcode}:R2:${r2_barcode}:F1:${r1_fix}:F2:${r2_fix}"; 229 #warn "$one1\n"; 230 231 #warn "$two1\n"; 232 $two1 .= ":R1:${r1_barcode}:R2:${r2_barcode}:F1:${r1_fix}:F2:${r2_fix}"; 233 #warn "$two1\n"; 234 # warn "$one2\n $seq1\n$one4\n $qual1\n~~\n$two2\n $seq2\n$two4\n $qual2\n\n";sleep(1); 235 236 print ${read1_fh} "$one1\n"; 237 print ${read1_fh} "$seq1\n"; 238 print ${read1_fh} "+\n"; # replacing this with a + for space and format reasons 239 print ${read1_fh} "$qual1\n"; 240 241 print ${read2_fh} "$two1\n"; 242 print ${read2_fh} "$seq2\n"; 243 print ${read2_fh} "+\n"; # replacing this with a + for space and format reasons 244 print ${read2_fh} "$qual2\n"; 245 246 # sleep(1); 247 248 $r1{$r1_barcode}++; 249 $r2{$r2_barcode}++; 250 251 $fix1{$r1_fix}++; 252 $fix2{$r2_fix}++; 253 } 254 255 my $perc; 256 if ($count){ 257 $perc = sprintf("%.2f",$filtered_count/$count * 100); 258 } 259 else{ 260 $perc = 'N/A'; 261 } 262 263 warn "Sequences processed in total: $count\nthereof had fixed sequence CAGT in both R1 and R2:\t $filtered_count ($perc%)\n ~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"; 264} 265 266sub rc{ 267 my $string = shift; 268 $string = reverse($string); 269 $string =~ tr/GATC/CTAG/; 270 return $string; 271} 272 273 274######################################################################## 275 276 277my $path_to_fastqc = 'fastqc'; 278 279 280######################################################################## 281 282 283 284 285### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED 286unless (defined $cutoff){ 287 unless ($nextseq){ 288 $cutoff = 20; 289 } 290} 291 292my $phred_score_cutoff = $cutoff; # only relevant for report 293my $adapter_name = ''; 294unless (defined $adapter){ 295 if ($nextera){ 296 $adapter = 'CTGTCTCTTATA'; 297 $adapter_name = 'Nextera Transposase sequence; user defined'; 298 } 299 elsif($small_rna){ 300 $adapter = 'TGGAATTCTCGG'; 301 $adapter_name = 'Illumina small RNA adapter; user defined'; 302 } 303 elsif($illumina){ 304 $adapter = 'AGATCGGAAGAGC'; 305 $adapter_name = 'Illumina TruSeq, Sanger iPCR; user defined'; 306 } 307 else{ # default 308 ### If other -a and/or -a2 were given 309 if ($polyA){ # specialised PolyA trimming 310 ($adapter,$adapter_name) = autodetect_polyA_type(); 311 312 if ($validate){ # we need to select -a2 as the reverse complement to -a 313 if ($adapter =~ /A+/){ 314 $adapter = extend_adapter_sequence("A",20); # defaulting to 20 315 $a2 = extend_adapter_sequence("T",150); # defaulting to 150 bp 316 } 317 elsif($adapter =~ /T+/){ 318 $adapter = extend_adapter_sequence("T",20); # defaulting to 20 319 $a2 = extend_adapter_sequence("A",150); 320 } 321 else{ 322 die "Something unexpected happened with the Poly-A autodetection, please check\n"; 323 } 324 # warn "Paired-end Poly-A detection, set the following parameters:\n"; 325 # warn " -a: $adapter\n-a2: $a2\n ~~~~~~~~~~ \n\n";sleep(1); 326 } 327 else{ 328 # Single end 329 if ($adapter =~ /A+/){ 330 $adapter = extend_adapter_sequence("A",20); # defaulting to 20 331 } 332 elsif($adapter =~ /T+/){ 333 $adapter = extend_adapter_sequence("T",20); # defaulting to 20 334 } 335 else{ 336 die "Something unexpected happened with the Poly-A autodetection, please check\n"; 337 } 338 } 339 } 340 else{ # ADAPTER TRIMMING; DEFAULT 341 ($adapter,$adapter_name,$report_message) = autodetect_adapter_type(); 342 } 343 } 344} 345else{ 346 $adapter_name = 'user defined'; 347} 348 349### For smallRNA adapters we are reducing the sequence length cutoff before a sequences gets thrown out entirely to 18bp. We are doing this because some 20-23bp long smallRNAs 350### may be removed if T, TG, or TGG etc gets trimmed off the end (changed b ack up from 16 to 18bp to remove noise from alignment files, 18 11 2015) 351if ($adapter eq 'TGGAATTCTCGG'){ 352 unless (defined $length_cutoff){ # user defined length cutoff wins over auto-detection 353 $length_cutoff = 18; 354 warn "Reducing length cutoff to 18bp for small RNA-Seq reads because a cutoff of 20bp may remove some short species of small RNAs if they had been trimmed by 1,2 or 3bp\n"; 355 } 356 357 ### If the file is a smallRNA library and paired-end we set the Illumina 5' adapter as the $a2 sequence 358 if ($validate){ 359 unless (defined $a2){ 360 $a2 = 'GATCGTCGGACT'; 361 warn "Setting the Illumina smallRNA 5' adapter as adapter 2: 'GATCGTCGGACT'\n"; 362 } 363 } 364} 365 366unless (defined $length_cutoff){ 367 $length_cutoff = 20; # non small RNA length cutoff 368} 369 370 371unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming 372 $a2 = ''; 373} 374 375unless (defined $stringency){ 376 $stringency = 1; 377} 378 379if ($phred_encoding == 64){ 380 $cutoff += 31; 381} 382 383 384my $file_1; 385my $file_2; 386 387foreach my $filename (@ARGV){ 388 trim ($filename); 389} 390 391 392sub trim{ 393 my $filename = shift; 394 395 my $output_filename = (split (/\//,$filename))[-1]; 396 397 my $report = $output_filename; 398 $report =~ s/$/_trimming_report.txt/; 399 400 401 if ($no_report_file) { 402 $report = File::Spec->devnull; 403 open (REPORT,'>',$report) or die "Failed to write to file '$report': $!\n"; 404 # warn "Redirecting report output to /dev/null\n"; 405 } 406 else{ 407 open (REPORT,'>',$output_dir.$report) or die "Failed to write to file '$report': $!\n"; 408 warn "Writing report to '$output_dir$report'\n"; 409 } 410 411 warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; 412 print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; 413 414 if ($validate){ # paired-end mode 415 warn "Trimming mode: paired-end\n"; 416 print REPORT "Trimming mode: paired-end\n"; 417 } 418 else{ 419 warn "Trimming mode: single-end\n"; 420 print REPORT "Trimming mode: single-end\n"; 421 } 422 423 warn "Trim Galore version: $trimmer_version\n"; 424 print REPORT "Trim Galore version: $trimmer_version\n"; 425 426 warn "Cutadapt version: $cutadapt_version\n"; 427 print REPORT "Cutadapt version: $cutadapt_version\n"; 428 429 if (defined $python_version){ 430 warn "Python version: $python_version\n"; 431 print REPORT "Python version: $python_version\n"; 432 } 433 434 if (defined $cores){ 435 my $temp = $cores; 436 $temp =~ s/-j //; 437 warn "Number of cores used for trimming: $temp\n"; 438 print REPORT "Number of cores used for trimming: $temp\n"; 439 } 440 441 if (defined $phred_score_cutoff){ 442 warn "Quality Phred score cutoff: $phred_score_cutoff\n"; 443 print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n"; 444 } 445 446 447 warn "Quality encoding type selected: ASCII+$phred_encoding\n"; 448 print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n"; 449 450 if ($report_message){ 451 # warn "adding Auto-detection statement\n"; 452 print REPORT $report_message; 453 } 454 455 warn "Adapter sequence: '$adapter' ($adapter_name)\n"; 456 print REPORT "Adapter sequence: '$adapter' ($adapter_name)\n"; 457 458 if ($error_rate == 0.1){ 459 warn "Maximum trimming error rate: $error_rate (default)\n"; 460 } 461 else{ 462 warn "Maximum trimming error rate: $error_rate\n"; 463 } 464 465 if (defined $maxn){ 466 warn "Maximum number of tolerated Ns: $maxn\n"; 467 } 468 469 if ($nextseq){ 470 warn "2-colour high quality G-trimming enabled, with quality cutoff: $nextseq\n"; 471 print REPORT "2-colour high quality G-trimming enabled, with quality cutoff: $nextseq\n"; 472 } 473 474 print REPORT "Maximum trimming error rate: $error_rate"; 475 if ($error_rate == 0.1){ 476 print REPORT " (default)\n"; 477 } 478 else{ 479 print REPORT "\n"; 480 } 481 482 if ($a2){ 483 warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n"; 484 print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n"; 485 } 486 487 warn "Minimum required adapter overlap (stringency): $stringency bp\n"; 488 print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n"; 489 490 491 if (defined $consider_already_trimmed){ 492 warn "During adapter auto-detection, files are considered already adapter-trimmed if the highest found adapter was found equal to or lower than: $consider_already_trimmed\n"; 493 print REPORT "During adapter auto-detection, files are considered already adapter-trimmed if the highest found adapter was found equal to or lower than: $consider_already_trimmed\n"; 494 } 495 496 if ($validate){ 497 warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n"; 498 print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n"; 499 } 500 else{ 501 warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n"; 502 print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n"; 503 } 504 505 if ($max_length){ 506 warn "Maxiumum tolerated read length after trimming (for smallRNA trimming): $max_length bp\n"; 507 print REPORT "Maxiumum tolerated read length after trimming (for smallRNA trimming): $max_length bp\n"; 508 } 509 510 if ($trim_n){ 511 warn "Removing Ns from the start and end of reads\n"; 512 print REPORT "Removing Ns from the start and end of reads\n"; 513 } 514 515 516 if ($validate){ # only for paired-end files 517 518 if ($retain){ # keeping single-end reads if only one end is long enough 519 520 if ($length_read_1 == 35){ 521 warn "Length cut-off for read 1: $length_read_1 bp (default)\n"; 522 print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n"; 523 } 524 else{ 525 warn "Length cut-off for read 1: $length_read_1 bp\n"; 526 print REPORT "Length cut-off for read 1: $length_read_1 bp\n"; 527 } 528 529 if ($length_read_2 == 35){ 530 warn "Length cut-off for read 2: $length_read_2 bb (default)\n"; 531 print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n"; 532 } 533 else{ 534 warn "Length cut-off for read 2: $length_read_2 bp\n"; 535 print REPORT "Length cut-off for read 2: $length_read_2 bp\n"; 536 } 537 } 538 } 539 540 if ($rrbs){ 541 warn "File was specified to be an MspI-digested RRBS sample. Read 1 sequences with adapter contamination will be trimmed a further 2 bp from their 3' end, and Read 2 sequences will be trimmed by 2 bp from their 5' end to remove potential methylation-biased bases from the end-repair reaction\n"; 542 print REPORT "File was specified to be an MspI-digested RRBS sample. Read 1 sequences with adapter contamination will be trimmed a further 2 bp from their 3' end, and Read 2 sequences will be trimmed by 2 bp from their 5' end to remove potential methylation-biased bases from the end-repair reaction\n"; 543 } 544 545 if ($non_directional){ 546 warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n"; 547 print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n"; 548 } 549 550 if ($trim){ 551 warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n"; 552 print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n"; 553 } 554 555 if ($clip_r1){ 556 warn "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n"; 557 print REPORT "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n"; 558 } 559 if ($clip_r2){ 560 warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; 561 print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; 562 } 563 564 if ($three_prime_clip_r1){ 565 warn "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n"; 566 print REPORT "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n"; 567 } 568 if ($three_prime_clip_r2){ 569 warn "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n"; 570 print REPORT "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n"; 571 } 572 573 if ($fastqc){ 574 warn "Running FastQC on the data once trimming has completed\n"; 575 print REPORT "Running FastQC on the data once trimming has completed\n"; 576 577 if ($fastqc_args){ 578 warn "Running FastQC with the following extra arguments: '$fastqc_args'\n"; 579 print REPORT "Running FastQC with the following extra arguments: $fastqc_args\n"; 580 } 581 } 582 583 if ($keep and $rrbs){ 584 warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n"; 585 print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n"; 586 } 587 588 589 if ($gzip or $filename =~ /\.gz$/){ 590 $gzip = 1; 591 unless ($dont_gzip){ 592 warn "Output file(s) will be GZIP compressed\n"; 593 print REPORT "Output file will be GZIP compressed\n"; 594 } 595 } 596 597 598 warn "\n"; 599 print REPORT "\n"; 600 # sleep (3); 601 602 my $temp; 603 604 ### We need to make sure that Cutadapt still runs if users use a fairly old version of Cutadapt, as multi-core handling is only supported sincce 605 ### Cutadapt version 1.15. Edited on 08 03 2019 606 # if the Cutadapt version has more . in the version, we discard the second one. 607 # warn "Version was: $cutadapt_version\n"; 608 if ($cutadapt_version =~ /(\d+\.\d+)\.\d+/){ 609 $cutadapt_version = $1; 610 } 611 # warn "Version now is: $cutadapt_version\n"; 612 my ($major_version,$sub_version) = ($1,$2) if ($cutadapt_version =~ /^(\d+)\.(\d+)$/); 613 # warn "Major: $major_version\nSub version: $sub_version\n"; 614 # sleep(3); 615 if ($major_version == 1){ # versions prior to 1.15 did not support the -j option 616 if ($sub_version < 15){ 617 if ($cores > 1){ 618 die "I'm sorry but your version of Cutadapt is too old to support multi-core trimming. Please update Cutadapt, and try again\n\n"; 619 } 620 else{ #default single-core prcessing with old version of Cutadapt 621 warn "Your version of Cutadapt is fairly old (detected v$cutadapt_version), please consider updating Cutadapt!\n"; 622 $cores = ""; # need to delete -j entirely as Cutadapt 623 } 624 } 625 else{ 626 warn "Cutadapt seems to be reasonably up-to-date. Setting -j $cores\n"; 627 unless ($cores =~ /^-j \d+$/){ # if it was set before, don't change it again 628 $cores = "-j $cores"; 629 } 630 } 631 } 632 elsif ($major_version > 1){ 633 warn "Cutadapt seems to be fairly up-to-date (version $cutadapt_version). Setting -j $cores\n"; 634 unless ($cores =~ /^-j \d+$/){ # if it was set before, don't change it again 635 $cores = "-j $cores"; 636 } 637 } 638 else{ 639 die "Cutadapt major version was not 1 or higher. Simply too old...\n"; 640 } 641 642 ### Proceeding differently for RRBS and other type of libraries 643 if ($rrbs){ 644 645 ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified 646 if ($cutoff == 0){ 647 warn "Quality cutoff selected was 0 - Skipping quality trimming altogether\n\n"; 648 # sleep (3); 649 } 650 else{ 651 652 $temp = $filename; 653 $temp =~ s/^.*\///; # replacing optional file path information 654 $temp =~ s/$/_qual_trimmed.fastq/; 655 open (TEMP,'>',$output_dir.$temp) or die "Can't write to '$temp': $!"; 656 657 warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n"; 658 # sleep (1); 659 660 open (QUAL,"$path_to_cutadapt $cores -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!"; 661 662 my $qual_count = 0; 663 664 while (1){ 665 my $l1 = <QUAL>; 666 my $seq = <QUAL>; 667 my $l3 = <QUAL>; 668 my $qual = <QUAL>; 669 last unless (defined $qual); 670 671 $qual_count++; 672 if ($qual_count%10000000 == 0){ 673 warn "$qual_count sequences processed\n"; 674 } 675 print TEMP "$l1$seq$l3$qual"; 676 } 677 678 warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n"; 679 close QUAL or die "Unable to close QUAL filehandle: $!\n"; 680 681 } 682 } 683 684 685 if ($output_filename =~ /\.fastq$/){ 686 $output_filename =~ s/\.fastq$/_trimmed.fq/; 687 } 688 elsif ($output_filename =~ /\.fastq\.gz$/){ 689 $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/; 690 } 691 elsif ($output_filename =~ /\.fq$/){ 692 $output_filename =~ s/\.fq$/_trimmed.fq/; 693 } 694 elsif ($output_filename =~ /\.fq\.gz$/){ 695 $output_filename =~ s/\.fq\.gz$/_trimmed.fq/; 696 } 697 else{ 698 $output_filename =~ s/$/_trimmed.fq/; 699 } 700 701 if ($gzip or $filename =~ /\.gz$/){ 702 if ($dont_gzip){ 703 open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file 704 } 705 else{ 706 ### 6 Jan 2014: had a request to also gzip intermediate files to save disk space 707 # if ($validate){ 708 # open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file 709 # } 710 $output_filename .= '.gz'; 711 open (OUT,"| $compression_path -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n"; 712 } 713 } 714 else{ 715 open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; 716 } 717 warn "Writing final adapter and quality trimmed output to $output_filename\n\n"; 718 719 my $count = 0; 720 my $too_short = 0; 721 my $too_long = 0; 722 my $too_many_n = 0; 723 my $quality_trimmed = 0; 724 my $rrbs_trimmed = 0; 725 my $rrbs_trimmed_start = 0; 726 my $CAA = 0; 727 my $CGA = 0; 728 729 my $pid; 730 731 if ($rrbs and $cutoff != 0){ 732 733 ### optionally using 2 different adapters for read 1 and read 2 734 if ($validate and $a2){ 735 ### Figure out whether current file counts as read 1 or read 2 of paired-end files 736 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair 737 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; 738 #sleep (1); 739 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt $cores -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; 740 } 741 else{ # this is read 2 of a pair 742 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n"; 743 #sleep (1); 744 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt $cores -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; 745 } 746 } 747 ### Using the same adapter for both read 1 and read 2 748 else{ 749 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; 750 # sleep (3); 751 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt $cores -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; 752 } 753 754 close WRITER or die $!; # not needed 755 756 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file 757 758 if ($filename =~ /\.gz$/){ 759 open (IN,"gunzip -c $filename |") or die $!; # original, untrimmed file 760 } 761 else{ 762 open (IN,$filename) or die $!; # original, untrimmed file 763 } 764 765 while (1){ 766 767 # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not 768 my $l1 = <TRIM>; 769 my $seq = <TRIM>; # adapter trimmed sequence 770 my $l3 = <TRIM>; 771 my $qual = <TRIM>; 772 773 $_ = <IN>; # irrelevant 774 my $original_seq = <IN>; 775 $_ = <IN>; # irrelevant 776 $_ = <IN>; # irrelevant 777 778 $_ = <QUAL>; # irrelevant 779 my $qual_trimmed_seq = <QUAL>; 780 $_ = <QUAL>; # irrelevant 781 my $qual_trimmed_qual = <QUAL>; 782 783 last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings 784 785 $count++; 786 if ($count%10000000 == 0){ 787 warn "$count sequences processed\n"; 788 } 789 790 chomp $seq; 791 chomp $qual; 792 chomp $qual_trimmed_seq; 793 chomp $original_seq; 794 795 my $quality_trimmed_seq_length = length $qual_trimmed_seq; 796 797 if (length $original_seq > length $qual_trimmed_seq){ 798 ++$quality_trimmed; 799 } 800 801 my $nd = 0; 802 803 ### NON-DIRECTIONAL RRBS 804 if ($non_directional){ 805 $nd = 1; 806 if (length$seq > 2){ # only doing something if the read is longer than 2bp 807 808 # only trimming Read 1 of a pair for a further 2bp from their 3' end 809 if ($seq =~ /^CAA/){ # this might be a non-directional sequence 810 ++$CAA; 811 $seq = substr ($seq,2,length($seq)-2); 812 $qual = substr ($qual,2,length($qual)-2); 813 ++$rrbs_trimmed_start; 814 } 815 elsif ($seq =~ /^CGA/){ # this might be a non-directional sequence 816 $seq = substr ($seq,2,length($seq)-2); 817 $qual = substr ($qual,2,length($qual)-2); 818 ++$CGA; 819 ++$rrbs_trimmed_start; 820 } 821 else{ 822 # If the reads look like standard OT/OB sequences (CGG/TGG) 823 # we need to trim in the same way as for directional sequences 824 if (length$seq < $quality_trimmed_seq_length){ 825 $seq = substr ($seq,0,length($seq)-2); 826 $qual = substr ($qual,0,length($qual)-2); 827 ++$rrbs_trimmed; 828 } 829 } 830 } 831 else{ 832 # read is too short anyway (so it will probably not survive the length filtering step 833 } 834 } 835 836 ### directional read 837 unless ($nd == 1){ 838 # only trimming Read 1 of a pair for a further 2bp from their 3' end 839 if ($validate){ # paired end 840 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair 841 if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){ 842 $seq = substr ($seq,0,length($seq)-2); 843 $qual = substr ($qual,0,length($qual)-2); 844 ++$rrbs_trimmed; 845 } 846 } 847 else{ 848 # this is read 2 of a pair. We do not trim further from the 3' end but rather trim R2 from the 5' end later 849 } 850 } 851 else{ # single-end reads will be trimmed from their 3' end 852 if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){ 853 $seq = substr ($seq,0,length($seq)-2); 854 $qual = substr ($qual,0,length($qual)-2); 855 ++$rrbs_trimmed; 856 } 857 } 858 } 859 860 ### Shortening all sequences by 1 bp on the 3' end 861 # 28 02 2019: This was really only required for Bowtie 1 paired-end alignments, maybe we should drop this option in soon 862 if ($trim){ 863 $seq = substr($seq,0,length($seq)-1); 864 $qual = substr($qual,0,length($qual)-1); 865 } 866 867 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE 868 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) 869 print OUT "$l1$seq\n$l3$qual\n"; 870 } 871 else{ # single end 872 873 if ($clip_r1){ 874 if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again 875 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence 876 $qual = substr($qual,$clip_r1); 877 } 878 } 879 880 if ($three_prime_clip_r1){ 881 if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again 882 # warn "seq/qual before/after trimming:\n$seq\n$qual\n"; 883 $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence 884 $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1 )); 885 # warn "$seq\n$qual\n"; 886 } 887 } 888 889 if (defined $maxn){ 890 my $n_count = Ncounter($seq); 891 # warn "Checking for Ns: Found $n_count\n"; 892 if ($n_count > $maxn){ 893 ++$too_many_n; 894 next; 895 } 896 } 897 898 if (length $seq < $length_cutoff){ 899 ++$too_short; 900 next; 901 } 902 elsif($max_length and length$seq > $max_length){ 903 ++$too_long; 904 next; # sequence is too long 905 } 906 else{ 907 print OUT "$l1$seq\n$l3$qual\n"; 908 } 909 } 910 } 911 912 print REPORT "\n"; 913 while (<ERROR>){ 914 warn $_; 915 print REPORT $_; 916 } 917 918 close IN or die "Unable to close IN filehandle: $!"; 919 close QUAL or die "Unable to close QUAL filehandle: $!"; 920 close TRIM or die "Unable to close TRIM filehandle: $!"; 921 close OUT or die "Unable to close OUT filehandle: $!"; 922 923 } 924 ############################################################################################################ 925 926 elsif($polyA){ # PolyA trimming 927 928 warn "POLY-A TRIMMING MODE; EXPERIMENTAL!!\n"; 929 my $isR2 = 0; 930 931 # For the moment we set the temp file name back to $filename 932 $temp = $filename; 933 934 ### optionally using 2 different adapters for read 1 and read 2 935 if ($validate and $a2){ 936 ### Figure out whether current file counts as read 1 or read 2 of paired-end files 937 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair 938 warn "\n >>> Now performing Poly-A trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; 939 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt $cores -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; 940 } 941 else{ # this is read 2 of a pair 942 $isR2 = 1; 943 warn "\n >>> Now performing Poly-A trimming for the adapter sequence: '$a2' from file $temp <<< \n"; 944 # For Read 2 we need to trim the PolyT (or PolyA) from the 5' end instead! Hence -g $a2 and not -a! 945 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt $cores -e $error_rate -O $stringency -g $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; 946 } 947 } 948 ### Using the same adapter for both read 1 and read 2 - Single end will use the same adapters are Read 1s for paired-end files 949 else{ 950 warn "\n >>> Now performing single-end Poly-A trimming for with the sequence: '$adapter' from file $temp <<< \n"; 951 # sleep (3); 952 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt $cores -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; 953 } 954 955 close WRITER or die $!; # not needed 956 957 # This is the Illumina adapter trimmed file 958 if ($temp =~ /\.gz$/){ 959 open (QUAL,"gunzip -c $output_dir$temp |") or die $!; # quality trimmed file 960 } 961 else{ 962 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file 963 } 964 965 while(1){ 966 # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not 967 my $l1 = <TRIM>; 968 my $seq = <TRIM>; # adapter trimmed sequence 969 my $l3 = <TRIM>; 970 my $qual = <TRIM>; 971 972 # $_ = <IN>; # irrelevant 973 # my $original_seq = <IN>; 974 # $_ = <IN>; # irrelevant 975 # $_ = <IN>; # irrelevant 976 977 $_ = <QUAL>; # irrelevant 978 my $qual_trimmed_seq = <QUAL>; 979 $_ = <QUAL>; # irrelevant 980 my $qual_trimmed_qual = <QUAL>; 981 982 last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings 983 984 $count++; 985 if ($count%10000000 == 0){ 986 warn "$count sequences processed\n"; 987 } 988 chomp $l1; 989 chomp $seq; 990 chomp $qual; 991 chomp $qual_trimmed_seq; 992 # chomp $original_seq; 993 994 my $quality_trimmed_seq_length = length $qual_trimmed_seq; 995 996 # if (length $original_seq > length $qual_trimmed_seq){ 997 # ++$quality_trimmed; 998 # } 999 1000 my $diff = length($qual_trimmed_seq) - length($seq); 1001 1002 ### CHANGING THE readID to remove white spaces and adding PolyA trimming information 1003 $l1 =~ s/\s+/_/g; # removing white spaces from readID 1004 if ($diff > 0){ # only adding the PolyA removal tag if some A's were really removed, so we can filter these out later if desired 1005 # warn "$l1\n"; 1006 $l1 =~ s/\@/\@${diff}:A:/; # also adding the PolyA trimmed reads to the start of the read in the format 1007 # "trimmed_bases:A:" as this is the current Way Thermo Fisher are handling it 1008 $l1 .= "_PolyA:$diff"; 1009 # warn "$l1\n~~~~~~~~~~~~~\n"; sleep(1); 1010 } 1011 1012 ### Shortening all sequences by 1 bp on the 3' end - This is probably no longer needed since we have stopped using Bowtie (1) 1013 if ($trim){ 1014 $seq = substr($seq,0,length($seq)-1); 1015 $qual = substr($qual,0,length($qual)-1); 1016 } 1017 1018 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE 1019 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) 1020 if ($isR2){ 1021 #print "$l1\n$qual_trimmed_seq\n$seq\n$l3$qual_trimmed_qual\n$qual\n~~~~~\n"; 1022 # print "length original: ",length($qual_trimmed_seq) , "\nlength trimmed: ", length($seq) , "\nDifference: $diff bp\n ~~~\n\n"; sleep(1); 1023 } 1024 print OUT "$l1\n$seq\n$l3$qual\n"; 1025 } 1026 else{ # single end 1027 1028 if ($clip_r1){ 1029 if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again 1030 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence 1031 $qual = substr($qual,$clip_r1); 1032 } 1033 } 1034 1035 if ($three_prime_clip_r1){ 1036 if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again 1037 # warn "seq/qual before/after trimming:\n$seq\n$qual\n"; 1038 $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence 1039 $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1 )); 1040 # warn "$seq\n$qual\n"; 1041 } 1042 } 1043 1044 if (length $seq < $length_cutoff){ 1045 ++$too_short; 1046 next; 1047 } 1048 elsif($max_length and length$seq > $max_length){ 1049 ++$too_long; 1050 next; # sequence is too long 1051 } 1052 else{ 1053 print OUT "$l1\n$seq\n$l3$qual\n"; 1054 } 1055 } 1056 } 1057 1058 print REPORT "\n"; 1059 while (<ERROR>){ 1060 warn $_; 1061 print REPORT $_; 1062 } 1063 1064 close QUAL or die "Unable to close QUAL filehandle: $!"; 1065 close TRIM or die "Unable to close TRIM filehandle: $!"; 1066 close OUT or die "Unable to close OUT filehandle: $!"; 1067 } 1068 1069 ############################################################################################################ 1070 else{ # non-RRBS mode. default 1071 1072 my $quality_cutoff; 1073 1074 if ($nextseq){ 1075 $quality_cutoff = $nextseq; 1076 } 1077 else{ 1078 $quality_cutoff = "-q $cutoff"; 1079 } 1080 1081 ### optionally using 2 different adapters for read 1 and read 2 1082 if ($validate and $a2){ 1083 1084 ### Figure out whether current file counts as read 1 or read 2 of paired-end files 1085 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair 1086 warn "\n >>> Now performing quality (cutoff '$quality_cutoff') and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; 1087 #sleep (1); 1088 $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt $cores -e $error_rate $quality_cutoff -O $stringency $trim_n -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; 1089 } 1090 else{ # this is read 2 of a pair 1091 warn "\n >>> Now performing quality (cutoff '$quality_cutoff') and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n"; 1092 #sleep (1); 1093 $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt $cores -e $error_rate $quality_cutoff -O $stringency $trim_n -a $a2 $filename") or die "Failed to launch Cutadapt: $!"; 1094 } 1095 } 1096 ### Using the same adapter for both read 1 and read 2 1097 else{ 1098 warn "\n >>> Now performing quality (cutoff '$quality_cutoff') and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; 1099 #sleep (1); 1100 $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt $cores -e $error_rate $quality_cutoff -O $stringency $trim_n -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; 1101 } 1102 1103 close WRITER or die $!; # not needed 1104 1105 while (1){ 1106 1107 my $l1 = <TRIM>; 1108 my $seq = <TRIM>; # quality and/or adapter trimmed sequence 1109 my $l3 = <TRIM>; 1110 my $qual = <TRIM>; 1111 # print "$l1$seq\n$l3$qual\n"; 1112 last unless (defined $qual); # could be an empty string 1113 1114 $count++; 1115 if ($count%10000000 == 0){ 1116 warn "$count sequences processed\n"; 1117 } 1118 1119 chomp $seq; 1120 chomp $qual; 1121 1122 ### Shortening all sequences by 1 bp on the 3' end 1123 if ($trim){ 1124 $seq = substr($seq,0,length($seq)-1); 1125 $qual = substr($qual,0,length($qual)-1); 1126 } 1127 1128 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE 1129 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) 1130 print OUT "$l1$seq\n$l3$qual\n"; 1131 } 1132 else{ # single end 1133 1134 if ($clip_r1){ 1135 if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again 1136 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence 1137 $qual = substr($qual,$clip_r1); 1138 } 1139 } 1140 1141 if ($three_prime_clip_r1){ 1142 if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again 1143 # warn "seq/qual before/after trimming:\n$seq\n$qual\n"; 1144 $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence 1145 $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1)); 1146 # warn "$seq\n$qual\n";sleep(1); 1147 } 1148 } 1149 1150 if (defined $maxn){ 1151 my $n_count = Ncounter($seq); 1152 # warn "Checking for Ns: Found $n_count\n"; 1153 if ($n_count > $maxn){ 1154 ++$too_many_n; 1155 next; 1156 } 1157 } 1158 1159 if (length $seq < $length_cutoff){ 1160 ++$too_short; 1161 next; 1162 } 1163 elsif ($max_length and length$seq > $max_length){ 1164 ++$too_long; 1165 next; # sequence is too long 1166 } 1167 else{ 1168 print OUT "$l1$seq\n$l3$qual\n"; 1169 } 1170 } 1171 } 1172 1173 print REPORT "\n"; 1174 while (<ERROR>){ 1175 warn $_; 1176 print REPORT $_; 1177 } 1178 1179 close TRIM or die "Unable to close TRIM filehandle: $!\n"; 1180 close ERROR or die "Unable to close ERROR filehandle: $!\n"; 1181 close OUT or die "Unable to close OUT filehandle: $!\n"; 1182 1183 } 1184 1185 1186 if ($rrbs){ 1187 unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files 1188 1189 # deleting temporary quality trimmed file 1190 my $deleted = unlink "$output_dir$temp"; 1191 1192 if ($deleted){ 1193 warn "Successfully deleted temporary file $temp\n\n"; 1194 } 1195 else{ 1196 warn "Could not delete temporary file $temp"; 1197 } 1198 } 1199 } 1200 1201 ### Wait and reap the child process (Cutadapt) so that it doesn't become a zombie process 1202 waitpid $pid, 0; 1203 unless ($? == 0){ 1204 die "\n\nCutadapt terminated with exit signal: '$?'.\nTerminating Trim Galore run, please check error message(s) to get an idea what went wrong...\n\n"; 1205 } 1206 1207 warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; 1208 print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; 1209 1210 warn "="x 45,"\n"; 1211 print REPORT "="x 45,"\n"; 1212 1213 warn "$count sequences processed in total\n"; 1214 print REPORT "$count sequences processed in total\n"; 1215 1216 ### only reporting this separately if quality and adapter trimming were performed separately 1217 if ($rrbs){ 1218 my $percentage_shortened; 1219 if ($count){ 1220 $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100); 1221 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; 1222 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; 1223 } 1224 else{ 1225 warn "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n"; 1226 print REPORT "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n"; 1227 } 1228 } 1229 1230 my $percentage_too_short; 1231 my $percentage_too_long; 1232 my $percentage_too_many_n; 1233 if ($count){ 1234 $percentage_too_short = sprintf ("%.1f",$too_short/$count*100); 1235 $percentage_too_long = sprintf ("%.1f",$too_long/$count*100); 1236 $percentage_too_many_n = sprintf ("%.1f",$too_many_n/$count*100); 1237 } 1238 else{ 1239 $percentage_too_short = 'N/A'; 1240 $percentage_too_long = 'N/A'; 1241 $percentage_too_many_n = 'N/A'; 1242 } 1243 1244 1245 1246 if ($validate){ ### only for paired-end files 1247 warn "The length threshold of paired-end sequences gets evaluated later on (in the validation step)\n"; 1248 } 1249 else{ ### Single-end file 1250 warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; 1251 print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; 1252 if (defined $maxn){ 1253 warn "Sequences removed because they contained more Ns than the cutoff of $maxn:\t$too_many_n ($percentage_too_many_n%)\n"; 1254 print REPORT "Sequences removed because they contained more Ns than the cutoff of $maxn:\t$too_many_n ($percentage_too_many_n%)\n"; 1255 } 1256 if ($max_length){ 1257 warn "Sequences removed because after trimming they were longer than the maximum length cutoff of $max_length bp:\t$too_long ($percentage_too_long%)\n"; 1258 print REPORT "Sequences removed because after trimming they were longer than the maximum length cutoff of $max_length bp:\t$too_long ($percentage_too_long%)\n"; 1259 } 1260 } 1261 1262 if ($rrbs){ 1263 my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100); 1264 warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; 1265 print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; 1266 } 1267 1268 if ($non_directional){ 1269 my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100); 1270 warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n"; 1271 print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n"; 1272 } 1273 1274 warn "\n"; 1275 print REPORT "\n"; 1276 1277 ### RUNNING FASTQC unless we are dealing with paired-end files 1278 unless($validate){ 1279 1280 # File renaming requested in Issue #17 (https://github.com/FelixKrueger/TrimGalore/issues/17) 1281 ### FILE RENAMING 1282 if ($basename){ 1283 warn "Now renaming the output file $output_filename\n\n"; 1284 my $tempname = "${basename}$1" if ($output_filename =~ /(_trimmed.*)$/); 1285 warn "ORIGINAL FILE 1: >>$output_filename<<\tRENAMING TO:>>$tempname<<\n"; 1286 1287 rename "${output_dir}$output_filename", "${output_dir}$tempname"; 1288 $output_filename = $tempname; 1289 } 1290 1291 if ($fastqc){ 1292 warn "\n >>> Now running FastQC on the data <<<\n\n"; 1293 # sleep (1); 1294 if ($fastqc_args){ 1295 system ("$path_to_fastqc $fastqc_args $output_dir$output_filename"); 1296 } 1297 else{ 1298 system ("$path_to_fastqc $output_dir$output_filename"); 1299 } 1300 } 1301 } 1302 1303 ### VALIDATE PAIRED-END FILES 1304 if ($validate){ 1305 ### Figure out whether current file counts as read 1 or read 2 of paired-end files 1306 1307 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair 1308 $file_1 = $output_filename; 1309 shift @filenames; 1310 # warn "This is read 1: $file_1\n\n"; 1311 } 1312 else{ # this is read 2 of a pair 1313 $file_2 = $output_filename; 1314 shift @filenames; 1315 # warn "This is read 2: $file_2\n\n"; 1316 } 1317 1318 if ($file_1 and $file_2){ 1319 warn "Validate paired-end files $file_1 and $file_2\n"; 1320 1321 # File renaming requested in Issue #17 (https://github.com/FelixKrueger/TrimGalore/issues/17) 1322 ### FILE RENAMING 1323 if ($basename){ 1324 warn "Now renaming the output files\n\n"; 1325 my $tempname_1 = "${basename}_R1$1" if ($file_1 =~ /(_trimmed.*)$/); 1326 my $tempname_2 = "${basename}_R2$1" if ($file_2 =~ /(_trimmed.*)$/); 1327 warn "ORIGINAL FILE 1: >>$file_1<<\tRENAMING TO:>>$tempname_1<<\n"; 1328 warn "ORIGINAL FILE 2: >>$file_2<<\tRENAMING TO:>>$tempname_2<<\n"; 1329 1330 rename "${output_dir}${file_1}","${output_dir}$tempname_1"; 1331 rename "${output_dir}${file_2}","${output_dir}$tempname_2"; 1332 $file_1 = $tempname_1; 1333 $file_2 = $tempname_2; 1334 1335 } 1336 # sleep (1); 1337 1338 my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2); 1339 1340 ### RUNNING FASTQC 1341 if ($fastqc){ 1342 1343 warn "\n >>> Now running FastQC on the validated data $val_1<<<\n\n"; 1344 1345 if ($fastqc_args){ 1346 system ("$path_to_fastqc $fastqc_args $output_dir$val_1"); 1347 } 1348 else{ 1349 system ("$path_to_fastqc $output_dir$val_1"); 1350 } 1351 1352 warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n"; 1353 #sleep (3); 1354 1355 if ($fastqc_args){ 1356 system ("$path_to_fastqc $fastqc_args $output_dir$val_2"); 1357 } 1358 else{ 1359 system ("$path_to_fastqc $output_dir$val_2"); 1360 } 1361 1362 } 1363 1364 warn "Deleting both intermediate output files $file_1 and $file_2\n"; 1365 unlink "$output_dir$file_1"; 1366 unlink "$output_dir$file_2"; 1367 1368 warn "\n",'='x100,"\n\n"; 1369 # sleep (1); 1370 1371 $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed 1372 $file_2 = undef; 1373 } 1374 } 1375 close REPORT or warn "Failed to close filehandle REPORT: $!\n"; 1376} 1377 1378sub hardtrim_to_5prime_end{ 1379 1380 my $filename = shift; 1381 warn "Input file name: $filename\n"; 1382 1383 my $in; # filehandle 1384 my $hardtrim_5; # filehandle 1385 1386 if ($filename =~ /gz$/){ 1387 open ($in,"$compression_path -d -c $filename |") or die "Couldn't read from file $!"; 1388 } 1389 else{ 1390 open ($in,$filename) or die "Couldn't read from file $!"; 1391 } 1392 1393 ### OUTPUT FILE 1394 my $output_filename = (split (/\//,$filename))[-1]; 1395 # warn "Output file name: $output_filename\n"; 1396 1397 if ($output_filename =~ /\.fastq$/){ 1398 $output_filename =~ s/\.fastq$/.${hardtrim5}bp_5prime.fq/; 1399 } 1400 elsif ($output_filename =~ /\.fastq\.gz$/){ 1401 $output_filename =~ s/\.fastq\.gz$/.${hardtrim5}bp_5prime.fq/; 1402 } 1403 elsif ($output_filename =~ /\.fq$/){ 1404 $output_filename =~ s/\.fq$/.${hardtrim5}bp_5prime.fq/; 1405 } 1406 elsif ($output_filename =~ /\.fq\.gz$/){ 1407 $output_filename =~ s/\.fq\.gz$/.${hardtrim5}bp_5prime.fq/; 1408 } 1409 else{ 1410 $output_filename =~ s/$/.${hardtrim5}bp_5prime.fq/; 1411 } 1412 1413 if ($gzip or $filename =~ /\.gz$/){ 1414 if ($dont_gzip){ 1415 open ($hardtrim_5,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file 1416 } 1417 else{ 1418 $output_filename .= '.gz'; 1419 open ($hardtrim_5,"| $compression_path -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n"; 1420 } 1421 } 1422 else{ 1423 open ($hardtrim_5,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; 1424 } 1425 warn "Writing trimmed version (using the first $hardtrim5 bp only) of the input file '$filename' to '$output_filename'\n"; 1426 1427 my $count = 0; 1428 1429 while (1){ 1430 my $identifier = <$in>; 1431 my $sequence = <$in>; 1432 my $identifier2 = <$in>; 1433 my $quality_score = <$in>; 1434 1435 last unless ($identifier and $sequence and $identifier2 and $quality_score); 1436 ++$count; 1437 1438 $sequence =~ s/\r|\n//g; # cross-platform line ending trimming 1439 $quality_score =~ s/\r|\n//g; 1440 $identifier =~ s/\r|\n//g; 1441 $identifier2 =~ s/\r|\n//g; 1442 1443 my $trimmed_sequence = substr($sequence,0,$hardtrim5); # from the start to the $hardtrim5 1444 my $trimmed_quality_score = substr($quality_score,0,$hardtrim5); # from the start to the $hardtrim5 1445 1446 print {$hardtrim_5} join ("\n",$identifier,$trimmed_sequence,$identifier2,$trimmed_quality_score),"\n"; 1447 1448 } 1449 close $in or warn "Failed to close filehandle for $filename"; 1450 close $hardtrim_5 or die "Failed to close out-filehandle hardtrim_5: $!"; 1451 1452 warn "\nFinished writing out converted version of the FastQ file $filename ($count sequences in total)\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"; 1453 1454} 1455 1456sub hardtrim_to_3prime_end{ 1457 1458 my $filename = shift; 1459 warn "Input file name: $filename\n"; 1460 1461 my $in; # filehandle 1462 my $hardtrim_3; # filehandle 1463 1464 if ($filename =~ /gz$/){ 1465 open ($in,"$compression_path -d -c $filename |") or die "Couldn't read from file $!"; 1466 } 1467 else{ 1468 open ($in,$filename) or die "Couldn't read from file $!"; 1469 } 1470 1471 ### OUTPUT FILE 1472 my $output_filename = (split (/\//,$filename))[-1]; 1473 # warn "Output file name: $output_filename\n"; 1474 1475 if ($output_filename =~ /\.fastq$/){ 1476 $output_filename =~ s/\.fastq$/.${hardtrim3}bp_3prime.fq/; 1477 } 1478 elsif ($output_filename =~ /\.fastq\.gz$/){ 1479 $output_filename =~ s/\.fastq\.gz$/.${hardtrim3}bp_3prime.fq/; 1480 } 1481 elsif ($output_filename =~ /\.fq$/){ 1482 $output_filename =~ s/\.fq$/.${hardtrim3}bp_3prime.fq/; 1483 } 1484 elsif ($output_filename =~ /\.fq\.gz$/){ 1485 $output_filename =~ s/\.fq\.gz$/.${hardtrim3}bp_3prime.fq/; 1486 } 1487 else{ 1488 $output_filename =~ s/$/.${hardtrim3}bp_3prime.fq/; 1489 } 1490 1491 if ($gzip or $filename =~ /\.gz$/){ 1492 if ($dont_gzip){ 1493 open ($hardtrim_3,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file 1494 } 1495 else{ 1496 $output_filename .= '.gz'; 1497 open ($hardtrim_3,"| $compression_path -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n"; 1498 } 1499 } 1500 else{ 1501 open ($hardtrim_3,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; 1502 } 1503 warn "Writing trimmed version (using the last $hardtrim3 bp only) of the input file '$filename' to '$output_filename'\n"; 1504 1505 my $count = 0; 1506 1507 while (1){ 1508 my $identifier = <$in>; 1509 my $sequence = <$in>; 1510 my $identifier2 = <$in>; 1511 my $quality_score = <$in>; 1512 1513 last unless ($identifier and $sequence and $identifier2 and $quality_score); 1514 ++$count; 1515 1516 $sequence =~ s/\r|\n//g; # cross-platform line ending trimming 1517 $quality_score =~ s/\r|\n//g; 1518 $identifier =~ s/\r|\n//g; 1519 $identifier2 =~ s/\r|\n//g; 1520 1521 my $trimmed_sequence = substr($sequence,-$hardtrim3,$hardtrim3); # $hardtrim3 bp from the end 1522 my $trimmed_quality_score = substr($quality_score,-$hardtrim3,$hardtrim3); # $hardtrim3 bp from the end 1523 1524 print {$hardtrim_3} join ("\n",$identifier,$trimmed_sequence,$identifier2,$trimmed_quality_score),"\n"; 1525 1526 } 1527 close $in or warn "Failed to close filehandle for $filename"; 1528 close $hardtrim_3 or die "Failed to close out-filehandle hardtrim_3: $!"; 1529 1530 warn "\nFinished writing out converted version of the FastQ file $filename ($count sequences in total)\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"; 1531 1532} 1533 1534 1535sub validate_paired_end_files{ 1536 1537 my $file_1 = shift; 1538 my $file_2 = shift; 1539 1540 warn "file_1: $file_1, file_2: $file_2\n\n"; 1541 1542 1543 1544 if ($file_1 =~ /\.gz$/){ 1545 open (IN1,"$compression_path -d -c $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n"; 1546 } 1547 else{ 1548 open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n"; 1549 } 1550 1551 if ($file_2 =~ /\.gz$/){ 1552 open (IN2,"$compression_path -d -c $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n"; 1553 } 1554 else{ 1555 open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n"; 1556 } 1557 1558 warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n"; 1559 1560 my $out_1 = $file_1; 1561 my $out_2 = $file_2; 1562 1563 if ($out_1 =~ /gz$/){ 1564 $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/; 1565 } 1566 else{ 1567 $out_1 =~ s/trimmed\.fq$/val_1.fq/; 1568 } 1569 1570 if ($out_2 =~ /gz$/){ 1571 $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/; 1572 } 1573 else{ 1574 $out_2 =~ s/trimmed\.fq$/val_2.fq/; 1575 } 1576 1577 if ($basename){ 1578 ### FILE RENAMING 1579 warn "Renaming the output files (AGAIN).\n"; 1580 # warn "ORIGINAL FILE 1: >>$out_1<<\n"; 1581 $out_1 =~ s/_R1_val_1.fq$/_val_1.fq/; 1582 # warn "ORIGINAL FILE 1: >>$out_1<<\n"; 1583 1584 # warn "ORIGINAL FILE 2: >>$out_2<<\n"; 1585 $out_2 =~ s/_R2_val_2.fq$/_val_2.fq/; 1586 # warn "ORIGINAL FILE 2: >>$out_2<<\n"; 1587 } 1588 1589 if ($gzip){ 1590 if ($dont_gzip){ 1591 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; 1592 } 1593 else{ 1594 $out_1 .= '.gz'; 1595 open (R1,"| $compression_path -c - > ${output_dir}${out_1}") or die "Can't write to $out_1: $!\n"; 1596 } 1597 } 1598 else{ 1599 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; 1600 } 1601 1602 if ($gzip){ 1603 if ($dont_gzip){ 1604 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; 1605 } 1606 else{ 1607 $out_2 .= '.gz'; 1608 open (R2,"| $compression_path -c - > ${output_dir}${out_2}") or die "Can't write to $out_2: $!\n"; 1609 } 1610 } 1611 else{ 1612 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; 1613 } 1614 1615 warn "Writing validated paired-end Read 1 reads to $out_1\n"; 1616 warn "Writing validated paired-end Read 2 reads to $out_2\n\n"; 1617 1618 my $unpaired_1; 1619 my $unpaired_2; 1620 1621 if ($retain){ 1622 1623 $unpaired_1 = $file_1; 1624 $unpaired_2 = $file_2; 1625 1626 if ($unpaired_1 =~ /gz$/){ 1627 $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/; 1628 } 1629 else{ 1630 $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/; 1631 } 1632 1633 if ($unpaired_2 =~ /gz$/){ 1634 $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/; 1635 } 1636 else{ 1637 $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/; 1638 } 1639 1640 if ($gzip){ 1641 if ($dont_gzip){ 1642 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; 1643 } 1644 else{ 1645 $unpaired_1 .= '.gz'; 1646 open (UNPAIRED1,"| $compression_path -c - > ${output_dir}${unpaired_1}") or die "Can't write to $unpaired_1: $!\n"; 1647 } 1648 } 1649 else{ 1650 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; 1651 } 1652 1653 if ($gzip){ 1654 if ($dont_gzip){ 1655 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; 1656 } 1657 else{ 1658 $unpaired_2 .= '.gz'; 1659 open (UNPAIRED2,"| $compression_path -c - > ${output_dir}${unpaired_2}") or die "Can't write to $unpaired_2: $!\n"; 1660 } 1661 } 1662 else{ 1663 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; 1664 } 1665 1666 warn "Writing unpaired read 1 reads to $unpaired_1\n"; 1667 warn "Writing unpaired read 2 reads to $unpaired_2\n\n"; 1668 } 1669 1670 my $sequence_pairs_removed = 0; 1671 my $too_many_N_pairs = 0; 1672 my $read_1_printed = 0; 1673 my $read_2_printed = 0; 1674 1675 my $count = 0; 1676 1677 while (1){ 1678 my $id_1 = <IN1>; 1679 my $seq_1 = <IN1>; 1680 my $l3_1 = <IN1>; 1681 my $qual_1 = <IN1>; 1682 1683 my $id_2 = <IN2>; 1684 my $seq_2 = <IN2>; 1685 my $l3_2 = <IN2>; 1686 my $qual_2 = <IN2>; 1687 1688 if ($id_1 and $seq_1 and $l3_1 and $qual_1 and $id_2 and $seq_2 and $l3_2 and $qual_2){ 1689 # all good, got a read from both files 1690 } 1691 elsif( !($id_1 and $seq_1 and $l3_1 and $qual_1) and ($id_2 and $seq_2 and $l3_2 and $qual_2)){ 1692 die "Read 1 output is truncated at sequence count: $count, please check your paired-end input files! Terminating...\n\n"; 1693 } 1694 elsif( !($id_2 and $seq_2 and $l3_2 and $qual_2) and ($id_1 and $seq_1 and $l3_1 and $qual_1)){ 1695 die "Read 2 output is truncated at sequence count: $count, please check your paired-end input files! Terminating...\n\n"; 1696 } 1697 else{ 1698 last unless ($id_1 and $seq_1 and $l3_1 and $qual_1); 1699 last unless ($id_2 and $seq_2 and $l3_2 and $qual_2); 1700 } 1701 ++$count; 1702 1703 ## small check if the sequence files appear to be FastQ files 1704 if ($count == 1){ # performed just once 1705 if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){ 1706 die "Input file doesn't seem to be in FastQ format at sequence $count\n"; 1707 } 1708 if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){ 1709 die "Input file doesn't seem to be in FastQ format at sequence $count\n"; 1710 } 1711 } 1712 1713 chomp $seq_1; 1714 chomp $seq_2; 1715 chomp $qual_1; 1716 chomp $qual_2; 1717 1718 if ($clip_r1){ 1719 if (length $seq_1 > $clip_r1){ # sequences that are already too short won't be trimmed again 1720 $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence 1721 $qual_1 = substr($qual_1,$clip_r1); 1722 } 1723 } 1724 if ($clip_r2){ 1725 if (length $seq_2 > $clip_r2){ # sequences that are already too short won't be trimmed again 1726 $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence 1727 $qual_2 = substr($qual_2,$clip_r2); 1728 } 1729 } 1730 1731 if ($three_prime_clip_r1){ 1732 if (length $seq_1 > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again 1733 $seq_1 = substr($seq_1,0,(length($seq_1) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence 1734 $qual_1 = substr($qual_1,0,(length($qual_1) - $three_prime_clip_r1)); 1735 } 1736 } 1737 if ($three_prime_clip_r2){ 1738 if (length $seq_2 > $three_prime_clip_r2){ # sequences that are already too short won't be clipped again 1739 $seq_2 = substr($seq_2,0,(length($seq_2) - $three_prime_clip_r2)); # starting after the sequences to be trimmed until the end of the sequence 1740 $qual_2 = substr($qual_2,0,(length($qual_2) - $three_prime_clip_r2)); 1741 } 1742 } 1743 1744 if (defined $maxn){ 1745 1746 # Read 1 1747 my $n_count = Ncounter($seq_1); 1748 # warn "Checking Read 1 for Ns: Found $n_count\n"; 1749 if ($n_count > $maxn){ 1750 ++$too_many_N_pairs; 1751 ++$sequence_pairs_removed; 1752 next; # bailing straight away 1753 } 1754 1755 # Read 2 1756 $n_count = Ncounter($seq_2); 1757 # warn "Checking Read 2 for Ns: Found $n_count\n"; 1758 if ($n_count > $maxn){ 1759 ++$too_many_N_pairs; 1760 ++$sequence_pairs_removed; 1761 next; 1762 } 1763 } 1764 1765 ### making sure that the reads do have a sensible length 1766 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){ 1767 ++$sequence_pairs_removed; 1768 if ($retain){ # writing out single-end reads if they are longer than the cutoff 1769 1770 if ( length($seq_1) >= $length_read_1){ # read 1 is long enough 1771 print UNPAIRED1 $id_1; 1772 print UNPAIRED1 "$seq_1\n"; 1773 print UNPAIRED1 $l3_1; 1774 print UNPAIRED1 "$qual_1\n"; 1775 ++$read_1_printed; 1776 } 1777 1778 if ( length($seq_2) >= $length_read_2){ # read 2 is long enough 1779 print UNPAIRED2 $id_2; 1780 print UNPAIRED2 "$seq_2\n"; 1781 print UNPAIRED2 $l3_2; 1782 print UNPAIRED2 "$qual_2\n"; 1783 ++$read_2_printed; 1784 } 1785 1786 } 1787 } 1788 else{ 1789 print R1 $id_1; 1790 print R1 "$seq_1\n"; 1791 print R1 $l3_1; 1792 print R1 "$qual_1\n"; 1793 1794 print R2 $id_2; 1795 print R2 "$seq_2\n"; 1796 print R2 $l3_2; 1797 print R2 "$qual_2\n"; 1798 } 1799 1800 } 1801 1802 1803 my $percentage; 1804 my $percentage_Ns; 1805 1806 if ($count){ 1807 $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100); 1808 $percentage_Ns = sprintf("%.2f",$too_many_N_pairs/$count*100); 1809 } 1810 else{ 1811 $percentage = 'N/A'; 1812 $percentage_Ns = 'N/A'; 1813 } 1814 1815 warn "Total number of sequences analysed: $count\n\n"; 1816 warn "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n"; 1817 if (defined $maxn){ 1818 warn "Number of sequence pairs removed because at least one read contained more N(s) than the specified limit of $maxn: $too_many_N_pairs ($percentage_Ns%)\n"; 1819 print REPORT "Number of sequence pairs removed because at least one read contained more N(s) than the specified limit of $maxn: $too_many_N_pairs ($percentage_Ns%)\n"; 1820 } 1821 1822 print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n"; 1823 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n"; 1824 1825 if ($keep){ 1826 warn "Number of unpaired read 1 reads printed: $read_1_printed\n"; 1827 warn "Number of unpaired read 2 reads printed: $read_2_printed\n"; 1828 } 1829 1830 close R1 or die $!; 1831 close R2 or die $!; 1832 1833 if ($retain){ 1834 close UNPAIRED1 or die $!; 1835 close UNPAIRED2 or die $!; 1836 } 1837 1838 warn "\n"; 1839 if ($retain){ 1840 return ($out_1,$out_2,$unpaired_1,$unpaired_2); 1841 } 1842 else{ 1843 return ($out_1,$out_2); 1844 } 1845} 1846 1847 1848sub file_sanity_check{ 1849 1850 my $file = shift; 1851 if ($file =~ /gz$/){ 1852 open (SANITY,"$compression_path -d -c $file |") or die "Failed to read from file '$file' to perform sanity check: $!\n"; 1853 } 1854 else{ 1855 open (SANITY,$file) or die "Failed to read from file '$file' to perform sanity check: $!\n"; 1856 } 1857 1858 # just processing a single FastQ entry 1859 my $id = <SANITY>; 1860 my $seq = <SANITY>; 1861 my $three = <SANITY>; 1862 my $qual = <SANITY>; 1863 1864 unless ($id and $seq and $three and $qual){ 1865 die "Input file '$file' seems to be completely empty. Consider respecifying!\n\n"; 1866 } 1867 1868 chomp $seq; 1869 1870 # testing if the file is a colorspace file in which case we bail 1871 if ($seq =~ /\d+/){ 1872 die "File seems to be in SOLiD colorspace format which is not supported by Trim Galore (sequence is: '$seq')! Please use Cutadapt on colorspace files separately and check its documentation!\n\n"; 1873 } 1874 1875 close SANITY; 1876 1877} 1878 1879 1880 1881### ADAPTER AUTO-DETECTION 1882 1883sub autodetect_adapter_type{ 1884 warn "\n\nAUTO-DETECTING ADAPTER TYPE\n===========================\n"; 1885 warn "Attempting to auto-detect adapter type from the first 1 million sequences of the first file (>> $ARGV[0] <<)\n\n"; 1886 1887 if ($ARGV[0] =~ /gz$/){ 1888 open (AUTODETECT,"$compression_path -d -c $ARGV[0] |") or die "Failed to read from file $ARGV[0]\n"; 1889 } 1890 else{ 1891 open (AUTODETECT,$ARGV[0]) or die "Failed to read from file $ARGV[0]\n"; 1892 } 1893 1894 my %adapters; 1895 1896 $adapters{'Illumina'} -> {seq} = 'AGATCGGAAGAGC'; 1897 $adapters{'Illumina'} -> {count}= 0; 1898 $adapters{'Illumina'} -> {name}= 'Illumina TruSeq, Sanger iPCR; auto-detected'; 1899 1900 $adapters{'Nextera'} -> {seq} = 'CTGTCTCTTATA'; 1901 $adapters{'Nextera'} -> {count}= 0; 1902 $adapters{'Nextera'} -> {name}= 'Nextera Transposase sequence; auto-detected'; 1903 1904 $adapters{'smallRNA'} -> {seq} = 'TGGAATTCTCGG'; 1905 $adapters{'smallRNA'} -> {count}= 0; 1906 $adapters{'smallRNA'} -> {name}= 'Illumina small RNA adapter; auto-detected'; 1907 1908 1909 # we will read the first 1 million sequences, or until the end of the file whatever comes first, and then use the adapter that for trimming which was found to occcur most often 1910 my $count = 0; 1911 while (1){ 1912 1913 my $line1 = <AUTODETECT>; 1914 my $line2 = <AUTODETECT>; 1915 my $line3 = <AUTODETECT>; 1916 my $line4 = <AUTODETECT>; 1917 last unless ($line4); 1918 $count++; 1919 last if ($count == 1000000); 1920 1921 chomp $line2; 1922 $adapters{'Illumina'}->{count}++ unless (index($line2,'AGATCGGAAGAGC')== -1); 1923 $adapters{'Nextera'} ->{count}++ unless (index($line2,'CTGTCTCTTATA') == -1); 1924 $adapters{'smallRNA'}->{count}++ unless (index($line2,'TGGAATTCTCGG') == -1); 1925 1926 } 1927 1928 my $highest; 1929 my $third; 1930 my $second; 1931 my $seq; 1932 my $adapter_name; 1933 my $report_message; 1934 1935 warn "Found perfect matches for the following adapter sequences:\nAdapter type\tCount\tSequence\tSequences analysed\tPercentage\n"; 1936 foreach my $adapter (sort {$adapters{$b}->{count}<=>$adapters{$a}->{count}} keys %adapters){ 1937 1938 my $percentage = sprintf("%.2f",$adapters{$adapter}->{count}/$count*100); 1939 1940 warn "$adapter\t$adapters{$adapter}->{count}\t$adapters{$adapter}->{seq}\t$count\t$percentage\n"; 1941 1942 unless (defined $highest){ 1943 $highest = $adapter; 1944 $seq = $adapters{$adapter}->{seq}; 1945 $adapter_name = $adapters{$adapter}->{name}; 1946 next; 1947 } 1948 unless (defined $second){ 1949 $second = $adapter; 1950 next; 1951 } 1952 unless (defined $third){ 1953 $third = $adapter; 1954 next; 1955 } 1956 } 1957 1958 1959 if ($adapters{$highest}->{count} == $adapters{$second}->{count} and $adapters{$highest}->{count} == $adapters{$third}->{count}){ 1960 warn "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count}, count $third: $adapters{$third}->{count})\n"; 1961 $report_message .= "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count}, count $third: $adapters{$third}->{count})\n"; 1962 1963 if (defined $consider_already_trimmed){ 1964 if ($adapters{$highest}->{count} <= $consider_already_trimmed ){ 1965 warn "No auto-detected adapter sequence exceeded the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n"; 1966 $report_message .= "No auto-detected adapter sequence exceeded the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n"; 1967 $adapter_name = 'No adapter trimming [suppressed by user]'; 1968 $seq = 'X'; 1969 } 1970 else{ 1971 warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n"; 1972 $report_message .= "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n"; 1973 $adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)'; 1974 $seq = 'AGATCGGAAGAGC'; 1975 } 1976 } 1977 else{ 1978 warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n"; 1979 $report_message .= "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n"; 1980 $adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)'; 1981 $seq = 'AGATCGGAAGAGC'; 1982 } 1983 } 1984 elsif ($adapters{$highest}->{count} == $adapters{$second}->{count} ){ 1985 warn "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count}, count $third: $adapters{$third}->{count})\n"; 1986 $report_message .= "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count}, count $third: $adapters{$third}->{count})\n"; 1987 1988 if (defined $consider_already_trimmed){ 1989 if ($adapters{$highest}->{count} <= $consider_already_trimmed ){ 1990 warn "The highest auto-detected adapter sequence did not exceed the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n"; 1991 $report_message .= "The highest auto-detected adapter sequence did not exceed the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n"; 1992 $adapter_name = 'No adapter trimming [suppressed by user]'; 1993 $seq = 'X'; 1994 } 1995 else{ 1996 # If one of the highest contaminants was the Illumina adapter, we set that one and print a warning 1997 if ( ($highest eq 'Illumina') or ($second eq 'Illumina')) { 1998 warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n"; 1999 $report_message .= "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior.\n"; 2000 $adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)'; 2001 $seq = 'AGATCGGAAGAGC'; 2002 } 2003 else{ 2004 warn "Defaulting to Nextera adapter as next best option ( CTGTCTCTTATA ). Specify -a SEQUENCE to avoid this behavior).\n"; 2005 $report_message .= "Defaulting to Nextera adapter as next best option ( CTGTCTCTTATA ). Specify -a SEQUENCE to avoid this behavior.\n"; 2006 $adapter_name = 'Nextera; (assigned because of inconclusive auto-detection)'; 2007 $seq = 'CTGTCTCTTATA'; 2008 } 2009 } 2010 } 2011 else{ 2012 # If one of the highest contaminants was the Illumina adapter, we set that one and print a warning 2013 if ( ($highest eq 'Illumina') or ($second eq 'Illumina')) { 2014 warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n"; 2015 $report_message .= "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior.\n"; 2016 $adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)'; 2017 $seq = 'AGATCGGAAGAGC'; 2018 } 2019 else{ 2020 warn "Defaulting to Nextera adapter as next best option ( CTGTCTCTTATA ). Specify -a SEQUENCE to avoid this behavior).\n"; 2021 $report_message .= "Defaulting to Nextera adapter as next best option ( CTGTCTCTTATA ). Specify -a SEQUENCE to avoid this behavior.\n"; 2022 $adapter_name = 'Nextera; (assigned because of inconclusive auto-detection)'; 2023 $seq = 'CTGTCTCTTATA'; 2024 } 2025 } 2026 } 2027 else{ 2028 if (defined $consider_already_trimmed){ 2029 if ($adapters{$highest}->{count} <= $consider_already_trimmed ){ 2030 warn "The highest auto-detected adapter sequence did not exceed the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n"; 2031 $report_message .= "The highest auto-detected adapter sequence did not exceed the user-specified 'already adapter-trimmed' limit ($consider_already_trimmed). Setting adapter sequence to -a X\n"; 2032 $adapter_name = 'No adapter trimming [suppressed by user]'; 2033 $seq = 'X'; 2034 } 2035 else{ 2036 # using the highest occurrence as adapter to look out for 2037 $report_message .= "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n"; 2038 warn "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n"; 2039 } 2040 } 2041 else{ 2042 # using the highest occurrence as adapter to look out for 2043 $report_message .= "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n"; 2044 warn "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n"; 2045 } 2046 } 2047 2048 close AUTODETECT; 2049 2050 return ($seq,$adapter_name,$report_message); 2051 2052} 2053 2054sub autodetect_polyA_type{ 2055 2056 warn "\n\nAUTO-DETECTING POLY-A TYPE\n===========================\n"; 2057 warn "Attempting to auto-detect PolyA type from the first 1 million sequences of the first file (>> $ARGV[0] <<)\n\n"; 2058 2059 if ($ARGV[0] =~ /gz$/){ 2060 open (AUTODETECT,"$compression_path -d -c $ARGV[0] |") or die "Failed to read from file $ARGV[0]\n"; 2061 } 2062 else{ 2063 open (AUTODETECT,$ARGV[0]) or die "Failed to read from file $ARGV[0]\n"; 2064 } 2065 2066 my %adapters; 2067 2068 $adapters{'PolyA'} -> {seq} = 'AAAAAAAAAA'; 2069 $adapters{'PolyA'} -> {count}= 0; 2070 $adapters{'PolyA'} -> {name}= 'Poly-A Read 1; auto-detected'; 2071 2072 $adapters{'PolyT'} -> {seq} = 'TTTTTTTTTT'; 2073 $adapters{'PolyT'} -> {count}= 0; 2074 $adapters{'PolyT'} -> {name}= 'Poly-T Read 1; auto-detected'; 2075 2076 2077 # we will read the first 1 million sequences, or until the end of the file whatever comes first, and then use the adapter that for trimming which was found to occcur most often 2078 my $count = 0; 2079 while (1){ 2080 2081 my $line1 = <AUTODETECT>; 2082 my $line2 = <AUTODETECT>; 2083 my $line3 = <AUTODETECT>; 2084 my $line4 = <AUTODETECT>; 2085 last unless ($line4); 2086 $count++; 2087 last if ($count == 1000000); 2088 2089 chomp $line2; 2090 $adapters{'PolyA'}->{count}++ unless (index($line2,'AAAAAAAAAA')== -1); 2091 $adapters{'PolyT'} ->{count}++ unless (index($line2,'TTTTTTTTTT') == -1); 2092 2093 } 2094 2095 my $highest; 2096 my $second; 2097 my $seq; 2098 my $adapter_name; 2099 2100 warn "Found perfect matches for the following mono-polymer sequences:\nPoly-nucleotide type\tCount\tSequence\tSequences analysed\tPercentage\n"; 2101 foreach my $adapter (sort {$adapters{$b}->{count}<=>$adapters{$a}->{count}} keys %adapters){ 2102 2103 my $percentage = sprintf("%.2f",$adapters{$adapter}->{count}/$count*100); 2104 2105 warn "$adapter\t$adapters{$adapter}->{count}\t$adapters{$adapter}->{seq}\t$count\t$percentage\n"; 2106 2107 unless (defined $highest){ 2108 $highest = $adapter; 2109 $seq = $adapters{$adapter}->{seq}; 2110 $adapter_name = $adapters{$adapter}->{name}; 2111 next; 2112 } 2113 unless (defined $second){ 2114 $second = $adapter; 2115 } 2116 } 2117 2118 2119 # using the highest occurrence as adapter to look out for 2120 if ($adapters{$highest}->{count} == $adapters{$second}->{count}){ 2121 warn "Unable to auto-detect most prominent mono-polymer from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{count})\n"; 2122 2123 if ($adapters{$highest}->{count} == 0){ 2124 warn "Defaulting to Poly-A. Specify -a SEQUENCE to avoid this behavior).\n\n"; 2125 $adapter_name = 'Poly-A (inconclusive auto-detection)'; 2126 $seq = extend_adapter_sequence ('A',20); 2127 } 2128 else{ 2129 warn "Using $highest poly-monomer for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n"; 2130 } 2131 } 2132 else{ 2133 warn "Using $highest Polymer for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n"; 2134 } 2135 2136 close AUTODETECT; 2137 2138 return ($seq,$adapter_name); 2139 2140} 2141 2142 2143########################################################################### 2144 2145sub process_commandline{ 2146 my $help; 2147 my $quality; 2148 my $adapter; 2149 my $adapter2; 2150 my $stringency; 2151 my $report; 2152 my $version; 2153 my $rrbs; 2154 my $length_cutoff; 2155 my $keep; 2156 my $fastqc; 2157 my $non_directional; 2158 my $phred33; 2159 my $phred64; 2160 my $fastqc_args; 2161 my $trim; 2162 my $gzip; 2163 my $validate; 2164 my $retain; 2165 my $length_read_1; 2166 my $length_read_2; 2167 my $error_rate; 2168 my $output_dir; 2169 my $no_report_file; 2170 my $suppress_warn; 2171 my $dont_gzip; 2172 my $clip_r1; 2173 my $clip_r2; 2174 my $three_prime_clip_r1; 2175 my $three_prime_clip_r2; 2176 my $nextera; 2177 my $small_rna; 2178 my $illumina; 2179 my $path_to_cutadapt; 2180 my $max_length; 2181 my $maxn; 2182 my $trimn; 2183 my $hardtrim5; 2184 my $hardtrim3; 2185 my $clock; 2186 my $polyA; 2187 my $nextseq; 2188 my $basename; 2189 my $cores; 2190 my $compression_path; 2191 my $consider_already_trimmed; 2192 2193 my $command_line = GetOptions ('help|man' => \$help, 2194 'q|quality=i' => \$quality, 2195 'a|adapter=s' => \$adapter, 2196 'a2|adapter2=s' => \$adapter2, 2197 'report' => \$report, 2198 'version' => \$version, 2199 'stringency=i' => \$stringency, 2200 'fastqc' => \$fastqc, 2201 'RRBS' => \$rrbs, 2202 'keep' => \$keep, 2203 'length=i' => \$length_cutoff, 2204 'non_directional' => \$non_directional, 2205 'phred33' => \$phred33, 2206 'phred64' => \$phred64, 2207 'fastqc_args=s' => \$fastqc_args, 2208 'trim1' => \$trim, 2209 'gzip' => \$gzip, 2210 'paired_end' => \$validate, 2211 'retain_unpaired' => \$retain, 2212 'length_1|r1=i' => \$length_read_1, 2213 'length_2|r2=i' => \$length_read_2, 2214 'e|error_rate=s' => \$error_rate, 2215 'o|output_dir=s' => \$output_dir, 2216 'no_report_file' => \$no_report_file, 2217 'suppress_warn' => \$suppress_warn, 2218 'dont_gzip' => \$dont_gzip, 2219 'clip_R1=i' => \$clip_r1, 2220 'clip_R2=i' => \$clip_r2, 2221 'three_prime_clip_R1=i' => \$three_prime_clip_r1, 2222 'three_prime_clip_R2=i' => \$three_prime_clip_r2, 2223 'illumina' => \$illumina, 2224 'nextera' => \$nextera, 2225 'small_rna' => \$small_rna, 2226 'path_to_cutadapt=s' => \$path_to_cutadapt, 2227 'max_length=i' => \$max_length, 2228 'max_n=i' => \$maxn, 2229 'trim-n' => \$trimn, 2230 'hardtrim5=i' => \$hardtrim5, 2231 'hardtrim3=i' => \$hardtrim3, 2232 'clock|casio|breitling' => \$clock, 2233 'polyA' => \$polyA, 2234 '2colour|nextseq=i' => \$nextseq, 2235 'basename=s' => \$basename, 2236 'j|cores=i' => \$cores, 2237 'consider_already_trimmed=i' => \$consider_already_trimmed, 2238 ); 2239 2240 ### EXIT ON ERROR if there were errors with any of the supplied options 2241 unless ($command_line){ 2242 die "Please respecify command line options\n"; 2243 } 2244 2245 ### HELPFILE 2246 if ($help){ 2247 print_helpfile(); 2248 exit; 2249 } 2250 2251 2252 2253 2254 2255 if ($version){ 2256 print << "VERSION"; 2257 2258 Quality-/Adapter-/RRBS-/Speciality-Trimming 2259 [powered by Cutadapt] 2260 version $trimmer_version 2261 2262 Last update: 24 09 2019 2263 2264VERSION 2265 exit; 2266 } 2267 2268 # testing whether the filenames contain white space. This can only ever be the case if passed within "quotes" 2269 foreach my $filename (@ARGV){ 2270 if ($filename =~ /\s+/){ 2271 die "\n[FATAL ERROR]: Input file names ('$filename') supplied with whitespace(s). Please move to directory containing the input file(s), 2272 and/or avoid using whitespace(s) at all costs (e.g. consider using '_' instead), and try again.\n\n"; 2273 } 2274 } 2275 2276 # NUMBER OF CORES 2277 if (defined $cores){ 2278 if ($cores < 1){ 2279 die "Please a use a positive integer for the number of cores to be used, and re-specify!\n\n"; 2280 } 2281 elsif($cores == 1){ 2282 warn "Proceeding with single-core trimming (user-defined)\n"; 2283 } 2284 elsif($cores >= 8){ 2285 warn "Using an excessive number of cores has a diminishing return! It is recommended not to exceed 8 cores per trimming process (you asked for $cores cores). Please consider re-specifying\n"; sleep(2); 2286 } 2287 } 2288 else { 2289 warn "Multicore support not enabled. Proceeding with single-core trimming.\n"; 2290 $compression_path = "gzip"; 2291 $cores = 1; 2292 } 2293 2294 2295 # Before we start let's have quick look if Cutadapt seems to be working with the path information provided 2296 # To change the path to Cutadapt use --path_to_cutadapt /full/path/to/the/Cutadapt/executable 2297 2298 if(defined $path_to_cutadapt){ 2299 warn "Path to Cutadapt set as: '$path_to_cutadapt' (user defined)\n"; 2300 # we'll simply use this 2301 } 2302 else{ 2303 $path_to_cutadapt = 'cutadapt'; # default, assuming it is in the PATH 2304 warn "Path to Cutadapt set as: '$path_to_cutadapt' (default)\n"; 2305 } 2306 2307 2308 my $return = system "$path_to_cutadapt --version 2>&1 > /dev/null "; 2309 if ($return == 0){ 2310 warn "Cutadapt seems to be working fine (tested command '$path_to_cutadapt --version')\n"; 2311 $cutadapt_version = `$path_to_cutadapt --version`; 2312 chomp $cutadapt_version; 2313 warn "Cutadapt version: $cutadapt_version\n"; 2314 } 2315 else{ 2316 die "Failed to execute Cutadapt porperly. Please install Cutadapt first and make sure it is in the PATH, or specify the path to the Cutadapt executable using --path_to_cutadapt /path/to/cutadapt\n\n"; 2317 } 2318 2319 2320 2321 2322 # We only need to test for pigz if the user asked for multi-core processing 2323 if ($cores > 1){ 2324 2325 ## Check Python Version 2326 # warn "Let's also find out the Python version used. $path_to_cutadapt\n"; 2327 my $location_of_cutadapt = `which $path_to_cutadapt`; 2328 #warn "Location is: $location_of_cutadapt\n"; 2329 2330 # Reading the first line of the cutadapt executable, since this tends to contain the python version it is using 2331 open (my $first_line,$location_of_cutadapt) or die "Failed to read the first line of the Cutadapt executable: $!"; 2332 my $shebang = <$first_line>; 2333 chomp $shebang; 2334 # warn "This is the first line:\n>>>$shebang<<<\n"; 2335 close $first_line; 2336 2337 # the shebang line seems to contain a path to Python 2338 if ($shebang =~ /python/i){ 2339 if ($shebang =~ /\#!/){ 2340 # warn "Found a shebangline: >>>$_<<<\n"; 2341 $shebang =~ s/\#!//; 2342 # warn "Truncated shebangline: >>>$_<<<\n"; 2343 } 2344 2345 my $python_return = `$shebang --version 2>&1`; 2346 chomp $python_return; 2347 # warn "Python return: $python_return\n"; 2348 2349 if ($python_return =~ /Python 3.*/){ 2350 warn "Cutadapt seems to be using Python 3! Proceeding with multi-core enabled Cutadapt using $cores cores\n"; 2351 } 2352 elsif ($python_return =~ /Python 2.*/){ 2353 warn "Python 2 found, multi-core not supported. Proceeding with Cutadapt in single-core mode\n"; 2354 $cores = 1; 2355 } 2356 else{ 2357 die "No Python detected. Python required to run Cutadapt!\n\n"; 2358 } 2359 2360 $python_version = $python_return; 2361 $python_version =~ s/^Python //; 2362 } 2363 else{ 2364 # the shebang line doesn't seem to contain a path to Python. Instead, someone edited the Cutadapt executable to look differently. 2365 # An example for this is the latest version of Miniconda: 2366 ### #!/bin/sh 2367 ### '''exec' /long/path/to/conda/envs/deepsv/bin/python "$0" "$@" 2368 ### ' ''' 2369 ### # -*- coding: utf-8 -*- 2370 ### ... 2371 2372 ### interestingly, Anacondo does not seem to do this 2373 warn "Could not detect version of Python used by Cutadapt from the first line of Cutadapt (but found this: >>>$shebang<<<)\n"; 2374 $python_version = 'could not detect'; 2375 warn "Letting the (modified) Cutadapt deal with the Python version instead\n"; 2376 } 2377 2378 ### only proceeding if $cores is still > 1, i.e. if Python 3 was found 2379 if ($cores > 1){ 2380 2381 ### Test if pigz is installed 2382 my $pigz_return = system ("pigz --version 2> /dev/null"); 2383 # warn "PIGZ returned: $pigz_return\n"; 2384 if ($pigz_return == 0) { 2385 warn "Parallel gzip (pigz) detected. Proceeding with multicore (de)compression using $cores cores\n\n"; 2386 $compression_path = "pigz -p $cores"; 2387 } 2388 else { 2389 warn "Proceeding with 'gzip' for compression. PLEASE NOTE: Using multi-cores for trimming with 'gzip' only has only very limited effect! (see here: https://github.com/FelixKrueger/TrimGalore/issues/16#issuecomment-458557103)\n"; 2390 warn "To increase performance, please install 'pigz' and run again\n\n"; 2391 $compression_path = "gzip"; 2392 } 2393 } 2394 } 2395 else{ 2396 warn "single-core operation.\n" 2397 } 2398 unless (defined $compression_path){ 2399 $compression_path = "gzip"; # fall-back option 2400 } 2401 2402 ### SUPRESS WARNINGS 2403 if (defined $suppress_warn){ 2404 $DOWARN = 0; 2405 } 2406 2407 ### QUALITY SCORES 2408 my $phred_encoding; 2409 if ($phred33){ 2410 if ($phred64){ 2411 die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n"; 2412 } 2413 $phred_encoding = 33; 2414 } 2415 elsif ($phred64){ 2416 $phred_encoding = 64; 2417 } 2418 unless ($phred33 or $phred64){ 2419 unless (defined $hardtrim5 or $hardtrim3 or $clock){ # we don't need warnings if we simply hard-trim or Clock-trim a file 2420 warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n"; 2421 } 2422 $phred_encoding = 33; 2423 } 2424 2425 ### ILLUMINA 2-COLOUR CHEMISTRY SPEICIFC HIGH QUALITY G-TRIMMING 2426 if (defined $nextseq){ 2427 if (defined $quality){ 2428 die "The options '-quality INT' and '--nextseq INT' are mutually exclusive. Please decide which trimming mode you would like to apply!\n\n"; 2429 } 2430 # warn "OK, let's deal with 2-colour issues\n\n"; 2431 unless ( ($nextseq > 0) and $nextseq < 200){ 2432 die "Please select a sensible value for 2-colour specific trimming (currently allowed range is between 1 and 200 Gs). Please respecify!\n"; 2433 } 2434 $nextseq = "--nextseq-trim=$nextseq"; 2435 } 2436 else{ 2437 $nextseq = ''; 2438 } 2439 2440 ### NON-DIRECTIONAL RRBS 2441 if ($non_directional){ 2442 unless ($rrbs){ 2443 die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n"; 2444 } 2445 } 2446 else{ 2447 $non_directional = 0; 2448 } 2449 2450 if ($fastqc_args){ 2451 $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed 2452 } 2453 else{ 2454 $fastqc_args = 0; 2455 } 2456 2457 ### CUSTOM ERROR RATE 2458 if (defined $error_rate){ 2459 # make sure that the error rate is between 0 and 1 2460 unless ($error_rate >= 0 and $error_rate <= 1){ 2461 die "Please specify an error rate between 0 and 1 (the default is 0.1)\n"; 2462 } 2463 } 2464 else{ 2465 $error_rate = 0.1; # (default) 2466 } 2467 2468 2469 if ($nextera and $small_rna or $nextera and $illumina or $illumina and $small_rna ){ 2470 die "You can't use several different adapter types at the same time. Make your choice or consider using -a and -a2\n\n"; 2471 } 2472 2473 ### ENSURE USERS ARE USING THE POLY-A AUTODETECTION, OR SPECIFY SEQUENCES FOR BOTH -a AND -a2 2474 if ($polyA){ 2475 if (defined $adapter){ 2476 if ($validate){# paired-end 2477 unless (defined $adapter2){ 2478 die "Please use either the PolyA auto-detection (defaults to -a \"A{20}\" -a2 \"T{150}\" (or the other way round)), or specify both -a and -a2. Now try again...\n\n"; 2479 } 2480 } 2481 } 2482 if (defined $adapter2){ 2483 if ($validate){# paired-end 2484 unless (defined $adapter){ 2485 die "Please use either the PolyA auto-detection (defaults to -a \"A{20}\" -a2 \"T{150}\" (or the other way round)), or specify both -a and -a2. Now try again...\n\n"; 2486 } 2487 } 2488 } 2489 } 2490 2491 2492 if (defined $adapter){ 2493 2494 # The adapter may be given as a single base that occurs a number of times 2495 # in the form BASE{number of times}, e.g. "-a A{10}" 2496 if ($adapter =~ /^([ACTGN]){(\d+)}$/){ 2497 # warn "Base: $1\n# repeats: $2\n"; 2498 2499 2500 my $tmp_adapter = extend_adapter_sequence(uc$1,$2); 2501 warn "Adapter sequence given as >$adapter< expanded to: >$tmp_adapter<\n"; 2502 $adapter = $tmp_adapter; 2503 } 2504 else{ 2505 unless ($adapter =~ /^[ACTGNXactgnx]+$/){ 2506 die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n"; 2507 } 2508 } 2509 $adapter = uc$adapter; 2510 2511 if ($illumina){ 2512 die "You can't supply an adapter sequence AND use the Illumina universal adapter sequence. Make your choice.\n\n"; 2513 } 2514 if ($nextera){ 2515 die "You can't supply an adapter sequence AND use the Nextera transposase adapter sequence. Make your choice.\n\n"; 2516 } 2517 if ($small_rna){ 2518 die "You can't supply an adapter sequence AND use the Illumina small RNA adapter sequence. Make your choice.\n\n"; 2519 } 2520 } 2521 2522 if (defined $adapter2){ 2523 unless ($validate){ 2524 die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n"; 2525 } 2526 2527 # The adapter may be given as a single base that occurs a number of times 2528 # in the form BASE{number of times}, e.g. "-a2 A{10}" 2529 if ($adapter2 =~ /^([ACTGN]){(\d+)}$/){ 2530 # warn "Base: $1\n# repeats: $2\n"; 2531 my $tmp_adapter2 = extend_adapter_sequence(uc$1,$2); 2532 2533 warn "Adapter2 sequence given as >$adapter2< expanded to: >$tmp_adapter2<\n"; 2534 $adapter2 = $tmp_adapter2; 2535 } 2536 else{ 2537 unless ($adapter2 =~ /^[ACTGNactgn]+$/){ 2538 die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n"; 2539 } 2540 } 2541 $adapter2 = uc$adapter2; 2542 } 2543 2544 ### LENGTH CUTOFF 2545 # this gets set right at the start after the adapter auto-detection has been concluded 2546 2547 ### MAXIMUM LENGTH CUTOFF - this is intended for smallRNA-libraries to remove sequences that are longer than a certain cutoff and thus problably not small RNA species 2548 if (defined $max_length){ 2549 if ($validate){ 2550 die "Maximum length filtering works currently only in single-end mode (which is more sensible for smallRNA-sequencing anyway...)\n\n"; 2551 } 2552 warn "Maximum length cutoff set to >> $max_length bp <<; sequences longer than this threshold will be removed (only advised for smallRNA-trimming!)\n\n"; 2553 } 2554 else{ 2555 $max_length = 0; 2556 } 2557 2558 ### files are supposed to be paired-end files 2559 if ($validate){ 2560 2561 # making sure that an even number of reads has been supplied 2562 unless ((scalar@ARGV)%2 == 0){ 2563 die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n"; 2564 } 2565 2566 ### Ensuring pairs of R1 and R2 are not the very same file 2567 my $index = 0; 2568 while ($index <= $#ARGV){ 2569 # warn "File 1: $ARGV[$index]\n"; 2570 # warn "File 2: $ARGV[$index+1]\n~~~~~~~~~~\n\n"; sleep(1); 2571 if ($ARGV[$index] eq $ARGV[$index+1]){ 2572 die "[FATAL:] Read 1 ($ARGV[$index]) and Read 2 ($ARGV[$index+1]) files appear to be the very same file. This probably happened inadvertently, so please re-specify!\nExiting...\n\n"; 2573 } 2574 $index += 2; 2575 } 2576 2577 ## CUTOFF FOR VALIDATED READ-PAIRS 2578 if (defined $length_read_1 or defined $length_read_2){ 2579 2580 unless ($retain){ 2581 die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n"; 2582 } 2583 2584 if (defined $length_read_1){ 2585 unless ($length_read_1 >= 15 and $length_read_1 <= 100){ 2586 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; 2587 } 2588 unless ($length_read_1 > $length_cutoff){ 2589 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n"; 2590 } 2591 } 2592 2593 if (defined $length_read_2){ 2594 unless ($length_read_2 >= 15 and $length_read_2 <= 100){ 2595 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; 2596 } 2597 unless ($length_read_2 > $length_cutoff){ 2598 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n"; 2599 } 2600 } 2601 } 2602 2603 if ($retain){ 2604 $length_read_1 = 35 unless (defined $length_read_1); 2605 $length_read_2 = 35 unless (defined $length_read_2); 2606 } 2607 } 2608 2609 2610 unless ($no_report_file){ 2611 $no_report_file = 0; 2612 } 2613 2614 ### PARENT DIRECTORY 2615 my $parent_dir = getcwd(); 2616 unless ($parent_dir =~ /\/$/){ 2617 $parent_dir =~ s/$/\//; 2618 } 2619 2620 ### OUTPUT DIR PATH 2621 if (defined $output_dir){ 2622 2623 if ($output_dir =~ /\s+/){ 2624 die "\n[FATAL OPTION]: Output directory name (>>$output_dir<<) contained whitespace(s). Please replace whitespace(s) with '_' and try again.\n\n"; 2625 } 2626 2627 unless ($output_dir eq ''){ 2628 unless ($output_dir =~ /\/$/){ 2629 $output_dir =~ s/$/\//; 2630 } 2631 2632 if (chdir $output_dir){ 2633 $output_dir = getcwd(); # making the path absolute 2634 unless ($output_dir =~ /\/$/){ 2635 $output_dir =~ s/$/\//; 2636 } 2637 } 2638 else{ 2639 mkdir $output_dir or die "Unable to create directory $output_dir $!\n"; 2640 warn "Output directory $output_dir doesn't exist, creating it for you...\n\n"; sleep(1); 2641 chdir $output_dir or die "Failed to move to $output_dir\n"; 2642 $output_dir = getcwd(); # making the path absolute 2643 unless ($output_dir =~ /\/$/){ 2644 $output_dir =~ s/$/\//; 2645 } 2646 } 2647 warn "Output will be written into the directory: $output_dir\n"; 2648 } 2649 } 2650 else{ 2651 $output_dir = ''; 2652 } 2653 # Changing back to parent directory 2654 chdir $parent_dir or die "Failed to move to $parent_dir\n"; 2655 2656 2657 ### Trimming at the 5' end 2658 if (defined $clip_r2){ # trimming 5' bases of read 2 2659 die "Clipping the 5' end of read 2 is only allowed for paired-end files (--paired)\n" unless ($validate); 2660 } 2661 2662 if (defined $clip_r1){ # trimming 5' bases of read 1 2663 unless ($clip_r1 > 0 and $clip_r1 < 100){ 2664 die "The 5' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n"; 2665 } 2666 } 2667 2668 if (defined $clip_r2){ # trimming 5' bases of read 2 2669 unless ($clip_r2 > 0 and $clip_r2 < 100){ 2670 die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n"; 2671 } 2672 } 2673 2674 ### Trimming at the 3' end 2675 if (defined $three_prime_clip_r1){ # trimming 3' bases of read 1 2676 unless ($three_prime_clip_r1 > 0 and $three_prime_clip_r1 < 100){ 2677 die "The 3' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n"; 2678 } 2679 } 2680 2681 if (defined $three_prime_clip_r2){ # trimming 3' bases of read 2 2682 unless ($three_prime_clip_r2 > 0 and $three_prime_clip_r2 < 100){ 2683 die "The 3' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n"; 2684 } 2685 } 2686 2687 2688 if (defined $maxn){ 2689 unless ($maxn >= 0 and $maxn <= 100){ 2690 die "--max_n needs to be an integer between 0 and 100\nPlease respecify...\n\n"; 2691 } 2692 } 2693 2694 my $trim_n; 2695 if ($trimn){ 2696 $trim_n = '--trim-n'; 2697 } 2698 else{ 2699 $trim_n = ''; 2700 } 2701 2702 ### RRBS 2703 if ($rrbs){ 2704 unless ($non_directional){ # only setting this for directional mode 2705 if ($validate){ 2706 unless (defined $clip_r2){ # user specified R2 clipping overrides the default setting of --clip_r2 2 # added 07 Dec 2016 2707 warn "Setting the option '--clip_r2 2' (to remove methylation bias from the start of Read 2)\n"; # sleep(1); 2708 $clip_r2 = 2; 2709 } 2710 } 2711 } 2712 } 2713 else{ 2714 $rrbs = 0; 2715 } 2716 2717 if (defined $hardtrim5){ 2718 unless ($hardtrim5 > 0 and $hardtrim5 < 1000){ 2719 die "Hard-trim from 3'-end selected: >$hardtrim5< bp. Please ensure a hard-trimming range between 1 and 999 bp. Please try again...\n\n~~~~~~~~~~~~~~~~~~~~~~\n" 2720 } 2721 } 2722 if (defined $hardtrim3){ 2723 unless ($hardtrim3 > 0 and $hardtrim3 < 1000){ 2724 die "Hard-trim from 5'-end selected: >$hardtrim3< bp. Please ensure a hard-trimming range between 1 and 999 bp. Please try again...\n\n~~~~~~~~~~~~~~~~~~~~~~\n" 2725 } 2726 } 2727 2728 ### The 2729 if ($clock){ 2730 if ($validate){ # already selected as paired-end mode 2731 # Fine. Currently, the Clock protocol requires paired-end reads with dual UMIs 2732 } 2733 else{ 2734 die "\nOption --clock selected, but the processing is still set to single-end mode. The current clock protocol requires paired-end sequencing with dual unique molecular identifiers (UMIs) though. Please respecify...\n\n"; 2735 } 2736 } 2737 2738 if (defined $basename){ 2739 warn "Using user-specified basename (>>$basename<<) instead of deriving the filename from the input file(s)\n"; 2740 2741 if ($basename =~ /\//){ 2742 die "Please make sure the name specified with --basename does not contain file path information! ($basename)"; 2743 } 2744 $basename =~ s/[ !%\$\*&£]/_/g; # replacing weird symbols or spaces 2745 2746 if (scalar @ARGV > 2){ 2747 die "[FATAL ERROR]: Number of files supplied can be 1 (single-end mode), or 2 (paired-end mode), but was: ",scalar @ARGV,". Please respecify!\n\n"; 2748 } 2749 else{ 2750 if (scalar @ARGV == 2){ 2751 if ($validate){ 2752 # fine, this is paired-end 2753 } 2754 else{ 2755 die "[FATAL ERROR]: Number of files supplied was 2, but single-end mode was selected as well. Please respecify!\n\n"; 2756 } 2757 } 2758 } 2759 } 2760 2761 if (defined $consider_already_trimmed){ 2762 # making sure the range is sensible 2763 unless ($consider_already_trimmed >= 0 and $consider_already_trimmed <= 10000){ 2764 die "Please select a threshold for when a sample should not be adapter trimmed at all (--consider_already_trimmed) in the range of [0-1000] (inclusive)\n"; 2765 } 2766 if ($nextera or $small_rna or $illumina){ 2767 die "The threshold for --consider_already_trimmed [INT] only works in conjunction with adapter auto-detection. Make your choice and try again\n\n"; 2768 } 2769 warn "During the adapter auto-detection, any counts equal to or lower than >$consider_already_trimmed< will be considered as 'file was already adapter-trimmed'. Only quality trimming will be carried out (setting -a X)\n"; sleep(1); 2770 } 2771 2772 return ($compression_path,$cores,$quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2,$nextera,$small_rna,$path_to_cutadapt,$illumina,$max_length,$maxn,$trim_n,$hardtrim5,$clock,$polyA,$hardtrim3,$nextseq,$basename,$consider_already_trimmed); 2773} 2774 2775sub Ncounter{ 2776 my $seq = shift; 2777 my $ncount = 0; 2778 while($seq =~ /N/g){ 2779 ++$ncount; 2780 } 2781 return $ncount; 2782} 2783 2784sub extend_adapter_sequence{ 2785 my ($letter,$number) = @_; 2786 return (${letter}x$number); 2787} 2788 2789 2790 2791sub print_helpfile{ 2792 print << "HELP"; 2793 2794 USAGE: 2795 2796trim_galore [options] <filename(s)> 2797 2798 2799-h/--help Print this help message and exits. 2800 2801-v/--version Print the version information and exits. 2802 2803-q/--quality <INT> Trim low-quality ends from reads in addition to adapter removal. For 2804 RRBS samples, quality trimming will be performed first, and adapter 2805 trimming is carried in a second round. Other files are quality and adapter 2806 trimmed in a single pass. The algorithm is the same as the one used by BWA 2807 (Subtract INT from all qualities; compute partial sums from all indices 2808 to the end of the sequence; cut sequence at the index at which the sum is 2809 minimal). Default Phred score: 20. 2810 2811--phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores 2812 (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON. 2813 2814--phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores 2815 (Illumina 1.5 encoding) for quality trimming. 2816 2817--fastqc Run FastQC in the default mode on the FastQ file once trimming is complete. 2818 2819--fastqc_args "<ARGS>" Passes extra arguments to FastQC. If more than one argument is to be passed 2820 to FastQC they must be in the form "arg1 arg2 etc.". An example would be: 2821 --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will 2822 automatically invoke FastQC, so --fastqc does not have to be specified 2823 separately. 2824 2825-a/--adapter <STRING> Adapter sequence to be trimmed. If not specified explicitly, Trim Galore will 2826 try to auto-detect whether the Illumina universal, Nextera transposase or Illumina 2827 small RNA adapter sequence was used. Also see '--illumina', '--nextera' and 2828 '--small_rna'. If no adapter can be detected within the first 1 million sequences 2829 of the first file specified or if there is a tie between several adapter sequences, 2830 Trim Galore defaults to '--illumina' (as long as the Illumina adapter was one of the 2831 options, else '--nextera' is the default). A single base 2832 may also be given as e.g. -a A{10}, to be expanded to -a AAAAAAAAAA. 2833 2834-a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This 2835 option requires '--paired' to be specified as well. If the libraries to be trimmed 2836 are smallRNA then a2 will be set to the Illumina small RNA 5' adapter automatically 2837 (GATCGTCGGACT). A single base may also be given as e.g. -a2 A{10}, to be expanded 2838 to -a2 AAAAAAAAAA. 2839 2840--illumina Adapter sequence to be trimmed is the first 13bp of the Illumina universal adapter 2841 'AGATCGGAAGAGC' instead of the default auto-detection of adapter sequence. 2842 2843--nextera Adapter sequence to be trimmed is the first 12bp of the Nextera adapter 2844 'CTGTCTCTTATA' instead of the default auto-detection of adapter sequence. 2845 2846--small_rna Adapter sequence to be trimmed is the first 12bp of the Illumina Small RNA 3' Adapter 2847 'TGGAATTCTCGG' instead of the default auto-detection of adapter sequence. Selecting 2848 to trim smallRNA adapters will also lower the --length value to 18bp. If the smallRNA 2849 libraries are paired-end then a2 will be set to the Illumina small RNA 5' adapter 2850 automatically (GATCGTCGGACT) unless -a 2 had been defined explicitly. 2851 2852--consider_already_trimmed <INT> During adapter auto-detection, the limit set by <INT> allows the user to 2853 set a threshold up to which the file is considered already adapter-trimmed. If no adapter 2854 sequence exceeds this threshold, no additional adapter trimming will be performed (technically, 2855 the adapter is set to '-a X'). Quality trimming is still performed as usual. 2856 Default: NOT SELECTED (i.e. normal auto-detection precedence rules apply). 2857 2858--max_length <INT> Discard reads that are longer than <INT> bp after trimming. This is only advised for 2859 smallRNA sequencing to remove non-small RNA sequences. 2860 2861 2862--stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a 2863 very stringent setting of 1, i.e. even a single bp of overlapping sequence 2864 will be trimmed off from the 3' end of any read. 2865 2866-e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching 2867 region) (default: 0.1) 2868 2869--gzip Compress the output file with GZIP. If the input files are GZIP-compressed 2870 the output files will automatically be GZIP compressed as well. As of v0.2.8 the 2871 compression will take place on the fly. 2872 2873--dont_gzip Output files won't be compressed with GZIP. This option overrides --gzip. 2874 2875--length <INT> Discard reads that became shorter than length INT because of either 2876 quality or adapter trimming. A value of '0' effectively disables 2877 this behaviour. Default: 20 bp. 2878 2879 For paired-end files, both reads of a read-pair need to be longer than 2880 <INT> bp to be printed out to validated paired-end files (see option --paired). 2881 If only one read became too short there is the possibility of keeping such 2882 unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp. 2883 2884--max_n COUNT The total number of Ns (as integer) a read may contain before it will be removed altogether. 2885 In a paired-end setting, either read exceeding this limit will result in the entire 2886 pair being removed from the trimmed output files. 2887 2888--trim-n Removes Ns from either side of the read. This option does currently not work in RRBS mode. 2889 2890-o/--output_dir <DIR> If specified all output will be written to this directory instead of the current 2891 directory. If the directory doesn't exist it will be created for you. 2892 2893--no_report_file If specified no report file will be generated. 2894 2895--suppress_warn If specified any output to STDOUT or STDERR will be suppressed. 2896 2897--clip_R1 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 1 (or single-end 2898 reads). This may be useful if the qualities were very poor, or if there is some 2899 sort of unwanted bias at the 5' end. Default: OFF. 2900 2901--clip_R2 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 2 (paired-end reads 2902 only). This may be useful if the qualities were very poor, or if there is some sort 2903 of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove 2904 the first few bp because the end-repair reaction may introduce a bias towards low 2905 methylation. Please refer to the M-bias plot section in the Bismark User Guide for 2906 some examples. Default: OFF. 2907 2908--three_prime_clip_R1 <int> Instructs Trim Galore to remove <int> bp from the 3' end of read 1 (or single-end 2909 reads) AFTER adapter/quality trimming has been performed. This may remove some unwanted 2910 bias from the 3' end that is not directly related to adapter sequence or basecall quality. 2911 Default: OFF. 2912 2913--three_prime_clip_R2 <int> Instructs Trim Galore to remove <int> bp from the 3' end of read 2 AFTER 2914 adapter/quality trimming has been performed. This may remove some unwanted bias from 2915 the 3' end that is not directly related to adapter sequence or basecall quality. 2916 Default: OFF. 2917 2918--2colour/--nextseq INT This enables the option '--nextseq-trim=3'CUTOFF' within Cutadapt, which will set a quality 2919 cutoff (that is normally given with -q instead), but qualities of G bases are ignored. 2920 This trimming is in common for the NextSeq- and NovaSeq-platforms, where basecalls without 2921 any signal are called as high-quality G bases. This is mutually exlusive with '-q INT'. 2922 2923 2924--path_to_cutadapt </path/to/cutadapt> You may use this option to specify a path to the Cutadapt executable, 2925 e.g. /my/home/cutadapt-1.7.1/bin/cutadapt. Else it is assumed that Cutadapt is in 2926 the PATH. 2927 2928--basename <PREFERRED_NAME> Use PREFERRED_NAME as the basename for output files, instead of deriving the filenames from 2929 the input files. Single-end data would be called PREFERRED_NAME_trimmed.fq(.gz), or 2930 PREFERRED_NAME_val_1.fq(.gz) and PREFERRED_NAME_val_2.fq(.gz) for paired-end data. --basename 2931 only works when 1 file (single-end) or 2 files (paired-end) are specified, but not for longer lists. 2932 2933-j/--cores INT Number of cores to be used for trimming [default: 1]. For Cutadapt to work with multiple cores, it 2934 requires Python 3 as well as parallel gzip (pigz) installed on the system. The version of Python used 2935 is detected from the shebang line of the Cutadapt executable (either 'cutadapt', or a specified path). 2936 If Python 2 is detected, --cores is set to 1. 2937 If pigz cannot be detected on your system, Trim Galore reverts to using gzip compression. Please note 2938 that gzip compression will slow down multi-core processes so much that it is hardly worthwhile, please 2939 see: https://github.com/FelixKrueger/TrimGalore/issues/16#issuecomment-458557103 for more info). 2940 2941 Actual core usage: It should be mentioned that the actual number of cores used is a little convoluted. 2942 Assuming that Python 3 is used and pigz is installed, --cores 2 would use 2 cores to read the input 2943 (probably not at a high usage though), 2 cores to write to the output (at moderately high usage), and 2944 2 cores for Cutadapt itself + 2 additional cores for Cutadapt (not sure what they are used for) + 1 core 2945 for Trim Galore itself. So this can be up to 9 cores, even though most of them won't be used at 100% for 2946 most of the time. Paired-end processing uses twice as many cores for the validation (= writing out) step. 2947 --cores 4 would then be: 4 (read) + 4 (write) + 4 (Cutadapt) + 2 (extra Cutadapt) + 1 (Trim Galore) = 15. 2948 2949 It seems that --cores 4 could be a sweet spot, anything above has diminishing returns. 2950 2951 2952 2953SPECIFIC TRIMMING - without adapter/quality trimming 2954 2955--hardtrim5 <int> Instead of performing adapter-/quality trimming, this option will simply hard-trim sequences 2956 to <int> bp at the 5'-end. Once hard-trimming of files is complete, Trim Galore will exit. 2957 Hard-trimmed output files will end in .<int>_5prime.fq(.gz). Here is an example: 2958 2959 before: CCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAAT 2960 --hardtrim5 20: CCTAAGGAAACAAGTACACT 2961 2962--hardtrim3 <int> Instead of performing adapter-/quality trimming, this option will simply hard-trim sequences 2963 to <int> bp at the 3'-end. Once hard-trimming of files is complete, Trim Galore will exit. 2964 Hard-trimmed output files will end in .<int>_3prime.fq(.gz). Here is an example: 2965 2966 before: CCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAAT 2967 --hardtrim3 20: TTTTTAAGAAAATGGAAAAT 2968 2969--clock In this mode, reads are trimmed in a specific way that is currently used for the Mouse 2970 Epigenetic Clock (see here: Multi-tissue DNA methylation age predictor in mouse, Stubbs et al., 2971 Genome Biology, 2017 18:68 https://doi.org/10.1186/s13059-017-1203-5). Following this, Trim Galore 2972 will exit. 2973 2974 In it's current implementation, the dual-UMI RRBS reads come in the following format: 2975 2976 Read 1 5' UUUUUUUU CAGTA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF TACTG UUUUUUUU 3' 2977 Read 2 3' UUUUUUUU GTCAT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF ATGAC UUUUUUUU 5' 2978 2979 Where UUUUUUUU is a random 8-mer unique molecular identifier (UMI), CAGTA is a constant region, 2980 and FFFFFFF... is the actual RRBS-Fragment to be sequenced. The UMIs for Read 1 (R1) and 2981 Read 2 (R2), as well as the fixed sequences (F1 or F2), are written into the read ID and 2982 removed from the actual sequence. Here is an example: 2983 2984 R1: \@HWI-D00436:407:CCAETANXX:1:1101:4105:1905 1:N:0: CGATGTTT 2985 ATCTAGTTCAGTACGGTGTTTTCGAATTAGAAAAATATGTATAGAGGAAATAGATATAAAGGCGTATTCGTTATTG 2986 R2: \@HWI-D00436:407:CCAETANXX:1:1101:4105:1905 3:N:0: CGATGTTT 2987 CAATTTTGCAGTACAAAAATAATACCTCCTCTATTTATCCAAAATCACAAAAAACCACCCACTTAACTTTCCCTAA 2988 2989 R1: \@HWI-D00436:407:CCAETANXX:1:1101:4105:1905 1:N:0: CGATGTTT:R1:ATCTAGTT:R2:CAATTTTG:F1:CAGT:F2:CAGT 2990 CGGTGTTTTCGAATTAGAAAAATATGTATAGAGGAAATAGATATAAAGGCGTATTCGTTATTG 2991 R2: \@HWI-D00436:407:CCAETANXX:1:1101:4105:1905 3:N:0: CGATGTTT:R1:ATCTAGTT:R2:CAATTTTG:F1:CAGT:F2:CAGT 2992 CAAAAATAATACCTCCTCTATTTATCCAAAATCACAAAAAACCACCCACTTAACTTTCCCTAA 2993 2994 Following clock trimming, the resulting files (.clock_UMI.R1.fq(.gz) and .clock_UMI.R2.fq(.gz)) 2995 should be adapter- and quality trimmed with Trim Galore as usual. In addition, reads need to be trimmed 2996 by 15bp from their 3' end to get rid of potential UMI and fixed sequences. The command is: 2997 2998 trim_galore --paired --three_prime_clip_R1 15 --three_prime_clip_R2 15 *.clock_UMI.R1.fq.gz *.clock_UMI.R2.fq.gz 2999 3000 Following this, reads should be aligned with Bismark and deduplicated with UmiBam 3001 in '--dual_index' mode (see here: https://github.com/FelixKrueger/Umi-Grinder). UmiBam recognises 3002 the UMIs within this pattern: R1:(ATCTAGTT):R2:(CAATTTTG): as (UMI R1) and (UMI R2). 3003 3004--polyA This is a new, still experimental, trimming mode to identify and remove poly-A tails from sequences. 3005 When --polyA is selected, Trim Galore attempts to identify from the first supplied sample whether 3006 sequences contain more often a stretch of either 'AAAAAAAAAA' or 'TTTTTTTTTT'. This determines 3007 if Read 1 of a paired-end end file, or single-end files, are trimmed for PolyA or PolyT. In case of 3008 paired-end sequencing, Read2 is trimmed for the complementary base from the start of the reads. The 3009 auto-detection uses a default of A{20} for Read1 (3'-end trimming) and T{150} for Read2 (5'-end trimming). 3010 These values may be changed manually using the options -a and -a2. 3011 3012 In addition to trimming the sequences, white spaces are replaced with _ and it records in the read ID 3013 how many bases were trimmed so it can later be used to identify PolyA trimmed sequences. This is currently done 3014 by writing tags to both the start ("32:A:") and end ("_PolyA:32") of the reads in the following example: 3015 3016 \@READ-ID:1:1102:22039:36996 1:N:0:CCTAATCC 3017 GCCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAATAAAAACTTTATAAACACCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 3018 3019 \@32:A:READ-ID:1:1102:22039:36996_1:N:0:CCTAATCC_PolyA:32 3020 GCCTAAGGAAACAAGTACACTCCACACATGCATAAAGGAAATCAAATGTTATTTTTAAGAAAATGGAAAATAAAAACTTTATAAACACC 3021 3022 PLEASE NOTE: The poly-A trimming mode expects that sequences were both adapter and quality trimmed 3023 before looking for Poly-A tails, and it is the user's responsibility to carry out an initial round of 3024 trimming. The following sequence: 3025 3026 1) trim_galore file.fastq.gz 3027 2) trim_galore --polyA file_trimmed.fq.gz 3028 3) zcat file_trimmed_trimmed.fq.gz | grep -A 3 PolyA | grep -v ^-- > PolyA_trimmed.fastq 3029 3030 Will 1) trim qualities and Illumina adapter contamination, 2) find and remove PolyA contamination. 3031 Finally, if desired, 3) will specifically find PolyA trimmed sequences to a specific FastQ file of your choice. 3032 3033 3034RRBS-specific options (MspI digested material): 3035 3036--rrbs Specifies that the input file was an MspI digested RRBS sample (recognition 3037 site: CCGG). Single-end or Read 1 sequences (paired-end) which were adapter-trimmed 3038 will have a further 2 bp removed from their 3' end. Sequences which were merely 3039 trimmed because of poor quality will not be shortened further. Read 2 of paired-end 3040 libraries will in addition have the first 2 bp removed from the 5' end (by setting 3041 '--clip_r2 2'). This is to avoid using artificial methylation calls from the filled-in 3042 cytosine positions close to the 3' MspI site in sequenced fragments. 3043 This option is not recommended for users of the NuGEN ovation RRBS System 1-16 3044 kit (see below). 3045 3046--non_directional Selecting this option for non-directional RRBS libraries will screen 3047 quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read 3048 and, if found, removes the first two basepairs. Like with the option 3049 '--rrbs' this avoids using cytosine positions that were filled-in 3050 during the end-repair step. '--non_directional' requires '--rrbs' to 3051 be specified as well. Note that this option does not set '--clip_r2 2' in 3052 paired-end mode. 3053 3054--keep Keep the quality trimmed intermediate file. Default: off, which means 3055 the temporary file is being deleted after adapter trimming. Only has 3056 an effect for RRBS samples since other FastQ files are not trimmed 3057 for poor qualities separately. 3058 3059 3060Note for RRBS using the NuGEN Ovation RRBS System 1-16 kit: 3061 3062Owing to the fact that the NuGEN Ovation kit attaches a varying number of nucleotides (0-3) after each MspI 3063site Trim Galore should be run WITHOUT the option --rrbs. This trimming is accomplished in a subsequent 3064diversity trimming step afterwards (see their manual). 3065 3066 3067 3068Note for RRBS using MseI: 3069 3070If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary 3071to specify --rrbs or --non_directional since virtually all reads should start with the sequence 3072'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA' 3073restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply 3074run Trim Galore! in the standard (i.e. non-RRBS) mode. 3075 3076 3077 3078 3079Paired-end specific options: 3080 3081--paired This option performs length trimming of quality/adapter/RRBS trimmed reads for 3082 paired-end files. To pass the validation test, both sequences of a sequence pair 3083 are required to have a certain minimum length which is governed by the option 3084 --length (see above). If only one read passes this length threshold the 3085 other read can be rescued (see option --retain_unpaired). Using this option lets 3086 you discard too short read pairs without disturbing the sequence-by-sequence order 3087 of FastQ files which is required by many aligners. 3088 3089 Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g. 3090 file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... . 3091 3092-t/--trim1 Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that 3093 are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards 3094 alignments like this: 3095 3096 R1 ---------------------------> or this: -----------------------> R1 3097 R2 <--------------------------- <----------------- R2 3098 3099 as invalid (whenever a start/end coordinate is contained within the other read). 3100 NOTE: If you are planning to use Bowtie2, BWA etc. you don't need to specify this option. 3101 3102--retain_unpaired If only one of the two paired-end reads became too short, the longer 3103 read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq' 3104 output files. The length cutoff for unpaired single-end reads is 3105 governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF. 3106 3107-r1/--length_1 <INT> Unpaired single-end read length cutoff needed for read 1 to be written to 3108 '.unpaired_1.fq' output file. These reads may be mapped in single-end mode. 3109 Default: 35 bp. 3110 3111-r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to 3112 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode. 3113 Default: 35 bp. 3114 3115Last modified on 07 November 2019. 3116 3117HELP 3118 exit; 3119} 3120