1#! @PERL@ -w 2# 3# -------------------------------------------------------------------- 4# EukHighConfidenceFilter v1.0 5# 6# Annotate eukaryotic tRNA high confidence set from tRNAscan-SE 2.0 predictions 7# 8# Copyright (C) 2017 Patricia Chan and Todd Lowe 9# 10# Baskin School of Engineering, University of California, Santa Cruz 11# lowe@soe.ucsc.edu 12# http://lowelab.ucsc.edu/ 13# -------------------------------------------------------------------- 14 15use strict; 16use lib "@libdir@/tRNAscan-SE"; 17use Getopt::Long; 18 19use tRNAscanSE::tRNA; 20use tRNAscanSE::ArraytRNA; 21 22our @isotypes = ('Ala', 'Gly', 'Pro', 'Thr', 'Val', 23 'Ser', 'Arg', 'Leu', 24 'Phe','Asn', 'Lys', 'Asp', 'Glu', 'His', 'Gln', 25 'Ile', 'Met', 'Tyr', 'Sup', 'Cys', 'Trp', 'SeC'); 26 27our %ac_list = ( 28 'Ala' => [qw/AGC GGC CGC TGC/], 29 'Gly' => [qw/ACC GCC CCC TCC/], 30 'Pro' => [qw/AGG GGG CGG TGG/], 31 'Thr' => [qw/AGT GGT CGT TGT/], 32 'Val' => [qw/AAC GAC CAC TAC/], 33 34 'Ser' => [qw/AGA GGA CGA TGA ACT GCT    /], 35 'Arg' => [qw/ACG GCG CCG TCG     CCT TCT/], 36 'Leu' => [qw/AAG GAG CAG TAG     CAA TAA/], 37 38 'Phe' => [qw/AAA GAA    /], 39 40 'Asn' => [qw/ATT GTT    /], 41 'Lys' => [qw/    CTT TTT/], 42 43 'Asp' => [qw/ATC GTC     /], 44 'Glu' => [qw/    CTC TTC/], 45 46 'His' => [qw/ATG GTG     /], 47 'Gln' => [qw/    CTG TTG/], 48 49 'Tyr' => [qw/ATA GTA     /], 50 'Sup' => [qw/    CTA TTA/], 51 52 'Ile' => [qw/AAT GAT   TAT/], 53 'Met' => [qw/    CAT  /], 54 55 'Cys' => [qw/ACA GCA     /], 56 'Trp' => [qw/    CCA  /], 57 'SeC' => [qw/      TCA/] 58 ); 59 60our %aa_list = ( 61 'AGC'=>'Ala', 'GGC'=>'Ala', 'CGC'=>'Ala', 'TGC'=>'Ala', 62 'ACC'=>'Gly', 'GCC'=>'Gly', 'CCC'=>'Gly', 'TCC'=>'Gly', 63 'AGG'=>'Pro', 'GGG'=>'Pro', 'CGG'=>'Pro', 'TGG'=>'Pro', 64 'AGT'=>'Thr', 'GGT'=>'Thr', 'CGT'=>'Thr', 'TGT'=>'Thr', 65 'AAC'=>'Val', 'GAC'=>'Val', 'CAC'=>'Val', 'TAC'=>'Val', 66 67 'AGA'=>'Ser', 'GGA'=>'Ser', 'CGA'=>'Ser', 'TGA'=>'Ser', 'ACT'=>'Ser', 'GCT'=>'Ser', 68 'ACG'=>'Arg', 'GCG'=>'Arg', 'CCG'=>'Arg', 'TCG'=>'Arg', 'CCT'=>'Arg', 'TCT'=>'Arg', 69 'AAG'=>'Leu', 'GAG'=>'Leu', 'CAG'=>'Leu', 'TAG'=>'Leu', 'CAA'=>'Leu', 'TAA'=>'Leu', 70 71 'AAA'=>'Phe', 'GAA'=>'Phe', 72 73 'ATT'=>'Asn', 'GTT'=>'Asn', 74 'CTT'=>'Lys', 'TTT'=>'Lys', 75 76 'ATC'=>'Asp', 'GTC'=>'Asp', 77 'CTC'=>'Glu', 'TTC'=>'Glu', 78 79 'ATG'=>'His', 'GTG'=>'His', 80 'CTG'=>'Gln', 'TTG'=>'Gln', 81 82 'ATA'=>'Tyr', 'GTA'=>'Tyr', 83 'CTA'=>'Sup', 'TTA'=>'Sup', 84 85 'AAT'=>'Ile', 'GAT'=>'Ile', 'TAT'=>'Ile', 86 'CAT'=>'Met', 87 88 'ACA'=>'Cys', 'GCA'=>'Cys', 89 'CCA'=>'Trp', 90 'TCA'=>'SeC', 91 ); 92 93 94our %euk_aa_list = ( 95 'AGC'=>'Ala', 'CGC'=>'Ala', 'TGC'=>'Ala', 96 'GCC'=>'Gly', 'CCC'=>'Gly', 'TCC'=>'Gly', 97 'AGG'=>'Pro', 'CGG'=>'Pro', 'TGG'=>'Pro', 98 'AGT'=>'Thr', 'CGT'=>'Thr', 'TGT'=>'Thr', 99 'AAC'=>'Val', 'CAC'=>'Val', 'TAC'=>'Val', 100 101 'AGA'=>'Ser', 'CGA'=>'Ser', 'TGA'=>'Ser', 'GCT'=>'Ser', 102 'ACG'=>'Arg', 'CCG'=>'Arg', 'TCG'=>'Arg', 'CCT'=>'Arg', 'TCT'=>'Arg', 103 'AAG'=>'Leu', 'CAG'=>'Leu', 'TAG'=>'Leu', 'CAA'=>'Leu', 'TAA'=>'Leu', 104 105 'GAA'=>'Phe', 106 107 'GTT'=>'Asn', 108 'CTT'=>'Lys', 'TTT'=>'Lys', 109 110 'GTC'=>'Asp', 111 'CTC'=>'Glu', 'TTC'=>'Glu', 112 113 'GTG'=>'His', 114 'CTG'=>'Gln', 'TTG'=>'Gln', 115 116 'GTA'=>'Tyr', 117 'CTA'=>'Sup', 'TTA'=>'Sup', 118 119 'AAT'=>'Ile', 'GAT'=>'Ile', 'TAT'=>'Ile', 120 'CAT'=>'Met', 121 122 'GCA'=>'Cys', 123 'CCA'=>'Trp', 124 'TCA'=>'SeC', 125 ); 126 127our ($opt_result, $opt_ss, $opt_output, $opt_prefix, $opt_remove, $opt_cmscore1, $opt_ssscore1, $opt_isoscore1, 128 $opt_isoscore2, $opt_isomaxscore2, $opt_help); 129 130our $ANTICODON_COUNT_CUTOFF = 40; 131 132our %file_names = (); 133our $tRNAs = tRNAscanSE::ArraytRNA->new(); 134our %tRNA_counts = (); 135 136&set_options(); 137&set_file_names(); 138my ($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count) = &filtering(); 139&print_results($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count); 140 141exit; 142 143sub set_options 144{ 145 $opt_result = ""; 146 $opt_ss = ""; 147 $opt_output = ""; 148 $opt_prefix = ""; 149 $opt_remove = 0; 150 $opt_cmscore1 = 50; 151 $opt_ssscore1 = 10; 152 $opt_isoscore1 = 70; 153 $opt_isoscore2 = 70; 154 $opt_isomaxscore2 = 95; 155 156 Getopt::Long::GetOptions("result|i=s", "ss|s=s", "output|o=s", "prefix|p=s", "remove|r", 157 "cmscore1|c1=f", "ssscore1|m1=f", "isoscore1|e1=f", 158 "isoscore2|e2=f", "isomaxscore2|x=f", "help|h"); 159 160 if ($opt_help || $opt_result eq "" || $opt_ss eq "" || $opt_output eq "" || $opt_prefix eq "" || 161 $opt_cmscore1 < -1 || $opt_ssscore1 < -1 || $opt_isoscore1 < -1 || $opt_isoscore2 < -1 || $opt_isomaxscore2 < -1) 162 { 163 die "Usage: EukQualityFilter [options]\n", 164 "Options\n", 165 "--result -i <file> tRNAscan-SE output file used as input\n", 166 "--ss -s <file> tRNAscan-SE secondary structure file used as input\n", 167 "--output -o <file path> Directory where output files will be written\n", 168 "--prefix -p <name> Prefix for output file name\n", 169 "--remove -r Remove filtered tRNA hits (default: filtered tRNA hits are only tagged)\n", 170 "--cmscore1 -c1 <num> Domain-specific model score cutoff for secondary filtering (default = 50; -1 if not used for filtering)\n", 171 "--ssscore1 -m1 <num> Secondary structure score cutoff for secondary filtering (default = 10; -1 if not used for filtering)\n", 172 "--isoscore1 -e1 <num> Isotype-specific model score cutoff for secondary filtering (default = 70; -1 if not used for filtering)\n", 173 "--isoscore2 -e2 <num> Isotype-specific model starting score cutoff for tertiary filtering (default = 70; -1 if not used for filtering)\n", 174 "--isomaxscore2 -x <num> Maximum isotype-specific model score cutoff for tertiary filtering (default = 95)\n", 175 "--help -h Print this help\n\n"; 176 } 177} 178 179sub set_file_names 180{ 181 system("mkdir -p ".$opt_output); 182 $file_names{tRNAscan_out} = $opt_result; 183 $file_names{tRNAscan_ss} = $opt_ss; 184 $file_names{output_tRNAscan_out} = $opt_output."/".$opt_prefix.".out"; 185 $file_names{output_tRNAscan_ss} = $opt_output."/".$opt_prefix.".ss"; 186 $file_names{log} = $opt_output."/".$opt_prefix.".log"; 187} 188 189sub filtering 190{ 191 my $sec_pass_filtered_ac = {}; 192 my $iso_score_cutoff = {}; 193 194 &pseudogene_filter(); 195 &secondary_filter(); 196 if ($opt_isoscore2 != -1) 197 { 198 ($sec_pass_filtered_ac, $iso_score_cutoff) = &tertiary_filter(); 199 } 200 $tRNAs->sort_array("tRNAscan_id"); 201 my $ac_count = &get_ac_count(); 202 203 return ($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count); 204} 205 206sub print_results 207{ 208 my ($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count) = @_; 209 &write_out_file($ac_count); 210 &write_ss_file(); 211 &write_summary($sec_pass_filtered_ac, $iso_score_cutoff); 212} 213 214sub pseudogene_filter 215{ 216 my $line = ""; 217 my $tRNA = undef; 218 my %header = (); 219 my ($startpos, $endpos); 220 my @columns = (); 221 my $ct = 0; 222 223 $tRNA_counts{total} = 0; 224 $tRNA_counts{pseudo_filter} = 0; 225 226 print "Status: Filtering pseudogenes\n"; 227 228 open(FILE_IN, "$file_names{tRNAscan_out}") or die "Fail to open $file_names{tRNAscan_out}\n"; 229 230 while ($line = <FILE_IN>) 231 { 232 $ct++; 233 234 print STDERR "." if ($ct % 1000 == 0); 235 print STDERR "\n" if ($ct % 50000 == 0); 236 237 chomp($line); 238 if ($line =~ /^Name/) 239 { 240 $line =~ s/tRNA #/tRNA#/; 241 } 242 if ($line =~ /^Sequence/) 243 { 244 $line =~ s/Intron Bounds/Intron\tBound/; 245 } 246 247 @columns = split(/\t/, $line, -1); 248 for (my $i = 0; $i < scalar(@columns); $i++) 249 { 250 $columns[$i] = &trim($columns[$i]); 251 } 252 253 if ($columns[0] =~ /^Sequence/ || $columns[0] =~ /^Name/ || $columns[0] =~ /^-----/) 254 { 255 if ($columns[0] =~ /^Sequence/) 256 { 257 for (my $i = 0; $i < scalar(@columns); $i++) 258 { 259 if ($columns[$i] eq "Sequence") 260 { 261 $header{seqname} = $i; 262 } 263 elsif ($columns[$i] eq "Anti") 264 { 265 $header{anticodon} = $i; 266 } 267 elsif ($columns[$i] eq "Intron") 268 { 269 $header{intron_start} = $i; 270 $header{intron_end} = $i+1; 271 } 272 elsif ($columns[$i] eq "Inf") 273 { 274 $header{score} = $i; 275 } 276 elsif ($columns[$i] eq "Cove") 277 { 278 $header{score} = $i; 279 } 280 elsif ($columns[$i] eq "HMM") 281 { 282 $header{hmm_score} = $i; 283 } 284 elsif ($columns[$i] eq "2'Str") 285 { 286 $header{ss_score} = $i; 287 } 288 elsif ($columns[$i] eq "Hit") 289 { 290 $header{hit_origin} = $i; 291 } 292 elsif ($columns[$i] eq "HMM") 293 { 294 $header{hmm_score} = $i; 295 } 296 elsif ($columns[$i] eq "Type") 297 { 298 $header{isotype_type} = $i; 299 } 300 } 301 } 302 elsif ($columns[0] =~ /^Name/) 303 { 304 for (my $i = 0; $i < scalar(@columns); $i++) 305 { 306 if ($columns[$i] eq "tRNA#") 307 { 308 $header{trna_id} = $i; 309 } 310 elsif ($columns[$i] eq "Begin" and !defined $header{start}) 311 { 312 $header{start} = $i; 313 $header{end} = $i+1; 314 } 315 elsif ($columns[$i] eq "Type") 316 { 317 $header{isotype} = $i; 318 } 319 elsif ($columns[$i] eq "CM") 320 { 321 $header{isotype_cm} = $i; 322 $header{isotype_score} = $i+1; 323 } 324 elsif ($columns[$i] eq "Note") 325 { 326 $header{note} = $i; 327 } 328 elsif ($columns[$i] eq "Count") 329 { 330 $header{intron_count} = $i; 331 } 332 } 333 } 334 } 335 else 336 { 337 if (!defined $header{isotype_cm}) 338 { 339 die "Error: This filter requires isotype-specific model scan result in the tRNAscan-SE v2 output file.\n"; 340 } 341 if (!defined $header{note}) 342 { 343 die "Error: This filter requires tRNAscan-SE v2 output file.\n"; 344 } 345 346 $tRNA_counts{total}++; 347 348 if ($columns[$header{seqname}] eq "chrM" or $columns[$header{seqname}] eq "M" or $columns[$header{seqname}] eq "chrMT" or $columns[$header{seqname}] eq "MT") 349 { 350 $tRNA_counts{total}--; 351 next; 352 } 353 elsif ($columns[$header{note}] =~ /pseudo/) 354 { 355 $tRNA_counts{pseudo_filter}++; 356 next; 357 } 358 else 359 { 360 $tRNA = tRNAscanSE::tRNA->new; 361 $tRNA->seqname($columns[$header{seqname}]); 362 $tRNA->tRNAscan_id($columns[$header{seqname}].".trna".$columns[$header{trna_id}]); 363 $startpos = $columns[$header{start}]; 364 $endpos = $columns[$header{end}]; 365 if ($startpos < $endpos) 366 { 367 $tRNA->start($startpos); 368 $tRNA->end($endpos); 369 $tRNA->strand("+"); 370 } 371 else 372 { 373 $tRNA->end($startpos); 374 $tRNA->start($endpos); 375 $tRNA->strand("-"); 376 } 377 $tRNA->category("cyto"); 378 $tRNA->isotype($columns[$header{isotype}]); 379 $tRNA->anticodon($columns[$header{anticodon}]); 380 $tRNA->score($columns[$header{score}]); 381 $tRNA->hmm_score($columns[$header{hmm_score}]); 382 $tRNA->ss_score($columns[$header{ss_score}]); 383 $tRNA->tRNAscan_id($tRNA->tRNAscan_id()."-".$tRNA->isotype().$tRNA->anticodon()); 384 my $isotype_type = "cytosolic"; 385 if (defined $header{isotype_type}) 386 { 387 $isotype_type = $columns[$header{isotype_type}]; 388 } 389 $tRNA->add_model_hit($isotype_type, $columns[$header{isotype_cm}], $columns[$header{isotype_score}], ""); 390 $tRNAs->put($tRNA); 391 } 392 } 393 } 394 395 close(FILE_IN); 396 397 print STDERR "\n"; 398} 399 400sub euk_anticodon_filter 401{ 402 my ($ac_count, $tRNA) = @_; 403 my $tag = ""; 404 405 if (!$tRNA->is_pseudo()) 406 { 407 if ($tRNA->isotype() eq "Sup") 408 { 409 $tag = "unexpected anticodon"; 410 $tRNA_counts{ac_filter}++; 411 } 412 elsif (!defined $euk_aa_list{$tRNA->anticodon()}) 413 { 414 my $alt_anticodon = $tRNA->anticodon(); 415 if (substr($tRNA->anticodon(), 0, 1) eq "A") 416 { 417 $alt_anticodon = "G".substr($tRNA->anticodon(), 1); 418 } 419 elsif (substr($tRNA->anticodon(), 0, 1) eq "G") 420 { 421 $alt_anticodon = "A".substr($tRNA->anticodon(), 1); 422 } 423 424 if (defined $aa_list{$tRNA->anticodon()} and defined $aa_list{$alt_anticodon} and defined $euk_aa_list{$alt_anticodon} and 425 $ac_count->{$tRNA->anticodon()} > $ac_count->{$alt_anticodon}) 426 {} 427 else 428 { 429 $tag = "unexpected anticodon"; 430 $tRNA_counts{ac_filter}++; 431 } 432 } 433 } 434 435 return $tag; 436} 437 438sub secondary_filter 439{ 440 print "Status: Secondary filtering\n"; 441 442 $tRNA_counts{secondary_filter} = 0; 443 for (my $i = 0; $i < $tRNAs->get_count(); $i++) 444 { 445 my $tRNA = $tRNAs->get($i); 446 my ($type, $model, $iso_score, $iso_ss) = $tRNA->get_highest_score_model(); 447 if ($opt_isoscore1 > -1 and !$tRNA->is_pseudo() and ($type eq "mito" or ($type eq "cytosolic" and $iso_score < $opt_isoscore1))) 448 { 449 $tRNA->pseudo(1); 450 $tRNA_counts{secondary_filter}++; 451 } 452 elsif ($opt_cmscore1 > -1 and !$tRNA->is_pseudo() and $tRNA->score() < $opt_cmscore1) 453 { 454 $tRNA->pseudo(1); 455 $tRNA_counts{secondary_filter}++; 456 } 457 elsif ($opt_ssscore1 > -1 and !$tRNA->is_pseudo() and $tRNA->ss_score() < $opt_ssscore1 and $tRNA->isotype() ne "SeC") 458 { 459 $tRNA->pseudo(1); 460 $tRNA_counts{secondary_filter}++; 461 } 462 } 463} 464 465sub get_ac_count 466{ 467 my %ac_count = (); 468 469 for my $ac (sort keys %aa_list) 470 { 471 $ac_count{$ac} = 0; 472 } 473 474 for (my $i = 0; $i < $tRNAs->get_count(); $i++) 475 { 476 my $tRNA = $tRNAs->get($i); 477 if ($tRNA->is_cytosolic() and !$tRNA->is_pseudo()) 478 { 479 $ac_count{$tRNA->anticodon()} += 1; 480 } 481 } 482 483 return \%ac_count; 484} 485 486sub get_isotype_count 487{ 488 my %isotype_count = (); 489 my $iso = ""; 490 491 for my $isotype (sort @isotypes) 492 { 493 $isotype_count{$isotype} = 0; 494 } 495 496 for (my $i = 0; $i < $tRNAs->get_count(); $i++) 497 { 498 my $tRNA = $tRNAs->get($i); 499 if ($tRNA->is_cytosolic() and !$tRNA->is_pseudo()) 500 { 501 $isotype_count{$tRNA->isotype()} += 1; 502 } 503 } 504 505 return \%isotype_count; 506} 507 508sub tertiary_filter 509{ 510 print "Status: Tertiary filtering\n"; 511 512 $tRNA_counts{tertiary_filter} = 0; 513 my %sec_pass_filtered_ac = (); 514 my %iso_score_cutoff = (); 515 my $count_pair = []; 516 my %filtering_isotypes = (); 517 my $tRNA_index = 0; 518 my $start_index = -1; 519 my $end_index = -1; 520 my $isotype_tRNAs = tRNAscanSE::ArraytRNA->new(); 521 my $trna = undef; 522 my $find = 0; 523 524 my $ac_count = &get_ac_count(); 525 526 foreach my $ac (sort {$ac_count->{$b} <=> $ac_count->{$a}} keys %$ac_count) 527 { 528 if ($ac_count->{$ac} > $ANTICODON_COUNT_CUTOFF) 529 { 530 if (!defined $filtering_isotypes{$aa_list{$ac}}) 531 { 532 $filtering_isotypes{$aa_list{$ac}} = 1; 533 } 534 } 535 else 536 { 537 last; 538 } 539 } 540 541 $tRNAs->sort_array("isotype"); 542 543 foreach my $isotype (sort keys %filtering_isotypes) 544 { 545 $iso_score_cutoff{$isotype} = $opt_isoscore2; 546 $find = 0; 547 $isotype_tRNAs->clear(); 548 $start_index = -1; 549 $end_index = -1; 550 551 while (!$find and $tRNA_index < $tRNAs->get_count()) 552 { 553 if ($tRNAs->get($tRNA_index)->isotype() lt $isotype) 554 { 555 $tRNA_index++; 556 } 557 elsif ($tRNAs->get($tRNA_index)->isotype() eq $isotype) 558 { 559 if ($start_index == -1) 560 { 561 $start_index = $tRNA_index; 562 } 563 $end_index = $tRNA_index; 564 my $tRNA = $tRNAs->get($tRNA_index); 565 if (!$tRNA->is_pseudo()) 566 { 567 my ($type, $model, $iso_score, $iso_ss) = $tRNA->get_highest_score_model(); 568 $trna = tRNAscanSE::tRNA->new; 569 $trna->tRNAscan_id($tRNA->tRNAscan_id()); 570 $trna->anticodon($tRNA->anticodon()); 571 $trna->score($iso_score); 572 $isotype_tRNAs->put($trna); 573 } 574 $tRNA_index++; 575 } 576 else 577 { 578 $find = 1; 579 } 580 } 581 582 if ($find) 583 { 584 foreach my $ac (@{$ac_list{$isotype}}) 585 { 586 if ($ac ne " ") 587 { 588 my $local_ac_count = $ac_count->{$ac}; 589 while ($local_ac_count > $ANTICODON_COUNT_CUTOFF and $iso_score_cutoff{$isotype} <= $opt_isomaxscore2) 590 { 591 $local_ac_count = &iso_score_filter($isotype_tRNAs, $ac, $iso_score_cutoff{$isotype}, $local_ac_count); 592 $iso_score_cutoff{$isotype}++; 593 } 594 if ($ac_count->{$ac} > $local_ac_count) 595 { 596 $iso_score_cutoff{$isotype}--; 597 } 598 } 599 } 600 601 foreach my $ac (@{$ac_list{$isotype}}) 602 { 603 if ($ac ne " ") 604 { 605 $count_pair = []; 606 $count_pair->[0] = $ac_count->{$ac}; 607 $sec_pass_filtered_ac{$ac} = $count_pair; 608 for (my $i = $start_index; $i <= $end_index; $i++) 609 { 610 my $tRNA = $tRNAs->get($i); 611 my ($type, $model, $iso_score, $iso_ss) = $tRNA->get_highest_score_model(); 612 if ($tRNA->isotype() eq $isotype and !$tRNA->is_pseudo()) 613 { 614 if ($type eq "mito" or ($type eq "cytosolic" and $iso_score < $iso_score_cutoff{$isotype})) 615 { 616 $tRNA->pseudo(2); 617 $tRNA_counts{tertiary_filter}++; 618 } 619 } 620 } 621 } 622 } 623 } 624 } 625 626 $ac_count = &get_ac_count(); 627 foreach my $ac (sort keys %sec_pass_filtered_ac) 628 { 629 $sec_pass_filtered_ac{$ac}->[1] = $ac_count->{$ac}; 630 } 631 return (\%sec_pass_filtered_ac, \%iso_score_cutoff); 632} 633 634sub iso_score_filter 635{ 636 my ($isotype_tRNAs, $ac, $isotype_score_cutoff, $local_ac_count) = @_; 637 638 my $count = $local_ac_count; 639 for (my $i = 0; $i < $isotype_tRNAs->get_count(); $i++) 640 { 641 my $tRNA = $isotype_tRNAs->get($i); 642 if ($tRNA->anticodon() eq $ac and !$tRNA->is_pseudo()) 643 { 644 if ($tRNA->score() < $isotype_score_cutoff) 645 { 646 $tRNA->pseudo(1); 647 $count--; 648 } 649 } 650 } 651 return $count; 652} 653 654sub write_out_file 655{ 656 my ($ac_count) = @_; 657 my $line = ""; 658 my $tRNA = undef; 659 my %header = (); 660 my @columns = (); 661 my $tRNAscan_id = ""; 662 my $index = -1; 663 my $include = 0; 664 my $tag = ""; 665 my $ct = 0; 666 667 $tRNA_counts{iso_filter} = 0; 668 $tRNA_counts{undet_filter} = 0; 669 $tRNA_counts{ac_filter} = 0; 670 671 print "Status: Writing output file $file_names{output_tRNAscan_out}\n"; 672 673 open(FILE_IN, "$file_names{tRNAscan_out}") or die "Fail to open $file_names{tRNAscan_out}\n"; 674 open(FILE_OUT, ">$file_names{output_tRNAscan_out}") or die "Fail to open $file_names{output_tRNAscan_out}\n"; 675 676 while ($line = <FILE_IN>) 677 { 678 $ct++; 679 680 print STDERR "." if ($ct % 1000 == 0); 681 print STDERR "\n" if ($ct % 50000 == 0); 682 683 chomp($line); 684 if ($line =~ /^Name/) 685 { 686 $line =~ s/tRNA #/tRNA#/; 687 } 688 689 @columns = split(/\t/, $line, -1); 690 for (my $i = 0; $i < scalar(@columns); $i++) 691 { 692 $columns[$i] = &trim($columns[$i]); 693 } 694 695 if ($columns[0] =~ /^Sequence/ || $columns[0] =~ /^Name/ || $columns[0] =~ /^-----/) 696 { 697 print FILE_OUT $line."\n"; 698 if ($columns[0] =~ /^Sequence/) 699 { 700 for (my $i = 0; $i < scalar(@columns); $i++) 701 { 702 if ($columns[$i] eq "Sequence") 703 { 704 $header{seqname} = $i; 705 } 706 elsif ($columns[$i] eq "Anti") 707 { 708 $header{anticodon} = $i; 709 } 710 elsif ($columns[$i] eq "Type") 711 { 712 $header{isotype_type} = $i; 713 } 714 } 715 } 716 elsif ($columns[0] =~ /^Name/) 717 { 718 for (my $i = 0; $i < scalar(@columns); $i++) 719 { 720 if ($columns[$i] eq "tRNA#") 721 { 722 $header{trna_id} = $i; 723 } 724 elsif ($columns[$i] eq "Type") 725 { 726 $header{isotype} = $i; 727 } 728 elsif ($columns[$i] eq "CM") 729 { 730 $header{isotype_cm} = $i; 731 $header{isotype_score} = $i+1; 732 } 733 elsif ($columns[$i] eq "Note") 734 { 735 $header{note} = $i; 736 } 737 } 738 } 739 } 740 else 741 { 742 $include = 0; 743 $tag = ""; 744 $tRNAscan_id = $columns[$header{seqname}].".trna".$columns[$header{trna_id}]."-".$columns[$header{isotype}].$columns[$header{anticodon}]; 745 $index = $tRNAs->bsearch_id($tRNAscan_id, "tRNAscan_id"); 746 if ($index == -1) 747 { 748 if ($columns[$header{seqname}] eq "chrM" or $columns[$header{seqname}] eq "M" or $columns[$header{seqname}] eq "chrMT" or $columns[$header{seqname}] eq "MT") 749 {} 750 elsif (!$opt_remove) 751 { 752 $include = 1; 753 } 754 } 755 else 756 { 757 $tRNA = $tRNAs->get($index); 758 if ($tRNA->is_pseudo() and !$opt_remove) 759 { 760 $include = 1; 761 if ($tRNA->pseudo() == 1) 762 { 763 $tag = "secondary filtered"; 764 } 765 elsif ($tRNA->pseudo() == 2) 766 { 767 $tag = "tertiary filtered"; 768 } 769 } 770 elsif (!$tRNA->is_pseudo()) 771 { 772 $include = 1; 773 if ($columns[$header{note}] =~ /IPD/ and $columns[$header{isotype}] ne "Sup") 774 { 775 $tRNA_counts{iso_filter}++; 776 $tag = "isotype mismatch"; 777 } 778 elsif ($columns[$header{isotype}] eq "Undet") 779 { 780 $tRNA_counts{undet_filter}++; 781 $tag = "undetermined isotype"; 782 } 783 else 784 { 785 $tag = &euk_anticodon_filter($ac_count, $tRNA); 786 if ($tag eq "") 787 { 788 $tag = "high confidence set"; 789 } 790 } 791 } 792 } 793 if ($include) 794 { 795 for (my $i = 0; $i < $header{note}; $i++) 796 { 797 print FILE_OUT $columns[$i]."\t"; 798 } 799 if ($columns[$header{note}] ne "") 800 { 801 print FILE_OUT $columns[$header{note}]; 802 if ($tag ne "") 803 { 804 print FILE_OUT ",".$tag; 805 } 806 } 807 else 808 { 809 print FILE_OUT $tag; 810 } 811 for (my $i = $header{note} + 1; $i < scalar(@columns); $i++) 812 { 813 print FILE_OUT "\t".$columns[$i]; 814 } 815 print FILE_OUT "\n"; 816 } 817 } 818 } 819 close(FILE_IN); 820 close(FILE_OUT); 821 822 print STDERR "\n"; 823} 824 825sub write_ss_file 826{ 827 my $tRNA = undef; 828 my $line = ""; 829 my $tRNAscan_id = ""; 830 my $seqname = ""; 831 my $index = -1; 832 my $print_line = ""; 833 my $include = 0; 834 my $tag = ""; 835 my $ct = 0; 836 837 print "Status: Writing secondary structure file $file_names{output_tRNAscan_ss}\n"; 838 839 open(FILE_IN, "$file_names{tRNAscan_ss}") || die "Error: Fail to open $file_names{tRNAscan_ss}\n"; 840 open(FILE_OUT, ">$file_names{output_tRNAscan_ss}") or die "Fail to open $file_names{output_tRNAscan_ss}\n"; 841 while ($line = <FILE_IN>) 842 { 843 if ($line =~ /^(\S+)\s+\(\d+\-\d+\)\s+Length:\s\d+\sbp/) 844 { 845 $tRNAscan_id = $1; 846 $seqname = substr($tRNAscan_id, 0, rindex($tRNAscan_id, ".")); 847 $print_line = $line; 848 $include = 0; 849 $tag = ""; 850 851 $ct++; 852 print STDERR "." if ($ct % 1000 == 0); 853 print STDERR "\n" if ($ct % 50000 == 0); 854 } 855 elsif ($line =~ /^Type:\s(\S+)\s+Anticodon:\s(\S+)\sat\s.+\s\(.+\)\s+Score:\s\S+/) 856 { 857 $tRNAscan_id .= "-".$1.$2; 858 $print_line .= $line; 859 860 $index = $tRNAs->bsearch_id($tRNAscan_id, "tRNAscan_id"); 861 if ($index == -1) 862 { 863 if (!$opt_remove) 864 { 865 $include = 1; 866 $tag = "Filtered"; 867 } 868 } 869 else 870 { 871 $tRNA = $tRNAs->get($index); 872 if ($tRNA->is_pseudo() and !$opt_remove) 873 { 874 $include = 1; 875 $tag = "Filtered"; 876 } 877 elsif (!$tRNA->is_pseudo()) 878 { 879 $include = 1; 880 } 881 } 882 } 883 elsif ($line =~ /^HMM Sc=\S+\s+Sec struct Sc=\S+/) 884 { 885 if ($tag ne "") 886 { 887 $print_line .= $tag.": ".$line; 888 } 889 else 890 { 891 $print_line .= $line; 892 } 893 } 894 elsif (($line =~ /^Possible pseudogene:\s+HMM Sc=\S+\s+Sec struct Sc=\S+/) 895 or ($line =~ /^Possible truncation, pseudogene:\s+HMM Sc=\S+\s+Sec struct Sc=\S+/)) 896 { 897 if ($tag ne "") 898 { 899 $print_line .= $tag.", ".$line; 900 } 901 else 902 { 903 $print_line .= $line; 904 } 905 } 906 elsif ($line =~ /^Possible intron: \d+-\d+ \(\d+-\d+\)/) 907 { 908 $print_line .= $line; 909 } 910 elsif (index($line, " * | * | * |") > -1) 911 { 912 $print_line .= $line; 913 } 914 elsif ($line =~ /^Seq:\s\S+$/) 915 { 916 $print_line .= $line; 917 } 918 elsif ($line =~ /^Str:\s\S+$/) 919 { 920 $print_line .= $line; 921 if ($seqname eq "chrM" or$seqname eq "M" or $seqname eq "chrMT" or $seqname eq "MT") 922 {} 923 elsif ($include) 924 { 925 print FILE_OUT $print_line."\n"; 926 } 927 } 928 } 929 close(FILE_IN); 930 close(FILE_OUT); 931 932 print STDERR "\n"; 933} 934 935sub write_summary 936{ 937 my ($sec_pass_filtered_ac, $iso_score_cutoff) = @_; 938 939 print "Status: Writing summary file $file_names{log}\n"; 940 941 open(FILE_OUT, ">$file_names{log}") or die "Fail to open $file_names{log}\n"; 942 943 print FILE_OUT "EukHighConfidenceFilter v1.0 Summary\n", 944 "Completed Time: ".localtime()."\n\n"; 945 print FILE_OUT "Inputs\n", 946 "------------------------------------------------------\n", 947 "tRNAscan-SE output file: ".$file_names{tRNAscan_out}."\n", 948 "tRNAscan-SE ss file: ".$file_names{tRNAscan_ss}."\n"; 949 if ($opt_cmscore1 == -1) 950 { 951 print FILE_OUT "Secondary filtering domain-specific model score: Disabled\n"; 952 } 953 else 954 { 955 print FILE_OUT "Secondary filtering domain-specific model score cutoff: $opt_cmscore1\n"; 956 } 957 if ($opt_ssscore1 == -1) 958 { 959 print FILE_OUT "Secondary filtering secondary structure score: Disabled\n"; 960 } 961 else 962 { 963 print FILE_OUT "Secondary filtering secondary structure score cutoff: $opt_ssscore1\n"; 964 } 965 if ($opt_isoscore1 == -1) 966 { 967 print FILE_OUT "Secondary filtering isotype-specific model score: Disabled\n"; 968 } 969 else 970 { 971 print FILE_OUT "Secondary filtering isotype-specific model score cutoff: $opt_isoscore1\n"; 972 } 973 if ($opt_isoscore2 == -1) 974 { 975 print FILE_OUT "Tertiary filtering isotype-specific model score: Disabled\n"; 976 } 977 else 978 { 979 print FILE_OUT "Tertiary filtering isotype-specific model starting score cutoff: $opt_isoscore2\n"; 980 print FILE_OUT "Tertiary filtering maximum isotype-specific model score cutoff: $opt_isomaxscore2\n"; 981 } 982 if ($opt_remove) 983 { 984 print FILE_OUT "Remove filtered hits: Yes\n"; 985 } 986 else 987 { 988 print FILE_OUT "Remove filtered hits: No\n"; 989 } 990 print FILE_OUT "\n"; 991 print FILE_OUT "Outputs\n", 992 "------------------------------------------------------\n", 993 "tRNAscan-SE output file: ".$file_names{output_tRNAscan_out}."\n", 994 "tRNAscan-SE ss file: ".$file_names{output_tRNAscan_ss}."\n", 995 "Summary file: ".$file_names{log}."\n\n"; 996 997 my $remaining_hits = $tRNA_counts{total} - $tRNA_counts{pseudo_filter}; 998 print FILE_OUT "Summary statistics\n", 999 "------------------------------------------------------\n"; 1000 print FILE_OUT "Total tRNA predictions: ".$tRNA_counts{total}."\n"; 1001 print FILE_OUT "Possible pseudogenes: ".$tRNA_counts{pseudo_filter}."\n"; 1002 if ($opt_isoscore1 > -1 or $opt_ssscore1 > -1 or $opt_cmscore1 > -1) 1003 { 1004 print FILE_OUT "tRNAs with scores lower than secondary cutoffs: ".$tRNA_counts{secondary_filter}."\n"; 1005 1006 $remaining_hits -= $tRNA_counts{secondary_filter}; 1007 } 1008 if ($opt_isoscore2 > -1) 1009 { 1010 $remaining_hits -= $tRNA_counts{tertiary_filter}; 1011 print FILE_OUT "tRNAs failed tertiary filter: ".$tRNA_counts{tertiary_filter}."\n\n"; 1012 } 1013 print FILE_OUT "tRNAs after filtering: ".$remaining_hits."\n\n"; 1014 print FILE_OUT "tRNAs with mismatched isotype: ".$tRNA_counts{iso_filter}."\n"; 1015 print FILE_OUT "tRNAs with undetermined isotype: ".$tRNA_counts{undet_filter}."\n"; 1016 print FILE_OUT "tRNAs with unexpected anticodon: ".$tRNA_counts{ac_filter}."\n\n"; 1017 print FILE_OUT "High confidence set: ".($remaining_hits - $tRNA_counts{iso_filter} - $tRNA_counts{undet_filter} - $tRNA_counts{ac_filter})."\n\n"; 1018 1019 if ($opt_isoscore2 > -1 and $tRNA_counts{tertiary_filter} > 0) 1020 { 1021 print FILE_OUT "Tertiary filter - anticodon counts\n", 1022 "------------------------------------------------------\n\n"; 1023 print FILE_OUT "Isotype Anticodon BeforeFiltering AfterFiltering FinalScoreCutoff\n"; 1024 foreach my $isotype (sort keys %$iso_score_cutoff) 1025 { 1026 foreach my $ac (@{$ac_list{$isotype}}) 1027 { 1028 if ($ac ne " " and defined $euk_aa_list{$ac}) 1029 { 1030 printf FILE_OUT "%-7s %-9s %-15d %-14d %-16d\n", 1031 $isotype, $ac, $sec_pass_filtered_ac->{$ac}->[0], $sec_pass_filtered_ac->{$ac}->[1], $iso_score_cutoff->{$isotype}; 1032 } 1033 } 1034 } 1035 print FILE_OUT "\n"; 1036 } 1037 1038 my $ac_count = &get_ac_count(); 1039 my %isotype_count = (); 1040 foreach my $ac (sort keys %aa_list) 1041 { 1042 if (!defined $isotype_count{$aa_list{$ac}}) 1043 { 1044 $isotype_count{$aa_list{$ac}} = $ac_count->{$ac}; 1045 } 1046 else 1047 { 1048 $isotype_count{$aa_list{$ac}} += $ac_count->{$ac}; 1049 } 1050 } 1051 1052 print FILE_OUT "Isotype / Anticodon Counts After Filtering\n", 1053 "------------------------------------------------------\n\n"; 1054 1055 foreach my $aa (@isotypes) 1056 { 1057 my $iso_count = 0; 1058 if (defined $isotype_count{$aa}) 1059 { 1060 $iso_count = $isotype_count{$aa}; 1061 } 1062 if ($aa eq "SeC") 1063 { 1064 printf FILE_OUT ("%-8s: %d\t", "SelCys", $iso_count); 1065 } 1066 elsif ($aa eq "Sup") 1067 { 1068 printf FILE_OUT ("%-8s: %d\t", "Supres", $iso_count); 1069 } 1070 else 1071 { 1072 printf FILE_OUT ("%-8s: %d\t", $aa, $iso_count); 1073 } 1074 1075 foreach my $ac (@{$ac_list{$aa}}) 1076 { 1077 if ($ac eq " ") 1078 { 1079 print FILE_OUT " "; 1080 } 1081 else 1082 { 1083 if (defined $ac_count->{$ac}) 1084 { 1085 printf FILE_OUT ("%5s: %-6s", $ac, $ac_count->{$ac}); 1086 } 1087 else 1088 { 1089 printf FILE_OUT ("%5s: %-6s", $ac, ""); 1090 } 1091 } 1092 } 1093 print FILE_OUT "\n"; 1094 } 1095 print FILE_OUT "\n"; 1096 1097 close(FILE_OUT); 1098} 1099 1100sub trim 1101{ 1102 my $string = shift; 1103 $string =~ s/^\s+//; 1104 $string =~ s/\s+$//; 1105 return $string; 1106} 1107