1# tRNAscanSE/CM.pm 2# This class contains parameters and functions for running CM tRNA search used in tRNAscan-SE. 3# 4# -------------------------------------------------------------- 5# This module is part of the tRNAscan-SE program. 6# Copyright (C) 2017 Patricia Chan and Todd Lowe 7# -------------------------------------------------------------- 8# 9 10package tRNAscanSE::CM; 11 12use strict; 13use File::Copy; 14use tRNAscanSE::Configuration; 15use tRNAscanSE::Options; 16use tRNAscanSE::Utils; 17use tRNAscanSE::LogFile; 18use tRNAscanSE::ScanResult; 19use tRNAscanSE::SS; 20use tRNAscanSE::Sequence; 21use tRNAscanSE::tRNA; 22use tRNAscanSE::ArraytRNA; 23use tRNAscanSE::IntResultFile; 24use tRNAscanSE::MultiResultFile; 25use tRNAscanSE::CMscanResultFile; 26use tRNAscanSE::ArrayCMscanResults; 27use tRNAscanSE::FpScanResultFile; 28 29sub new 30{ 31 my $class = shift; 32 my $self = {}; 33 34 initialize($self); 35 36 bless ($self, $class); 37 return $self; 38} 39 40sub DESTROY 41{ 42 my $self = shift; 43} 44 45sub initialize 46{ 47 my $self = shift; 48 49 $self->{CM_check_for_introns} = 0; # check for non-canonical introns 50 $self->{CM_check_for_split_halves} = 0; # check for split tRNA fragments 51 $self->{skip_pseudo_filter} = 0; # enable filter for psuedogenes (Cove score <40, 52 # primary struct score <10 bits, secondary 53 # structure score < 5 bits) 54 55 $self->{get_hmm_score} = 0; # also score tRNA with covariance model 56 # without sec structure info, similar 57 # to getting hmm score for match of 58 # seq to tRNA hmm (-H option) 59 60 61 # Convariance model file path 62 $self->{main_cm_file_path} = {}; 63 $self->{mainNS_cm_file_path} = {}; 64 $self->{cove_cm_file_path} = ''; 65 $self->{intron_cm_file_path} = {}; 66 $self->{arch_five_half_cm_file_path} = ''; 67 $self->{arch_three_half_cm_file_path} = ''; 68 $self->{Pselc_cm_file_path} = ''; 69 $self->{Eselc_cm_file_path} = ''; 70 $self->{isotype_cm_db_file_path} = ''; 71 $self->{mito_isotype_cm_db_file_path} = ''; 72 $self->{isotype_cm_file_paths} = {}; 73 $self->{mito_isotype_cm_file_paths} = {}; 74 75 $self->{isotype_cm_cutoff} = {}; 76 77 $self->{covels_bin} = "covels-SE"; # Application executable name 78 $self->{coves_bin} = "coves-SE"; 79 $self->{cmsearch_bin} = "cmsearch"; 80 $self->{cmscan_bin} = "cmscan"; 81 82 $self->{infernal_thread} = -1; 83 84 $self->{tab_results} = +[]; 85} 86 87sub CM_mode 88{ 89 my $self = shift; 90 if (@_) { $self->{CM_mode} = shift; } 91 return $self->{CM_mode}; 92} 93 94sub cove_mode 95{ 96 my $self = shift; 97 return ($self->{CM_mode} eq 'cove'); 98} 99 100sub infernal_mode 101{ 102 my $self = shift; 103 return ($self->{CM_mode} eq 'infernal'); 104} 105 106sub cm_cutoff 107{ 108 my $self = shift; 109 if (@_) { $self->{cm_cutoff} = shift; } 110 return $self->{cm_cutoff}; 111} 112 113sub organelle_cm_cutoff 114{ 115 my $self = shift; 116 if (@_) { $self->{organelle_cm_cutoff} = shift; } 117 return $self->{organelle_cm_cutoff}; 118} 119 120sub infernal_fp_cutoff 121{ 122 my $self = shift; 123 if (@_) { $self->{infernal_fp_cutoff} = shift; } 124 return $self->{infernal_fp_cutoff}; 125} 126 127sub BHB_cm_cutoff 128{ 129 my $self = shift; 130 if (@_) { $self->{BHB_cm_cutoff} = shift; } 131 return $self->{BHB_cm_cutoff}; 132} 133 134sub max_tRNA_length 135{ 136 my $self = shift; 137 if (@_) { $self->{max_tRNA_length} = shift; } 138 return $self->{max_tRNA_length}; 139} 140 141sub max_cove_tRNA_length 142{ 143 my $self = shift; 144 if (@_) { $self->{max_cove_tRNA_length} = shift; } 145 return $self->{max_cove_tRNA_length}; 146} 147 148sub max_cmsearch_tRNA_length 149{ 150 my $self = shift; 151 if (@_) { $self->{max_cmsearch_tRNA_length} = shift; } 152 return $self->{max_cmsearch_tRNA_length}; 153} 154 155sub CM_check_for_introns 156{ 157 my $self = shift; 158 if (@_) { $self->{CM_check_for_introns} = shift; } 159 return $self->{CM_check_for_introns}; 160} 161 162sub CM_check_for_split_halves 163{ 164 my $self = shift; 165 if (@_) { $self->{CM_check_for_split_halves} = shift; } 166 return $self->{CM_check_for_split_halves}; 167} 168 169sub min_tRNA_no_intron 170{ 171 my $self = shift; 172 if (@_) { $self->{min_tRNA_no_intron} = shift; } 173 return $self->{min_tRNA_no_intron}; 174} 175 176sub min_intron_length 177{ 178 my $self = shift; 179 if (@_) { $self->{min_intron_length} = shift; } 180 return $self->{min_intron_length}; 181} 182 183sub skip_pseudo_filter 184{ 185 my $self = shift; 186 if (@_) { $self->{skip_pseudo_filter} = shift; } 187 return $self->{skip_pseudo_filter}; 188} 189 190sub min_pseudo_filter_score 191{ 192 my $self = shift; 193 if (@_) { $self->{min_pseudo_filter_score} = shift; } 194 return $self->{min_pseudo_filter_score}; 195} 196 197sub min_ss_score 198{ 199 my $self = shift; 200 if (@_) { $self->{min_ss_score} = shift; } 201 return $self->{min_ss_score}; 202} 203 204sub min_hmm_score 205{ 206 my $self = shift; 207 if (@_) { $self->{min_hmm_score} = shift; } 208 return $self->{min_hmm_score}; 209} 210 211sub get_hmm_score 212{ 213 my $self = shift; 214 if (@_) { $self->{get_hmm_score} = shift; } 215 return $self->{get_hmm_score}; 216} 217 218sub main_cm_file_path 219{ 220 my $self = shift; 221 if (@_) { %{$self->{main_cm_file_path}} = shift; } 222 return %{$self->{main_cm_file_path}}; 223} 224 225sub add_main_cm_file_path 226{ 227 my $self = shift; 228 my ($key, $file) = @_; 229 $self->{main_cm_file_path}->{$key} = $file; 230} 231 232sub mainNS_cm_file_path 233{ 234 my $self = shift; 235 if (@_) { %{$self->{mainNS_cm_file_path}} = shift; } 236 return %{$self->{mainNS_cm_file_path}}; 237} 238 239sub add_mainNS_cm_file_path 240{ 241 my $self = shift; 242 my ($key, $file) = @_; 243 $self->{mainNS_cm_file_path}->{$key} = $file; 244} 245 246sub cove_cm_file_path 247{ 248 my $self = shift; 249 if (@_) { $self->{cove_cm_file_path} = shift; } 250 return $self->{cove_cm_file_path}; 251} 252 253sub isotype_cm_db_file_path 254{ 255 my $self = shift; 256 if (@_) { $self->{isotype_cm_db_file_path} = shift; } 257 return $self->{isotype_cm_db_file_path}; 258} 259 260sub mito_isotype_cm_db_file_path 261{ 262 my $self = shift; 263 if (@_) { $self->{mito_isotype_cm_db_file_path} = shift; } 264 return $self->{mito_isotype_cm_db_file_path}; 265} 266 267sub isotype_cm_file_paths 268{ 269 my $self = shift; 270 if (@_) { %{$self->{isotype_cm_file_paths}} = shift; } 271 return %{$self->{isotype_cm_file_paths}}; 272} 273 274sub add_isotype_cm_file_path 275{ 276 my $self = shift; 277 my ($isotype, $file) = @_; 278 $self->{isotype_cm_file_paths}->{$isotype} = $file; 279} 280 281sub mito_isotype_cm_file_paths 282{ 283 my $self = shift; 284 if (@_) { %{$self->{mito_isotype_cm_file_paths}} = shift; } 285 return %{$self->{mito_isotype_cm_file_paths}}; 286} 287 288sub add_mito_isotype_cm_file_path 289{ 290 my $self = shift; 291 my ($isotype, $file) = @_; 292 $self->{mito_isotype_cm_file_paths}->{$isotype} = $file; 293} 294 295sub intron_cm_file_path 296{ 297 my $self = shift; 298 if (@_) { %{$self->{intron_cm_file_path}} = shift; } 299 return %{$self->{intron_cm_file_path}}; 300} 301 302sub add_intron_cm_file_path 303{ 304 my $self = shift; 305 my ($key, $file) = @_; 306 $self->{intron_cm_file_path}->{$key} = $file; 307} 308 309sub Pselc_cm_file_path 310{ 311 my $self = shift; 312 if (@_) { $self->{Pselc_cm_file_path} = shift; } 313 return $self->{Pselc_cm_file_path}; 314} 315 316sub Eselc_cm_file_path 317{ 318 my $self = shift; 319 if (@_) { $self->{Eselc_cm_file_path} = shift; } 320 return $self->{Eselc_cm_file_path}; 321} 322 323sub add_isotype_cm_cutoff 324{ 325 my $self = shift; 326 my ($type, $cutoff) = @_; 327 $self->{isotype_cm_cutoff}->{$type} = $cutoff; 328} 329 330sub covels_bin 331{ 332 my $self = shift; 333 if (@_) { $self->{covels_bin} = shift; } 334 return $self->{covels_bin}; 335} 336 337sub coves_bin 338{ 339 my $self = shift; 340 if (@_) { $self->{coves_bin} = shift; } 341 return $self->{coves_bin}; 342} 343 344sub cmsearch_bin 345{ 346 my $self = shift; 347 if (@_) { $self->{cmsearch_bin} = shift; } 348 return $self->{cmsearch_bin}; 349} 350 351sub infernal_thread 352{ 353 my $self = shift; 354 if (@_) { $self->{infernal_thread} = shift; } 355 return $self->{infernal_thread}; 356} 357 358sub tab_results 359{ 360 my $self = shift; 361 if (@_) { $self->{tab_results} = shift; } 362 return $self->{tab_results}; 363} 364 365sub set_defaults 366{ 367 my $self = shift; 368 my $global_vars = shift; 369 my $global_constants = $global_vars->{global_constants}; 370 371 $self->{cm_mode} = $global_constants->get("cm_mode"); 372 $self->{cm_cutoff} = $global_constants->get("cm_cutoff"); 373 $self->{organelle_cm_cutoff} = $global_constants->get("organelle_cm_cutoff"); 374 $self->{infernal_fp_cutoff} = $global_constants->get("infernal_fp_cutoff"); 375 $self->{max_tRNA_length} = $global_constants->get("max_tRNA_length"); 376 $self->{max_cove_tRNA_length} = $global_constants->get("max_cove_tRNA_length"); 377 $self->{max_cmsearch_tRNA_length} = $global_constants->get("max_cmsearch_tRNA_length"); 378 $self->{min_tRNA_no_intron} = $global_constants->get("min_tRNA_no_intron"); 379 $self->{min_intron_length} = $global_constants->get("min_intron_length"); 380 $self->{min_cove_pseudo_filter_score} = $global_constants->get("min_cove_pseudo_filter_score"); 381 $self->{min_cmsearch_pseudo_filter_score} = $global_constants->get("min_cmsearch_pseudo_filter_score"); 382 $self->{min_ss_score} = $global_constants->get("min_ss_score"); 383 $self->{min_hmm_score} = $global_constants->get("min_hmm_score"); 384 $self->{nci_scan_cutoff} = $global_constants->get("nci_scan_cutoff"); 385 $self->{BHB_cm_cutoff} = $global_constants->get("BHB_cm_cutoff"); 386 $self->{split_tRNA_scan_cutoff} = $global_constants->get("split_tRNA_scan_cutoff"); 387 $self->{half_tRNA_cutoff} = $global_constants->get("half_tRNA_cutoff"); 388 $self->{left_splicing_len} = $global_constants->get("left_splicing_len"); 389 $self->{right_splicing_len} = $global_constants->get("right_splicing_len"); 390 my $search_types = $global_constants->get("isotype_cm_cutoff"); 391 foreach my $type (sort keys %$search_types) 392 { 393 $self->add_isotype_cm_cutoff($type, $search_types->{$type}); 394 } 395} 396 397sub set_file_paths 398{ 399 my $self = shift; 400 my $global_vars = shift; 401 my $opts = $global_vars->{options}; 402 my $global_constants = $global_vars->{global_constants}; 403 my $isotype_cms = ""; 404 405 if ($opts->euk_mode()) 406 { 407 if ($self->infernal_mode()) 408 { 409 $self->{main_cm_file_path}->{Domain} = $global_constants->get_subvalue("cm", "eukaryota"); # default to eukar model 410 $self->{main_cm_file_path}->{SeC} = $global_constants->get_subvalue("euk_cm", "SeC"); # Euk SeC model 411 $self->{mainNS_cm_file_path}->{Domain} = $global_constants->get_subvalue("cm", "eukaryota-ns"); # no secondary struct 412 $self->{cove_cm_file_path} = $global_constants->get_subvalue("cove_cm", "eukaryota"); # default to eukar cove model 413 } 414 elsif ($self->cove_mode()) 415 { 416 $self->{main_cm_file_path}->{Domain} = $global_constants->get_subvalue("cove_cm", "eukaryota"); # default to eukar cove model 417 $self->{mainNS_cm_file_path}->{Domain} = $global_constants->get_subvalue("cove_cm", "eukaryota-ns"); # no secondary struct 418 } 419 $self->{isotype_cm_db_file_path} = $global_constants->get_subvalue("isotype_cm", "eukaryota"); 420 if ($opts->mito_model() ne "") 421 { 422 $self->{mito_isotype_cm_db_file_path} = $global_constants->get_subvalue("mito_cm", $opts->mito_model()); 423 } 424 } 425 elsif ($opts->bact_mode()) 426 { 427 if ($self->infernal_mode()) 428 { 429 $self->{main_cm_file_path}->{Domain} = $global_constants->get_subvalue("cm", "bacteria"); # use bacterial covariance model 430 $self->{main_cm_file_path}->{SeC} = $global_constants->get_subvalue("bact_cm", "SeC"); # Bacterial SeC model 431 $self->{mainNS_cm_file_path}->{Domain} = $global_constants->get_subvalue("cm", "bacteria-ns"); # no sec struct 432 $self->{cove_cm_file_path} = $global_constants->get_subvalue("cove_cm", "bacteria"); # use bacterial covariance model 433 } 434 elsif ($self->cove_mode()) 435 { 436 $self->{main_cm_file_path}->{Domain} = $global_constants->get_subvalue("cove_cm", "bacteria"); # use bacterial covariance model 437 $self->{mainNS_cm_file_path}->{Domain} = $global_constants->get_subvalue("cove_cm", "bacteria-ns"); # no sec struct 438 } 439 $self->{isotype_cm_db_file_path} = $global_constants->get_subvalue("isotype_cm", "bacteria"); 440 } 441 elsif ($opts->arch_mode()) 442 { 443 my $nci_cms = $global_constants->get("nci_cm"); 444 foreach my $type (sort keys %$nci_cms) 445 { 446 $self->add_intron_cm_file_path($type, $nci_cms->{$type}); 447 } 448 $self->{arch_five_half_cm_file_path} = $global_constants->get_subvalue("cm", "arch_5h"); # model for finding 5'half 449 $self->{arch_three_half_cm_file_path} = $global_constants->get_subvalue("cm", "arch_3h"); # model for finding 3'half 450 if ($self->infernal_mode()) 451 { 452 $self->{main_cm_file_path}->{Domain} = $global_constants->get_subvalue("cm", "archaea"); # use archaea covariance model 453 $self->{main_cm_file_path}->{SeC} = $global_constants->get_subvalue("arch_cm", "SeC"); # Archaeal SeC model 454 $self->{mainNS_cm_file_path}->{Domain} = $global_constants->get_subvalue("cm", "archaea-ns"); # no sec struct 455 $self->{cove_cm_file_path} = $global_constants->get_subvalue("cove_cm", "archaea"); # use archaea covariance model 456 } 457 elsif ($opts->cove_mode()) 458 { 459 $self->{main_cm_file_path}->{Domain} = $global_constants->get_subvalue("cove_cm", "archaea"); # use archaea covariance model 460 $self->{mainNS_cm_file_path}->{Domain} = $global_constants->get_subvalue("cove_cm", "archaea-ns"); # no sec struct 461 } 462 $self->{isotype_cm_db_file_path} = $global_constants->get_subvalue("isotype_cm", "archaea"); 463 } 464 elsif ($opts->mito_mode()) 465 { 466 if ($self->infernal_mode()) 467 { 468 $isotype_cms = $global_constants->get("mito_cm_".$opts->mito_model()); 469 foreach my $isotype (sort keys %$isotype_cms) 470 { 471 $self->{main_cm_file_path}->{$isotype} = $isotype_cms->{$isotype}; 472 } 473 } 474 } 475 elsif ($opts->metagenome_mode()) 476 { 477 if ($self->infernal_mode()) 478 { 479 $self->{main_cm_file_path}->{general} = $global_constants->get_subvalue("cm", "general"); # use general covariance model 480 $self->{mainNS_cm_file_path}->{general} = $global_constants->get_subvalue("cm", "general-ns"); # no sec struct 481 $self->{main_cm_file_path}->{euk} = $global_constants->get_subvalue("cm", "eukayota"); # use eukayota covariance model 482 $self->{mainNS_cm_file_path}->{euk} = $global_constants->get_subvalue("cm", "eukayota-ns"); # no sec struct 483 $self->{main_cm_file_path}->{arch} = $global_constants->get_subvalue("cm", "archaea"); # use archaea covariance model 484 $self->{mainNS_cm_file_path}->{arch} = $global_constants->get_subvalue("cm", "archaea-ns"); # no sec struct 485 $self->{main_cm_file_path}->{bact} = $global_constants->get_subvalue("cm", "bacteria"); # use bacteria covariance model 486 $self->{mainNS_cm_file_path}->{bact} = $global_constants->get_subvalue("cm", "bacteria-ns"); # no sec struct 487 } 488 } 489 elsif ($opts->numt_mode()) 490 { 491 $isotype_cms = $global_constants->get("mito_cm_mammal"); 492 foreach my $isotype (sort keys %$isotype_cms) 493 { 494 $self->add_isotype_cm_file_path($isotype, $isotype_cms->{$isotype}); 495 } 496 } 497 elsif ($opts->general_mode()) 498 { 499 if ($self->infernal_mode()) 500 { 501 $self->{main_cm_file_path}->{general} = $global_constants->get_subvalue("cm", "general"); # use original covariance model 502 $self->{mainNS_cm_file_path}->{general} = $global_constants->get_subvalue("cm", "general-ns"); # no sec struct 503 $self->{cove_cm_file_path} = $global_constants->get_subvalue("cove_cm", "general"); # use original covariance model 504 } 505 elsif ($self->cove_mode()) 506 { 507 $self->{main_cm_file_path}->{general} = $global_constants->get_subvalue("cove_cm", "general"); # use original covariance model 508 $self->{mainNS_cm_file_path}->{general} = $global_constants->get_subvalue("cove_cm", "general-ns"); # no sec struct 509 } 510 } 511 elsif ($opts->organelle_mode()) 512 { 513 if ($self->infernal_mode()) 514 { 515 $self->{main_cm_file_path}->{general} = $global_constants->get_subvalue("cm", "general1415"); # use original covariance model 516 $self->{mainNS_cm_file_path}->{general} = $global_constants->get_subvalue("cm", "general1415-ns"); # no sec struct 517 $self->{cove_cm_file_path} = $global_constants->get_subvalue("cove_cm", "general"); # use original covariance model 518 } 519 elsif ($self->cove_mode()) 520 { 521 $self->{main_cm_file_path}->{general} = $global_constants->get_subvalue("cove_cm", "general"); # use original covariance model 522 $self->{mainNS_cm_file_path}->{general} = $global_constants->get_subvalue("cove_cm", "general-ns"); # no sec struct 523 } 524 } 525 elsif ($opts->alternate_mode()) 526 { 527 my $cms = $global_constants->get("alt_cm"); 528 foreach my $key (sort keys %$cms) 529 { 530 $self->add_main_cm_file_path($key, $cms->{$key}); 531 } 532 } 533 534 if ($self->infernal_mode()) 535 { 536 $self->{Pselc_cm_file_path} = $self->{main_cm_file_path}->{SeC}; 537 $self->{Eselc_cm_file_path} = $self->{main_cm_file_path}->{SeC}; 538 } 539 elsif ($self->cove_mode()) 540 { 541 $self->{Pselc_cm_file_path} = $global_constants->get_subvalue("cove_cm", "PSELC"); 542 $self->{Eselc_cm_file_path} = $global_constants->get_subvalue("cove_cm", "ESELC"); 543 } 544} 545 546sub check_lib_files 547{ 548 my $self = shift; 549 my ($opts) = @_; 550 551 foreach my $cur_cm (sort keys %{$self->{main_cm_file_path}}) 552 { 553 if ($self->{main_cm_file_path}->{$cur_cm} ne "" and !-r $self->{main_cm_file_path}->{$cur_cm}) 554 { 555 die "FATAL: Unable to open ".$self->{main_cm_file_path}->{$cur_cm}." covariance model file\n\n"; 556 } 557 } 558 559 foreach my $cur_cm (sort keys %{$self->{mainNS_cm_file_path}}) 560 { 561 if ($self->{mainNS_cm_file_path}->{$cur_cm} ne "" and !-r $self->{mainNS_cm_file_path}->{$cur_cm}) 562 { 563 die "FATAL: Unable to open ".$self->{mainNS_cm_file_path}->{$cur_cm}." covariance model file\n\n"; 564 } 565 } 566 if ($self->{cove_cm_file_path} ne "" and !-r $self->{cove_cm_file_path}) 567 { 568 die "FATAL: Unable to open ".$self->{cove_cm_file_path}." covariance model file\n\n"; 569 } 570 571 if ($self->{Pselc_cm_file_path} ne "" and !-r $self->{Pselc_cm_file_path}) 572 { 573 die "FATAL: Unable to open ".$self->{Pselc_cm_file_path}." covariance model file\n\n"; 574 } 575 576 if ($self->{Eselc_cm_file_path} ne "" and !-r $self->{Eselc_cm_file_path}) 577 { 578 die "FATAL: Unable to open ".$self->{Eselc_cm_file_path}." covariance model file\n\n"; 579 } 580 581 if ($self->{isotype_cm_db_file_path} ne "" and (!-r $self->{isotype_cm_db_file_path} || !-e "$self->{isotype_cm_db_file_path}".".i1f")) 582 { 583 die "FATAL: Unable to open ".$self->{isotype_cm_db_file_path}." covariance model file\n\n"; 584 } 585 586 if ($self->{mito_isotype_cm_db_file_path} ne "" and (!-r $self->{mito_isotype_cm_db_file_path} || !-e "$self->{mito_isotype_cm_db_file_path}".".i1f")) 587 { 588 die "FATAL: Unable to open ".$self->{mito_isotype_cm_db_file_path}." covariance model file\n\n"; 589 } 590 591 foreach my $isotype (sort keys %{$self->{isotype_cm_file_paths}}) 592 { 593 if ($self->{isotype_cm_file_paths}->{$isotype} ne "" and !-r $self->{isotype_cm_file_paths}->{$isotype}) 594 { 595 die "FATAL: Unable to open ".$self->{isotype_cm_file_paths}->{$isotype}." covariance model file\n\n"; 596 } 597 } 598 599 if ($opts->arch_mode() && ($self->infernal_mode() || $self->cove_mode())) 600 { 601 foreach my $cur_cm (sort keys %{$self->{intron_cm_file_path}}) 602 { 603 if ($self->{intron_cm_file_path}->{$cur_cm} ne "" and !-r $self->{intron_cm_file_path}->{$cur_cm}) 604 { 605 die "FATAL: Unable to open ".$self->{intron_cm_file_path}->{$cur_cm}." covariance model file\n\n"; 606 } 607 } 608 if ($self->{arch_five_half_cm_file_path} ne "" and !-r $self->{arch_five_half_cm_file_path}) 609 { 610 die "FATAL: Unable to open ".$self->{arch_five_half_cm_file_path}." covariance model file\n\n"; 611 } 612 if ($self->{arch_three_half_cm_file_path} ne "" and !-r $self->{arch_three_half_cm_file_path}) 613 { 614 die "FATAL: Unable to open ".$self->{arch_three_half_cm_file_path}." covariance model file\n\n"; 615 } 616 } 617} 618 619sub set_bin 620{ 621 my $self = shift; 622 my $bindir = shift; 623 624 if ($^O =~ /^MSWin/) 625 { 626 $self->{cmsearch_bin} .= ".exe"; 627 $self->{covels_bin} .= ".exe"; 628 $self->{coves_bin} .= ".exe"; 629 } 630 if (!(-x $self->{covels_bin})) 631 { 632 $self->{covels_bin} = $bindir."/".$self->{covels_bin}; 633 if (!(-x $self->{covels_bin})) 634 { 635 die "FATAL: Unable to find ".$self->{covels_bin}." executable\n\n"; 636 } 637 } 638 if (!(-x $self->{coves_bin})) 639 { 640 $self->{coves_bin} = $bindir."/".$self->{coves_bin}; 641 if (!(-x $self->{coves_bin})) 642 { 643 die "FATAL: Unable to find ".$self->{coves_bin}." executable\n\n"; 644 } 645 } 646} 647 648sub set_infernal_bin 649{ 650 my $self = shift; 651 my $bindir = shift; 652 653 $self->{cmsearch_bin} = $bindir."/".$self->{cmsearch_bin}; 654 if (!(-x $self->{cmsearch_bin})) 655 { 656 die "FATAL: Unable to find ".$self->{cmsearch_bin}." executable\n\n"; 657 } 658 659 $self->{cmscan_bin} = $bindir."/".$self->{cmscan_bin}; 660 if (!(-x $self->{cmscan_bin})) 661 { 662 die "FATAL: Unable to find ".$self->{cmscan_bin}." executable\n\n"; 663 } 664} 665 666sub set_search_params 667{ 668 my $self = shift; 669 my ($opts, $r_scan_len, $r_cur_cm_file, $max_search_tRNA_length, $trna) = @_; 670 671 # don't set '-W' param over 200 bp if a pre-scanner is being used, 672 # use max window of 150 bp if cmsearch only (too slow otherwise) 673 if ($opts->eufind_mode() || $opts->tscan_mode() || $opts->use_prev_ts_run()) 674 { 675 $$r_scan_len = &min($trna->len(), $self->{max_tRNA_length}); 676 } 677 else 678 { 679 $$r_scan_len = $max_search_tRNA_length; 680 } 681 682 # set correct CM file for current tRNA 683 $$r_cur_cm_file = $self->{main_cm_file_path}->{Domain}; 684 if ($opts->general_mode() or $opts->organelle_mode()) 685 { 686 $$r_cur_cm_file = $self->{main_cm_file_path}->{general}; 687 } 688 689 if ($opts->eufind_mode()) 690 { 691 # use prok selcys model 692 if ($trna->isotype() eq "SeCp") 693 { 694 $$r_cur_cm_file = $self->{Pselc_cm_file_path}; 695 } 696 # use euk selcys model 697 elsif ($trna->isotype() eq "SeCe") 698 { 699 $$r_cur_cm_file = $self->{Eselc_cm_file_path}; 700 } 701 } 702} 703 704# find anticodon loop & a-codon from coves or cmsearch output 705sub find_anticodon 706{ 707 my $self = shift; 708 my ($opts, $trna, $gc) = @_; 709 my ($antiloop, $antiloop_end, $ac_index, $anticodon, $verify_ac); 710 711 my $seq = $trna->seq(); 712 my $ss = $trna->ss(); 713 my $model = $trna->model(); 714 my $undef_anticodon = $gc->undef_anticodon(); 715 716 my $antiloop_index = 0; 717 my $antiloop_len = 0; 718 719 # Match pattern in secondary structure output, 720 # looking for second stem-loop structure ">>>>...<<<<" 721 # that should be the anitocodon stem-loop 722 723 if ($opts->mito_mode() and $model eq "SerGCT") 724 { 725 if ($ss =~ /^([>.]+)>([.]{4,})<+.+[>.]+<[<.]+/o) 726 { 727 # set to index position of first base in anticodon loop 728 $antiloop_index = length($1) + 1; 729 $antiloop_len = length($2); # anticodon loop length 730 } 731 } 732 elsif ($opts->mito_mode()) 733 { 734 if ($ss =~ /^([>.]+<[<.]+[>.]*)>([.]{4,})<+.+[>.]+<[<.]+/o) 735 { 736 # set to index position of first base in anticodon loop 737 $antiloop_index = length($1) + 1; 738 $antiloop_len = length($2); # anticodon loop length 739 } 740 } 741 elsif ($ss =~ /^([>.]+<[<.]+>[>.]*)>([.]{4,})<+.+[>.]+<[<.]+/o) 742 { 743 # set to index position of first base in anticodon loop 744 $antiloop_index = length($1) + 1; 745 $antiloop_len = length($2); # anticodon loop length 746 } 747 748 if ($antiloop_index != 0 and $antiloop_len != 0) 749 { 750 # index of end of anticodon loop 751 $antiloop_end = $antiloop_index + $antiloop_len - 1; 752 753 $antiloop = substr($seq, $antiloop_index, $antiloop_len); 754 755 # remove '-' gaps from loop 756 $antiloop =~ s/[\-]//g; 757 758 # Don't guess if even number of bp in 759 # anticodon loop 760 if (!$opts->mito_mode()) 761 { 762 # remove introns & non-canonical bases 763 $antiloop =~ s/[a-z]//g; 764 765 if ((length($antiloop) < 5) || ((length($antiloop) % 2) == 0)) 766 { 767 return ($undef_anticodon, -1, -1, -1); 768 } 769 # get anticodon 770 $ac_index = (length($antiloop) - 3) / 2; 771 $anticodon = substr($antiloop, $ac_index, 3); 772 $verify_ac = substr($seq, $ac_index + $antiloop_index, 3); 773 } 774 else 775 { 776 $antiloop = uc($antiloop); 777 if (length($antiloop) < 5) 778 { 779 $trna->category("undetermined_ac"); 780 return ($undef_anticodon, -1, -1, -1); 781 } 782 elsif ((length($antiloop) % 2) == 0) 783 { 784 $ac_index = int((length($antiloop) - 3) / 2); 785 $anticodon = substr($antiloop, $ac_index, 3); 786 my $isotype = $gc->get_tRNA_type($self, $anticodon, $self->{main_cm_file_path}->{$model}, $model, $self->cove_mode()); 787 if ($model eq $isotype or ($model eq "SerGCT" and $isotype eq "Ser") or ($model eq "SerTGA" and $isotype eq "Ser") 788 or ($model eq "LeuTAG" and $isotype eq "Leu") or ($model eq "LeuTAA" and $isotype eq "Leu")) 789 { 790 $trna->category("mito_ac_mislocation"); 791 $trna->note("(".$anticodon.")"); 792 } 793 else 794 { 795 $ac_index = int((length($antiloop) - 3) / 2 + 1); 796 $anticodon = substr($antiloop, $ac_index, 3); 797 $isotype = $gc->get_tRNA_type($self, $anticodon, $self->{main_cm_file_path}->{$model}, $model, $self->cove_mode()); 798 if ($model eq $isotype or ($model eq "SerGCT" and $isotype eq "Ser") or ($model eq "SerTGA" and $isotype eq "Ser") 799 or ($model eq "LeuTAG" and $isotype eq "Leu") or ($model eq "LeuTAA" and $isotype eq "Leu")) 800 { 801 $trna->category("mito_ac_mislocation"); 802 $trna->note("(".$anticodon.")"); 803 } 804 else 805 { 806 $trna->category("mito_mismatch_ac"); 807 $trna->note("(".$anticodon.")"); 808 } 809 } 810 return ($undef_anticodon, -1, -1, -1); 811 } 812 else 813 { 814 # get anticodon 815 $ac_index = (length($antiloop) - 3) / 2; 816 $anticodon = substr($antiloop, $ac_index, 3); 817 $verify_ac = uc(substr($seq, $ac_index + $antiloop_index, 3)); 818 } 819 } 820 821 # check to see if anticodon extracted from the entire 822 # trna sequence (coveseq) is same as that extracted from 823 # just the anticodon loop sequence (antiloop) 824 825 if ($verify_ac ne $anticodon) 826 { 827 $trna->category("undetermined_ac"); 828 return ($undef_anticodon, -1, -1, -1); 829 } 830 return ($anticodon, $antiloop_index, $antiloop_end, $ac_index + $antiloop_index + 1); 831 } 832 else 833 { 834 $trna->category("undetermined_ac"); 835 return ($undef_anticodon, -1, -1, -1); 836 } 837} 838 839sub find_intron 840{ 841 my $self = shift; 842 my ($trna_seq, $antiloop_index, $antiloop_end) = @_; 843 my ($intron, $istart, $iend, $tmpstr, $antiloop_seq); 844 my $min_intron_length = $self->{min_intron_length}; 845 846 # check to see if it was unable 847 # to determine the anticodon loop 848 if ($antiloop_index == -1) 849 { 850 return(0, 0, 0); 851 } 852 # get subsequence from start of anticodon loop 853 # to end of anticodon loop -- look for intron in it 854 $antiloop_seq = substr($trna_seq, $antiloop_index, $antiloop_end - $antiloop_index + 1); 855 856 if ($antiloop_seq =~ /^(.*[^a-z]+)([a-z]{$min_intron_length,})[^a-z]+/o) 857 { 858 $intron = $2; 859 860 # make sure to get the base index for the last (not nec. only) occurrence 861 # of the intron sequence string up to end of anticodon loop 862 $tmpstr = substr($trna_seq, 0, $antiloop_end+1); 863 $istart = index($tmpstr, $intron) + 1; 864 $iend = length($intron) + $istart - 1; 865 } 866 else 867 { 868 $intron = 0; 869 } 870 return ($intron, $istart, $iend); 871} 872 873# is_pseudo_gene 874# 875# Runs a covariance model without secondary structure information on predicted tRNA, puts this value 876# in "hmm_score". Contribution to total score from secondary structure derived by subtracting hmm_score from total score 877# Returns non-zero if tRNA scores fall below minima for either primary or secondary structure components of score 878sub is_pseudo_gene 879{ 880 my $self = shift; 881 my $global_constants = shift; 882 my $opts = shift; 883 my $log = shift; 884 my ($trna, $prescan_tRNA_id, $tmp_trnaseq_file) = @_; 885 my ($dummy1, $dummy2, $hmm_score, $hit_start, $hit_end, $hit_ct, $min_pseudo_filter_score); 886 887 $dummy1 = $dummy2 = ""; # return values not used 888 889 # skip check for pseudo gene if score is above minimum 890 # -D (disable pseudogene checking) is specified 891 # AND -H option (get hmm scores) is NOT specified 892 if ($self->cove_mode()) 893 { 894 $min_pseudo_filter_score = $self->{min_cove_pseudo_filter_score}; 895 } 896 elsif ($self->infernal_mode()) 897 { 898 $min_pseudo_filter_score = $self->{min_cmsearch_pseudo_filter_score}; 899 } 900 901 if ((($trna->score() >= $min_pseudo_filter_score) || $self->{skip_pseudo_filter}) && !$self->{get_hmm_score}) 902 { 903 return 0; 904 } 905 906 $hmm_score = 0; 907 my $ss_score = 0; 908 my $score = 0; 909 my $ns_cm = ""; 910 if ($self->cove_mode()) 911 { 912 $ns_cm = $self->{mainNS_cm_file_path}->{Domain}; 913 if ($opts->general_mode() or $opts->organelle_mode()) 914 { 915 $ns_cm = $self->{mainNS_cm_file_path}->{general} 916 } 917 918 ($dummy1, $dummy2, $hmm_score) = 919 $self->run_coves($log, $tmp_trnaseq_file, $prescan_tRNA_id, $ns_cm); 920 $score = $trna->get_domain_model("cove")->{score}; 921 $ss_score = $score - $hmm_score; 922 $trna->update_domain_model("cove", $score, $score, $hmm_score, $ss_score); 923 } 924 elsif ($self->infernal_mode()) 925 { 926 $ns_cm = $self->{mainNS_cm_file_path}->{$trna->model()}; 927 my $cms_output_file = $global_constants->get("temp_dir")."/tscan$$"."_cm_hmm.out"; 928 929 ($hmm_score, $hit_start, $hit_end, $hit_ct) = 930 $self->cmsearch_scoring($opts, $log, $tmp_trnaseq_file, $prescan_tRNA_id, $ns_cm, $cms_output_file); 931 $score = $trna->get_domain_model("infernal")->{score}; 932 $ss_score = $score - $hmm_score; 933 $trna->update_domain_model("infernal", $score, $score, $hmm_score, $ss_score); 934 } 935 else 936 { 937 return -1; # Error - no second pass scanner selected 938 } 939 940 if ((($ss_score < $self->{min_ss_score}) || ($hmm_score < $self->{min_hmm_score})) && 941 ($score < $min_pseudo_filter_score)) 942 { 943 return 1; 944 } 945} 946 947# Get anticodon, isotype, intron location, and pseudogene status for tRNA with noncanonical introns 948sub decode_nci_tRNA_properties 949{ 950 my $self = shift; 951 my ($global_vars, $trna) = @_; 952 my $global_constants = $global_vars->{global_constants}; 953 my $opts = $global_vars->{options}; 954 my $gc = $global_vars->{gc}; 955 my $log = $global_vars->{log_file}; 956 957 my ($anticodon, $acodon_index, $acodon_end1, $acodon_index2, $trna_type, $intron, $istart, $iend, 958 $antiloop_index, $antiloop_end, $trna_len, $scan_len); 959 960 $anticodon = "ERR"; 961 962 my $mat_seq = ""; 963 my $precursor_seq = $trna->seq(); 964 my @introns = $trna->ar_introns(); 965 for (my $i = 0; $i < scalar(@introns); $i++) 966 { 967 if ($i == 0) 968 { 969 $mat_seq = substr($precursor_seq, 0, $introns[$i]->{rel_start} - 1); 970 } 971 else 972 { 973 $mat_seq .= substr($precursor_seq, $introns[$i-1]->{rel_end}, $introns[$i]->{rel_start} - $introns[$i-1]->{rel_end} - 1); 974 } 975 } 976 $mat_seq .= substr($precursor_seq, $introns[scalar(@introns)-1]->{rel_end}); 977 978 if (uc($mat_seq) eq uc($trna->mat_seq())) 979 { 980 $trna->seq($trna->mat_seq()); 981 } 982 else 983 { 984 $log->warning("Mismatch mature sequence for tRNA for noncanonical intron\t". $trna->tRNAscan_id()); 985 } 986 987 if ($self->cove_mode() || $self->infernal_mode()) 988 { 989 ($anticodon, $antiloop_index, $antiloop_end, $acodon_index) = $self->find_anticodon($opts, $trna, $gc); 990 $acodon_index2 = 0; 991 $trna->anticodon($anticodon); 992 993 for (my $i = 0; $i < scalar(@introns); $i++) 994 { 995 if ($acodon_index > $introns[$i]->{rel_start}) 996 { 997 $acodon_index += ($introns[$i]->{rel_end} - $introns[$i]->{rel_start} + 1); 998 } 999 elsif ($acodon_index < $introns[$i]->{rel_start} and ($acodon_index + 2) >= $introns[$i]->{rel_start}) 1000 { 1001 $acodon_end1 = $introns[$i]->{rel_start} - 1; 1002 $acodon_index2 = $introns[$i]->{rel_end} + 1; 1003 } 1004 elsif ($acodon_index + 2 < $introns[$i]->{rel_start}) 1005 { 1006 last; 1007 } 1008 } 1009 if ($trna->get_ac_pos_count() > 0) 1010 { 1011 $trna->remove_ac_pos(0); 1012 } 1013 if ($acodon_index2 == 0) 1014 { 1015 $trna->add_ac_pos($acodon_index, $acodon_index + 2); 1016 } 1017 else 1018 { 1019 $trna->add_ac_pos($acodon_index, $acodon_end1); 1020 $trna->add_ac_pos($acodon_index2, (3 - ($acodon_end1 - $acodon_index + 1)) + $acodon_index2 - 1); 1021 } 1022 } 1023 else 1024 { 1025 die "Second pass mode not selected -- can't decode tRNA type\n\n"; 1026 } 1027 1028 # check for problem parsing anticodon loop 1029 if (($anticodon eq $gc->undef_anticodon()) || ($trna->seq() eq 'Error')) 1030 { 1031 $trna->anticodon($gc->undef_anticodon()); 1032 $trna->isotype($gc->undef_isotype()); 1033 $intron = 0; 1034 1035 if ($opts->save_odd_struct()) 1036 { 1037 open(ODDTRNA, ">>".$opts->odd_struct_file()) || 1038 die "FATAL: Can't open ".$opts->odd_struct_file()." to save seconary structures\n\n"; 1039 print ODDTRNA $trna->tRNAscan_id()." (".$trna->start()."-".$trna->end()."):\n".$trna->seq()."\n".$trna->ss()."\n\n"; 1040 close(ODDTRNA); 1041 } 1042 } 1043 # continue tRNA struct parsing 1044 else 1045 { 1046 ($intron, $istart, $iend) = $self->find_intron($trna->seq(), $antiloop_index, $antiloop_end); 1047 1048 if ($intron) 1049 { 1050 $mat_seq = substr($trna->seq(), 0, $istart - 1).substr($trna->seq(), $iend); 1051 $trna->mat_ss(substr($trna->ss(), 0, $istart - 1).substr($trna->ss(), $iend)); 1052 $trna->ss($trna->mat_ss()); 1053 1054 for (my $i = 0; $i < scalar(@introns); $i++) 1055 { 1056 if ($istart > $introns[$i]->{rel_start}) 1057 { 1058 $istart += ($introns[$i]->{rel_end} - $introns[$i]->{rel_start} + 1); 1059 $iend += ($introns[$i]->{rel_end} - $introns[$i]->{rel_start} + 1); 1060 } 1061 elsif ($iend < $introns[$i]->{rel_start}) 1062 { 1063 last; 1064 } 1065 } 1066 my $intron_start = 0; 1067 my $intron_end = 0; 1068 if ($trna->strand() eq "+") 1069 { 1070 $intron_start = $istart + $trna->start() - 1; 1071 $intron_end = $iend + $trna->start() - 1 1072 } 1073 else 1074 { 1075 $intron_start = $trna->end() - $iend + 1; 1076 $intron_end = $trna->end() - $istart + 1; 1077 } 1078 $trna->add_intron($istart, $iend, $intron_start, $intron_end, "CI", $intron); 1079 $trna->merge_introns(); 1080 } 1081 1082 $trna->isotype($gc->get_tRNA_type($self, $trna->anticodon(), $self->{main_cm_file_path}->{$trna->model()}, $trna->model(), $self->cove_mode())); 1083 } 1084 1085 # Reset CI type if marked as NCI 1086 @introns = $trna->ar_introns(); 1087 my $has_CI = 0; 1088 for (my $i = 0; $i < scalar(@introns); $i++) 1089 { 1090 if ($introns[$i]->{type} eq "CI") 1091 { 1092 $has_CI = 1; 1093 last; 1094 } 1095 } 1096 1097 for (my $i = 0; !$has_CI and $i < scalar(@introns); $i++) 1098 { 1099 if ($acodon_index2 == 0) 1100 { 1101 if ($acodon_index + 2 + 2 == $introns[$i]->{rel_start} and $introns[$i]->{type} eq "NCI") 1102 { 1103 $trna->set_intron($i, $introns[$i]->{rel_start}, $introns[$i]->{rel_end}, "CI", $introns[$i]->{seq}); 1104 last; 1105 } 1106 } 1107 else 1108 { 1109 if (((3 - ($acodon_end1 - $acodon_index + 1)) + $acodon_index2 - 1 + 2) == $introns[$i]->{rel_start} and $introns[$i]->{type} eq "NCI") 1110 { 1111 $trna->set_intron($i, $introns[$i]->{rel_start}, $introns[$i]->{rel_end}, "CI", $introns[$i]->{seq}); 1112 last; 1113 } 1114 } 1115 } 1116 1117 # Write current tRNA to temp file for re-analysis with other models 1118 $trna_len = length($trna->seq()); 1119 &write_tRNA($global_constants->get("tmp_trnaseq_file"), $trna->tRNAscan_id(), "", $trna->seq(), 1); 1120 1121 if (($trna->model() ne "SeC") && 1122 ($self->is_pseudo_gene($global_constants, $opts, $log, $trna, $trna->tRNAscan_id(), $global_constants->get("tmp_trnaseq_file"))) && 1123 (!$self->{skip_pseudo_filter})) 1124 { 1125 $trna->pseudo(1); 1126 } 1127 else 1128 { 1129 $trna->pseudo(0); 1130 } 1131 1132 $trna->mat_seq($mat_seq); 1133 $trna->seq($precursor_seq); 1134} 1135 1136# Get tRNA anticodon, isotype, intron location, and pseudogene status 1137sub decode_tRNA_properties 1138{ 1139 my $self = shift; 1140 my ($global_vars, $trna, $prescan_tRNA, $cur_cm_file) = @_; 1141 my $global_constants = $global_vars->{global_constants}; 1142 my $opts = $global_vars->{options}; 1143 my $gc = $global_vars->{gc}; 1144 my $log = $global_vars->{log_file}; 1145 1146 my ($anticodon, $acodon_index, $trna_type, $intron, $istart, $iend, 1147 $antiloop_index, $antiloop_end, $trna_len, $scan_len); 1148 1149 $anticodon = "ERR"; 1150 1151 if ($self->cove_mode() || $self->infernal_mode()) 1152 { 1153 ($anticodon, $antiloop_index, $antiloop_end, $acodon_index) = $self->find_anticodon($opts, $trna, $gc); 1154 $trna->anticodon($anticodon); 1155 $trna->add_ac_pos($acodon_index, $acodon_index + 2); 1156 } 1157 else 1158 { 1159 die "Second pass mode not selected -- can't decode tRNA type\n\n"; 1160 } 1161 1162 # check for problem parsing anticodon loop 1163 if (($anticodon eq $gc->undef_anticodon()) || ($trna->seq() eq 'Error')) 1164 { 1165 $trna->anticodon($gc->undef_anticodon()); 1166 $trna->isotype($gc->undef_isotype()); 1167 $intron = 0; 1168 1169 if ($opts->save_odd_struct()) 1170 { 1171 open(ODDTRNA, ">>".$opts->odd_struct_file()) || 1172 die "FATAL: Can't open ".$opts->odd_struct_file()." to save seconary structures\n\n"; 1173 print ODDTRNA $prescan_tRNA->tRNAscan_id()." (".$trna->start()."-".$trna->end()."):\n".$trna->seq()."\n".$trna->ss()."\n\n"; 1174 close(ODDTRNA); 1175 } 1176 } 1177 # continue tRNA struct parsing 1178 else 1179 { 1180 ($intron, $istart, $iend) = $self->find_intron($trna->seq(), $antiloop_index, $antiloop_end); 1181 1182 if ($intron) 1183 { 1184 my $intron_start = 0; 1185 my $intron_end = 0; 1186 if ($trna->strand() eq "+") 1187 { 1188 $intron_start = $istart + $trna->start() - 1; 1189 $intron_end = $iend + $trna->start() - 1 1190 } 1191 else 1192 { 1193 $intron_start = $trna->end() - $iend + 1; 1194 $intron_end = $trna->end() - $istart + 1; 1195 } 1196 $trna->add_intron($istart, $iend, $intron_start, $intron_end, "CI", $intron); 1197 } 1198 1199 if (defined $prescan_tRNA->anticodon() and ($prescan_tRNA->anticodon() ne "???")) 1200 { 1201 if (($anticodon ne (uc($prescan_tRNA->anticodon()))) && 1202 ($opts->tscan_mode() || $opts->eufind_mode()) && ($opts->strict_params())) 1203 { 1204 $log->broadcast($prescan_tRNA->tRNAscan_id()." - anticondon conflict\t".$opts->second_pass_label().": $anticodon\tfirstpass (".$prescan_tRNA->hit_source().")". 1205 ": ".$prescan_tRNA->anticodon()."\n".$trna->seq()."\n".$trna->ss()."\n"); 1206 } 1207 } 1208 1209 $trna->isotype($gc->get_tRNA_type($self, $trna->anticodon(), $cur_cm_file, $trna->model(), $self->cove_mode())); 1210 1211 if ($anticodon ne "TCA" and $trna->isotype() eq "SeC") 1212 { 1213 $trna->anticodon($gc->undef_anticodon()); 1214 $trna->isotype($gc->undef_isotype()); 1215 } 1216 } 1217 1218 # Write current tRNA to temp file for re-analysis with other models 1219 $trna_len = length($trna->seq()); 1220 &write_tRNA($global_constants->get("tmp_trnaseq_file"), $prescan_tRNA->tRNAscan_id(), "", $trna->seq(), 1); 1221 1222 if (($trna->model() ne "SeC") && 1223 ($self->is_pseudo_gene($global_constants, $opts, $log, $trna, $prescan_tRNA->tRNAscan_id(), $global_constants->get("tmp_trnaseq_file"))) && 1224 (!$self->{skip_pseudo_filter})) 1225 { 1226 $trna->pseudo(1); 1227 } 1228} 1229 1230# Fix fMet with missing first base 1231sub fix_fMet 1232{ 1233 my $self = shift; 1234 my ($global_vars, $trna) = @_; 1235 my $opts = $global_vars->{options}; 1236 my $rescore = 0; 1237 1238 my $inf = $trna->get_domain_model("infernal"); 1239 1240 if ($inf->{score} > 40 and $trna->isotype() eq "Met" and $opts->bact_mode()) 1241 { 1242 if ((substr($trna->seq(), length($trna->seq())-3) eq "CCA" and substr($trna->ss(), length($trna->ss())-5) eq ".....") or 1243 (substr($trna->ss(), length($trna->ss())-5) eq "<<<..")) 1244 { 1245 if (substr($trna->ss(), 0, 1) ne ".") 1246 { 1247 if (($trna->strand() eq "+" and $trna->start() > 1) or 1248 ($trna->strand() eq "-" and $trna->end() < $trna->src_seqlen())) 1249 { 1250 $trna->seq(substr($trna->upstream(), length($trna->upstream())-1) . $trna->seq()); 1251 $trna->upstream(substr($trna->upstream(), 0, length($trna->upstream())-1)); 1252 $trna->ss(".".$trna->ss()); 1253 if ($trna->strand() eq "+") 1254 { 1255 $trna->start($trna->start() - 1); 1256 } 1257 else 1258 { 1259 $trna->end($trna->end() + 1); 1260 } 1261 my @ar_ac_pos = $trna->ar_ac_pos(); 1262 if (scalar(@ar_ac_pos) > 0) 1263 { 1264 $ar_ac_pos[0]->{rel_start} += 1; 1265 $ar_ac_pos[0]->{rel_end} += 1; 1266 $trna->ar_ac_pos(@ar_ac_pos); 1267 } 1268 1269 $rescore = 1; 1270 } 1271 } 1272 elsif (substr($trna->ss(), 0, 4) eq ".>.>" and substr($trna->seq(), 0, 2) eq "CG") 1273 { 1274 my $pos71 = ""; 1275 if (substr($trna->seq(), length($trna->seq())-3) eq "CCA" and substr($trna->ss(), length($trna->ss())-5) eq ".....") 1276 { 1277 $pos71 = substr($trna->seq(), length($trna->ss())-6, 1); 1278 } 1279 elsif (substr($trna->ss(), length($trna->ss())-5) eq "<<<..") 1280 { 1281 $pos71 = substr($trna->seq(), length($trna->ss())-3, 1); 1282 } 1283 if (&rev_comp_seq(uc($pos71)) eq uc(substr($trna->seq(), 2, 1))) 1284 { 1285 $trna->seq(substr($trna->seq(), 1, 1).uc(substr($trna->seq(), 2, 1)).substr($trna->seq(), 3)); 1286 $trna->ss(substr($trna->ss(), 0, 2).substr($trna->ss(), 3)); 1287 if ($trna->strand() eq "+") 1288 { 1289 $trna->start($trna->start() + 1); 1290 } 1291 else 1292 { 1293 $trna->end($trna->end() - 1); 1294 } 1295 my @ar_ac_pos = $trna->ar_ac_pos(); 1296 if (scalar(@ar_ac_pos) > 0) 1297 { 1298 $ar_ac_pos[0]->{rel_start} -= 1; 1299 $ar_ac_pos[0]->{rel_end} -= 1; 1300 $trna->ar_ac_pos(@ar_ac_pos); 1301 } 1302 $rescore = 1; 1303 } 1304 } 1305 } 1306 } 1307 1308 if ($rescore) 1309 { 1310 $self->rescore_tRNA($global_vars, $trna, $trna); 1311 } 1312} 1313 1314# Fix archaeal His for incorrect bulge 1315sub fix_His 1316{ 1317 my $self = shift; 1318 my ($global_vars, $trna) = @_; 1319 my $opts = $global_vars->{options}; 1320 my $rescore = 0; 1321 1322 my $inf = $trna->get_domain_model("infernal"); 1323 1324 if ($inf->{score} > 35 and $trna->isotype() eq "His" and $opts->arch_mode()) 1325 { 1326 my $pos5 = ""; 1327 my $pos68 = ""; 1328 my $mid = ""; 1329 if (substr($trna->ss(), 0, 9) eq ">>>>.>>>." and substr($trna->ss(), length($trna->ss())-9) eq "<<<.<<<<.") 1330 { 1331 $pos5 = uc(substr($trna->seq(), 4, 1)); 1332 $pos68 = uc(substr($trna->seq(), length($trna->seq())-6, 1)); 1333 $mid = substr($trna->seq(), 5, length($trna->seq())-11); 1334 } 1335 if (($pos5 eq "A" and $pos68 eq "T") or ($pos5 eq "T" and $pos68 eq "A") or 1336 ($pos5 eq "G" and $pos68 eq "C") or ($pos5 eq "C" and $pos68 eq "G") or 1337 ($pos5 eq "G" and $pos68 eq "T") or ($pos5 eq "T" and $pos68 eq "G")) 1338 { 1339 $trna->upstream($trna->upstream().substr($trna->seq(), 0, 1)); 1340 $trna->downstream(substr($trna->seq(), length($trna->seq()) - 1).$trna->downstream()); 1341 $trna->seq(substr($trna->seq(), 1, 3).$pos5.$mid.$pos68.substr($trna->seq(), length($trna->seq())-5, 4)); 1342 $trna->ss(substr($trna->ss(), 1, 3).">".substr($trna->ss(), 5, length($trna->ss())-11)."<".substr($trna->ss(), length($trna->ss())-5, 3)."."); 1343 $trna->start($trna->start() + 1); 1344 $trna->end($trna->end() - 1); 1345 my @ar_ac_pos = $trna->ar_ac_pos(); 1346 if (scalar(@ar_ac_pos) > 0) 1347 { 1348 for (my $i = 0; $i < scalar(@ar_ac_pos); $i++) 1349 { 1350 $ar_ac_pos[$i]->{rel_start} -= 1; 1351 $ar_ac_pos[$i]->{rel_end} -= 1; 1352 } 1353 $trna->ar_ac_pos(@ar_ac_pos); 1354 } 1355 $self->rescore_tRNA($global_vars, $trna, $trna); 1356 } 1357 } 1358} 1359 1360# Get tRNA anticodon, isotype, and intron location 1361sub decode_mito_tRNA_properties 1362{ 1363 my $self = shift; 1364 my ($global_vars, $trna, $prescan_tRNA, $cur_cm_file) = @_; 1365 my $global_constants = $global_vars->{global_constants}; 1366 my $opts = $global_vars->{options}; 1367 my $gc = $global_vars->{gc}; 1368 my $log = $global_vars->{log_file}; 1369 1370 my ($anticodon, $acodon_index, $trna_type, $intron, $istart, $iend, 1371 $antiloop_index, $antiloop_end, $trna_len, $scan_len); 1372 1373 $anticodon = "ERR"; 1374 1375 ($anticodon, $antiloop_index, $antiloop_end, $acodon_index) = $self->find_anticodon($opts, $trna, $gc); 1376 $trna->anticodon($anticodon); 1377 $trna->add_ac_pos($acodon_index, $acodon_index + 2); 1378 1379 # check for problem parsing anticodon loop 1380 if (($anticodon eq $gc->undef_anticodon()) || ($trna->seq() eq 'Error')) 1381 { 1382 $trna->anticodon($gc->undef_anticodon()); 1383 $intron = 0; 1384 1385 if ($opts->save_odd_struct()) 1386 { 1387 open(ODDTRNA, ">>".$opts->odd_struct_file()) || 1388 die "FATAL: Can't open ".$opts->odd_struct_file()." to save seconary structures\n\n"; 1389 print ODDTRNA $prescan_tRNA->tRNAscan_id()." (".$trna->start()."-".$trna->end()."):\n".$trna->seq()."\n".$trna->ss()."\n\n"; 1390 close(ODDTRNA); 1391 } 1392 } 1393 1394 ($intron, $istart, $iend) = $self->find_intron($trna->seq(), $antiloop_index, $antiloop_end); 1395 1396 if ($intron) 1397 { 1398 my $intron_start = 0; 1399 my $intron_end = 0; 1400 if ($trna->strand() eq "+") 1401 { 1402 $intron_start = $istart + $trna->start() - 1; 1403 $intron_end = $iend + $trna->start() - 1 1404 } 1405 else 1406 { 1407 $intron_start = $trna->end() - $iend + 1; 1408 $intron_end = $trna->end() - $istart + 1; 1409 } 1410 } 1411 1412 my $isotype = $gc->get_tRNA_type($self, $trna->anticodon(), $cur_cm_file, $trna->model(), 0); 1413 my $vert_mito_isotype = $gc->get_vert_mito_type($trna->anticodon()); 1414 1415 my $model_iso = $trna->model(); 1416 my $model_ac = ""; 1417 1418 if (length($trna->model()) > 3) 1419 { 1420 $model_iso = substr($trna->model(), 0, 3); 1421 $model_ac = substr($trna->model(), 3); 1422 } 1423 if ($model_iso ne $isotype) 1424 { 1425 if ($trna->category() eq "") 1426 { 1427 $trna->category("mito_iso_conflict"); 1428 $trna->note("(".$isotype.")"); 1429 } 1430 $log->broadcast($prescan_tRNA->tRNAscan_id().".trna".$trna->id()." - isotype/anticondon conflict\t Model: ".$trna->model()." Detected: ".$isotype.$anticodon."\n". 1431 $trna->seq()."\n".$trna->ss()."\n"); 1432 } 1433 if ($model_ac ne "" and $model_ac ne $anticodon) 1434 { 1435 if ($trna->category() eq "") 1436 { 1437 $trna->category("mito_mismatch_ac"); 1438 $trna->note("(".$model_ac.")"); 1439 } 1440 } 1441 if ($vert_mito_isotype eq "") 1442 { 1443 if ($trna->category() eq "") 1444 { 1445 $trna->category("mito_noncanonical_ac"); 1446 } 1447 } 1448 1449 $trna->isotype($model_iso); 1450} 1451 1452sub scan_noncanonical_introns 1453{ 1454 my $self = shift; 1455 my ($global_vars, $seq_name) = @_; 1456 my $global_constants = $global_vars->{global_constants}; 1457 my $opts = $global_vars->{options}; 1458 my $log = $global_vars->{log_file}; 1459 my $sp_int_results = $global_vars->{sp_int_results}; 1460 my $sp_tRNAs = $global_vars->{sp_tRNAs}; 1461 my $tmp_trnaseq_file = $global_constants->get("tmp_trnaseq_file"); 1462 1463 my $tRNA = tRNAscanSE::tRNA->new; 1464 my $tRNA_copy = tRNAscanSE::tRNA->new; 1465 my $cm_intron = tRNAscanSE::tRNA->new; 1466 my $cm_intron2 = tRNAscanSE::tRNA->new; 1467 my $previous_intron_len = 0; 1468 my $rnd2 = 0; 1469 my $add_ci = 0; 1470 1471 my $arrayCMscanResults = tRNAscanSE::ArrayCMscanResults->new; 1472 my $cms_merge_file_rnd1 = $global_constants->get("temp_dir")."/tscan$$"."_intron_merge.out"; 1473 my $count = $self->run_cmsearch_intron($global_vars, $seq_name, $arrayCMscanResults, 0, $cms_merge_file_rnd1, 1); 1474 if ($cms_merge_file_rnd1 eq "") 1475 { 1476 $log->error("Fail to run infernal for $seq_name"); 1477 return 0; 1478 } 1479 1480 $sp_int_results->clear_index(); 1481 $sp_int_results->open_file("write"); 1482 1483 $arrayCMscanResults->open_file($cms_merge_file_rnd1); 1484 $arrayCMscanResults->get_next_cmsearch_hit($cm_intron); 1485 1486 for (my $i = 0; $i < $sp_tRNAs->get_count(); $i++) 1487 { 1488 $rnd2 = 0; 1489 $add_ci = 0; 1490 $previous_intron_len = 0; 1491 $tRNA = $sp_tRNAs->get($i); 1492 $tRNA_copy->copy($tRNA); 1493 my $padded_seq = $tRNA->upstream().$tRNA->seq().$tRNA->downstream(); 1494 if ($cm_intron->seqname() ne $tRNA->seqname().".trna".&pad_num($tRNA->id(), 6)) 1495 { 1496 if ($tRNA->get_intron_count() == 0) 1497 { 1498 $sp_int_results->write_tRNA($tRNA); 1499 next; 1500 } 1501 else 1502 { 1503 $padded_seq = $tRNA->upstream().$tRNA->mat_seq().$tRNA->downstream(); 1504 $rnd2 = 1; 1505 $add_ci = 1; 1506 } 1507 } 1508 else 1509 { 1510 while (($cm_intron->seqname() ne "") and ($cm_intron->seqname() eq $tRNA->seqname().".trna".&pad_num($tRNA->id(), 6))) 1511 { 1512 my ($ret_value, $duplicate, $clip_seq, $intron_len) = $self->check_intron_validity($global_vars, $tRNA, $cm_intron, $padded_seq, $previous_intron_len); 1513 if ($ret_value) 1514 { 1515 $padded_seq = $clip_seq; 1516 $previous_intron_len = $intron_len; 1517 $rnd2 = 1; 1518 if ($duplicate) 1519 { 1520 $add_ci = 1; 1521 } 1522 } 1523 1524 $arrayCMscanResults->get_next_cmsearch_hit($cm_intron); 1525 } 1526 } 1527 1528 if ($rnd2) 1529 { 1530 my $trna_file = tRNAscanSE::Sequence->new; 1531 $trna_file->open_file($global_constants->get("tmp_trnaseq_file"), "write"); 1532 $trna_file->set_seq_info($tRNA->seqname().".trna".&pad_num($tRNA->id(), 6), $tRNA->tRNAscan_id(), length($padded_seq), $padded_seq); 1533 $trna_file->write_fasta(); 1534 $trna_file->close_file(); 1535 1536 my $rnd2IntronScanResults = tRNAscanSE::ArrayCMscanResults->new; 1537 my $cms_merge_file_rnd2 = $global_constants->get("temp_dir")."/tscan$$"."_intron_rnd2_merge.out"; 1538 my $count = $self->run_cmsearch_intron($global_vars, $tRNA->tRNAscan_id(), $rnd2IntronScanResults, 0, $cms_merge_file_rnd2, 2); 1539 if ($cms_merge_file_rnd2 eq "") 1540 { 1541 $log->error("Fail to run infernal for ".$tRNA->tRNAscan_id()); 1542 return 0; 1543 } 1544 1545 $previous_intron_len = 0; 1546 $rnd2IntronScanResults->open_file($cms_merge_file_rnd2); 1547 $rnd2IntronScanResults->get_next_cmsearch_hit($cm_intron2); 1548 while ($cm_intron2->seqname() ne "") 1549 { 1550 my ($ret_value, $duplicate, $clip_seq, $intron_len) = $self->check_intron_validity($global_vars, $tRNA, $cm_intron2, $padded_seq, $previous_intron_len); 1551 if ($ret_value) 1552 { 1553 $padded_seq = $clip_seq; 1554 $previous_intron_len = $intron_len; 1555 if ($duplicate) 1556 { 1557 $add_ci = 1; 1558 } 1559 } 1560 1561 $rnd2IntronScanResults->get_next_cmsearch_hit($cm_intron2); 1562 } 1563 $rnd2IntronScanResults->close_file(); 1564 } 1565 1566 my @ar_introns = $tRNA->ar_introns(); 1567 my $nci_count = 0; 1568 my $ci_index = -1; 1569 for (my $i = 0; $i < scalar(@ar_introns); $i++) 1570 { 1571 if ($ar_introns[$i]->{type} eq "NCI") 1572 { 1573 $nci_count++; 1574 } 1575 elsif ($ar_introns[$i]->{type} eq "CI") 1576 { 1577 $ci_index = $i; 1578 } 1579 } 1580 1581 if ($nci_count > 0) 1582 { 1583 my $ci_seq = ""; 1584 if ($ci_index > -1) 1585 { 1586 if ($add_ci) 1587 { 1588 $ci_seq = $ar_introns[$ci_index]->{seq}; 1589 } 1590 $tRNA->remove_intron($ci_index); 1591 } 1592 $tRNA->sort_introns(); 1593 1594 if ($add_ci) 1595 { 1596 $self->add_canonical_intron($global_vars, $tRNA, $ci_seq); 1597 } 1598 $self->decode_nci_tRNA_properties($global_vars, $tRNA); 1599 1600 $tRNA->tRNAscan_id($tRNA->seqname().".tRNA".$tRNA->id()."-".$tRNA->isotype().$tRNA->anticodon()); 1601 $tRNA->set_default_scores(); 1602 $sp_int_results->write_tRNA($tRNA); 1603 } 1604 else 1605 { 1606 $sp_int_results->write_tRNA($tRNA_copy); 1607 } 1608 } 1609 1610 $sp_int_results->close_file(); 1611 $arrayCMscanResults->close_file(); 1612} 1613 1614sub add_canonical_intron 1615{ 1616 my $self = shift; 1617 my ($global_vars, $tRNA, $ci_seq) = @_; 1618 my $global_constants = $global_vars->{global_constants}; 1619 my $log = $global_vars->{log_file}; 1620 my $ret_value = 1; 1621 1622 my $mat_seq = ""; 1623 my $precursor_seq = $tRNA->seq(); 1624 my @introns = $tRNA->ar_introns(); 1625 1626 my $index = index(uc($precursor_seq), uc($ci_seq)); 1627 if ($index > -1) 1628 { 1629 $precursor_seq = substr($precursor_seq, 0, $index).lc($ci_seq).substr($precursor_seq, $index + length($ci_seq)); 1630 } 1631 for (my $i = 0; $i < scalar(@introns); $i++) 1632 { 1633 if ($i == 0) 1634 { 1635 $mat_seq = substr($precursor_seq, 0, $introns[$i]->{rel_start} - 1); 1636 } 1637 else 1638 { 1639 $mat_seq .= substr($precursor_seq, $introns[$i-1]->{rel_end}, $introns[$i]->{rel_start} - $introns[$i-1]->{rel_end} - 1); 1640 } 1641 } 1642 $mat_seq .= substr($precursor_seq, $introns[scalar(@introns)-1]->{rel_end}); 1643 1644 if (uc($mat_seq) ne uc($tRNA->mat_seq())) 1645 { 1646 my $trna_file = tRNAscanSE::Sequence->new; 1647 $trna_file->open_file($global_constants->get("tmp_trnaseq_file"), "write"); 1648 $trna_file->set_seq_info($tRNA->seqname().".trna".&pad_num($tRNA->id(), 6), $tRNA->tRNAscan_id(), length($mat_seq), $mat_seq); 1649 $trna_file->write_fasta(); 1650 $trna_file->close_file(); 1651 1652 my $scan_flag = 0; 1653 my $cms_output_file = ""; 1654 my $file_idx = -1; 1655 my $arrayCMscanResults = tRNAscanSE::ArrayCMscanResults->new; 1656 my $cms_merge_file = $global_constants->get("temp_dir")."/tscan$$"."_add_intron_merge.out"; 1657 my $cm_tRNA = tRNAscanSE::tRNA->new; 1658 1659 foreach my $cur_cm (sort keys %{$self->{main_cm_file_path}}) 1660 { 1661 $cms_output_file = $global_constants->get("temp_dir")."/tscan$$"."_add_intron_".$cur_cm.".out"; 1662 if ($self->exec_cmsearch($self->{main_cm_file_path}->{$cur_cm}, $scan_flag, $global_constants->get("tmp_trnaseq_file"), $tRNA->tRNAscan_id(), $cms_output_file, $log, $self->{cm_cutoff})) 1663 { 1664 $file_idx = $arrayCMscanResults->add_file($cms_output_file, $cur_cm); 1665 $arrayCMscanResults->merge_result_file($file_idx, 0); 1666 } 1667 else 1668 { 1669 $ret_value = 0; 1670 } 1671 } 1672 $arrayCMscanResults->write_merge_file($cms_merge_file, 1); 1673 1674 if ($arrayCMscanResults->get_result_count() > 0) 1675 { 1676 $arrayCMscanResults->open_file($cms_merge_file); 1677 $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 1678 1679 if ($cm_tRNA->seqname() ne "") 1680 { 1681 if ((uc(substr($cm_tRNA->seq(), length($cm_tRNA->seq()) - 3)) ne "CCA") and 1682 (substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 4) eq "....")) 1683 { 1684 if ($cm_tRNA->strand() eq "+") 1685 { 1686 $cm_tRNA->end($cm_tRNA->end() - 3); 1687 } 1688 else 1689 { 1690 $cm_tRNA->start($cm_tRNA->start() + 3); 1691 } 1692 $cm_tRNA->seq(substr($cm_tRNA->seq(), 0, length($cm_tRNA->seq()) - 3)); 1693 $cm_tRNA->ss(substr($cm_tRNA->ss(), 0, length($cm_tRNA->ss()) - 3)); 1694 } 1695 1696 $tRNA->ss($cm_tRNA->ss()); 1697 $tRNA->mat_seq($cm_tRNA->seq()); 1698 $tRNA->mat_ss($tRNA->ss()); 1699 $tRNA->update_domain_model("infernal", $cm_tRNA->score(), $cm_tRNA->score(), 0, 0); 1700 $tRNA->model($cm_tRNA->model()); 1701 } 1702 $arrayCMscanResults->close_file(); 1703 } 1704 } 1705 return $ret_value; 1706} 1707 1708# Run Infernal cmsearch for noncanonical introns 1709sub run_cmsearch_intron 1710{ 1711 my $self = shift; 1712 my ($global_vars, $seq_name, $arrayCMscanResults, $merge_range, $cms_merge_file, $round) = @_; 1713 my $global_constants = $global_vars->{global_constants}; 1714 my $log = $global_vars->{log_file}; 1715 1716 my $scan_flag = 5; 1717 my $cms_output_file = ""; 1718 my $file_idx = -1; 1719 1720 # run cmsearch 1721 foreach my $cur_cm (sort keys %{$self->{intron_cm_file_path}}) 1722 { 1723 $cms_output_file = $global_constants->get("temp_dir")."/tscan$$"."_intron_".$cur_cm."_rnd".$round.".out"; 1724 if ($self->exec_cmsearch($self->{intron_cm_file_path}->{$cur_cm}, $scan_flag, $global_constants->get("tmp_trnaseq_file"), $seq_name, $cms_output_file, $log, $self->{BHB_cm_cutoff})) 1725 { 1726 $file_idx = $arrayCMscanResults->add_file($cms_output_file, $cur_cm); 1727 $arrayCMscanResults->merge_result_file($file_idx, $merge_range); 1728 } 1729 else 1730 { 1731 return ("", 0); 1732 } 1733 } 1734 $arrayCMscanResults->write_merge_file($cms_merge_file, 0); 1735 my $count = $arrayCMscanResults->get_result_count(); 1736 1737 return $count; 1738} 1739 1740sub check_intron_validity 1741{ 1742 my $self = shift; 1743 my ($global_vars, $tRNA, $cm_intron, $padded_seq, $previous_intron_len) = @_; 1744 my $global_constants = $global_vars->{global_constants}; 1745 my $log = $global_vars->{log_file}; 1746 my $ret_value = 1; 1747 my $duplicate = 0; 1748 1749 my ($pre_intron, $intron, $post_intron, $pre_intron_seq, $intron_seq, $post_intron_seq); 1750 if ($cm_intron->ss() =~ /^([\<\-\.]{11,})(\-\<[<.]+[_.]{4,}[>.]{9,}\-[.]*\-)([-.>]+)$/) 1751 { 1752 $pre_intron = $1; 1753 $intron = $2; 1754 $post_intron = $3; 1755 my $full_intron_seq = $cm_intron->seq(); 1756 $full_intron_seq =~ s/U/T/g; 1757 $full_intron_seq =~ s/u/t/g; 1758 $cm_intron->seq($full_intron_seq); 1759 $intron_seq = substr($cm_intron->seq(), length($pre_intron), length($intron)); 1760 $pre_intron_seq = substr($cm_intron->seq(), 0, length($pre_intron)); 1761 $post_intron_seq = substr($cm_intron->seq(), length($pre_intron) + length($intron)); 1762 $intron_seq =~ s/-//g; 1763 $pre_intron_seq =~ s/-//g; 1764 $post_intron_seq =~ s/-//g; 1765 1766 $log->debug("Found intron ".$intron_seq." for ".$tRNA->tRNAscan_id()); 1767 } 1768 else 1769 { 1770 $log->warning("Fail to parse intron sequence for ".$cm_intron->seqname()." from ".$cm_intron->seq()."\t".$cm_intron->ss()); 1771 return 0; 1772 } 1773 1774 my $padded_full_seq = $tRNA->upstream().$tRNA->seq().$tRNA->downstream(); 1775 my ($upstream_start, $upstream_end, $downstream_start, $downstream_end); 1776 if ($tRNA->strand() eq "+") 1777 { 1778 $upstream_start = $tRNA->start() - length($tRNA->upstream()); 1779 $downstream_end = $tRNA->end() + length($tRNA->downstream()); 1780 } 1781 else 1782 { 1783 $downstream_start = $tRNA->start() - length($tRNA->downstream()); 1784 $upstream_end = $tRNA->end() + length($tRNA->upstream()); 1785 } 1786 1787 my $clip_seq = substr($padded_seq, 0, $cm_intron->start() - $previous_intron_len + length($pre_intron_seq) - 1) . substr($padded_seq, $cm_intron->end() - $previous_intron_len - length($post_intron_seq)); 1788 my $trna_file = tRNAscanSE::Sequence->new; 1789 $trna_file->open_file($global_constants->get("tmp_trnaseq_file"), "write"); 1790 $trna_file->set_seq_info($tRNA->seqname().".trna".&pad_num($tRNA->id(), 6), $tRNA->tRNAscan_id(), length($clip_seq), $clip_seq); 1791 $trna_file->write_fasta(); 1792 $trna_file->close_file(); 1793 1794 my $scan_flag = 0; 1795 my $cms_output_file = ""; 1796 my $file_idx = -1; 1797 my $arrayCMscanResults = tRNAscanSE::ArrayCMscanResults->new; 1798 my $cms_merge_file = $global_constants->get("temp_dir")."/tscan$$"."_clip_intron_merge.out"; 1799 my $cm_tRNA = tRNAscanSE::tRNA->new; 1800 1801 foreach my $cur_cm (sort keys %{$self->{main_cm_file_path}}) 1802 { 1803 $cms_output_file = $global_constants->get("temp_dir")."/tscan$$"."_clip_intron_".$cur_cm.".out"; 1804 if ($self->exec_cmsearch($self->{main_cm_file_path}->{$cur_cm}, $scan_flag, $global_constants->get("tmp_trnaseq_file"), $tRNA->tRNAscan_id(), $cms_output_file, $log, $self->{cm_cutoff})) 1805 { 1806 $file_idx = $arrayCMscanResults->add_file($cms_output_file, $cur_cm); 1807 $arrayCMscanResults->merge_result_file($file_idx, 0); 1808 } 1809 else 1810 { 1811 $ret_value = 0; 1812 } 1813 } 1814 $arrayCMscanResults->write_merge_file($cms_merge_file, 1); 1815 1816 if ($arrayCMscanResults->get_result_count() > 0 and $ret_value) 1817 { 1818 $arrayCMscanResults->open_file($cms_merge_file); 1819 $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 1820 1821 while ($cm_tRNA->seqname() ne "") 1822 { 1823 $duplicate = 0; 1824 $ret_value = 1; 1825 1826 if ((uc(substr($cm_tRNA->seq(), length($cm_tRNA->seq()) - 3)) ne "CCA") and 1827 (substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 4) eq "....")) 1828 { 1829 if ($cm_tRNA->strand() eq "+") 1830 { 1831 $cm_tRNA->end($cm_tRNA->end() - 3); 1832 } 1833 else 1834 { 1835 $cm_tRNA->start($cm_tRNA->start() + 3); 1836 } 1837 $cm_tRNA->seq(substr($cm_tRNA->seq(), 0, length($cm_tRNA->seq()) - 3)); 1838 $cm_tRNA->ss(substr($cm_tRNA->ss(), 0, length($cm_tRNA->ss()) - 3)); 1839 } 1840 1841 my ($intron_start, $intron_end); 1842 my $upstream_len = $cm_tRNA->start() - 1; 1843 my $downstream_len = length($clip_seq) - $cm_tRNA->end(); 1844 my $seq = substr($padded_full_seq, $cm_tRNA->start() - 1, length($padded_full_seq) - $upstream_len - $downstream_len); 1845 my $pos = index(uc($seq), uc($intron_seq)); 1846 if ($pos == -1) 1847 { 1848 $ret_value = 0; 1849 $log->debug("Intron out of boundary for ".$tRNA->tRNAscan_id()." ".$intron_seq); 1850 } 1851 else 1852 { 1853 $intron_start = $pos + 1; 1854 $intron_end = length($intron_seq) + $pos; 1855 my @ar_introns = $tRNA->ar_introns(); 1856 for (my $i = 0; $i < scalar(@ar_introns); $i++) 1857 { 1858 if ($ar_introns[$i]->{rel_start} == $intron_start and $ar_introns[$i]->{rel_end} == $intron_end) 1859 { 1860 $duplicate = 1; 1861 $log->debug("Duplicate intron detected for ".$tRNA->tRNAscan_id()." ".$intron_seq); 1862 last; 1863 } 1864 elsif ($ar_introns[$i]->{type} eq "CI" and 1865 length($ar_introns[$i]->{seq}) > 40 and 1866 $ar_introns[$i]->{rel_start} < $intron_start and $ar_introns[$i]->{rel_end} > $intron_start and 1867 $ar_introns[$i]->{rel_start} < $intron_end and $ar_introns[$i]->{rel_end} > $intron_end) 1868 { 1869 $ret_value = 0; 1870 $log->debug("Inclusion of intron ".$tRNA->tRNAscan_id()." ".$intron_seq); 1871 last; 1872 } 1873 elsif ($ar_introns[$i]->{type} eq "CI" and 1874# length($ar_introns[$i]->{seq}) > 40 and 1875 $ar_introns[$i]->{rel_start} == $intron_start and 1876 &seg_overlap($ar_introns[$i]->{rel_start}, $ar_introns[$i]->{rel_end}, $intron_start, $intron_end, 0)) 1877 { 1878 $ret_value = 0; 1879 $log->debug("Overlap with intron ".$tRNA->tRNAscan_id()." ".$intron_seq); 1880 last; 1881 } 1882 } 1883 } 1884 1885 my ($start, $end); 1886 if ($tRNA->strand() eq "+") 1887 { 1888 $upstream_end = $upstream_start + $upstream_len - 1; 1889 $downstream_start = $downstream_end - $downstream_len + 1; 1890 $start = $upstream_end + 1; 1891 $end = $downstream_start - 1; 1892 } 1893 else 1894 { 1895 $downstream_end = $downstream_start + $downstream_len - 1; 1896 $upstream_start = $upstream_end - $upstream_len + 1; 1897 $start = $downstream_end + 1; 1898 $end = $upstream_start - 1; 1899 } 1900 1901 my $hit_overlap = &seg_overlap($tRNA->start(), $tRNA->end(), $start, $end, 40); 1902 if ($ret_value and $hit_overlap and $cm_tRNA->score() > $tRNA->score() and length($cm_tRNA->seq()) >= $self->{min_tRNA_no_intron}) 1903 { 1904 $tRNA->upstream(substr($clip_seq, 0, $upstream_len)); 1905 $tRNA->downstream(substr($clip_seq, $cm_tRNA->end())); 1906 $tRNA->seq($seq); 1907 $tRNA->ss($cm_tRNA->ss()); 1908 $tRNA->mat_seq($cm_tRNA->seq()); 1909 $tRNA->mat_ss($tRNA->ss()); 1910 $tRNA->start($start); 1911 $tRNA->end($end); 1912 $tRNA->update_domain_model("infernal", $cm_tRNA->score(), $cm_tRNA->score(), 0, 0); 1913 $tRNA->set_default_scores(); 1914 $tRNA->model($cm_tRNA->model()); 1915 if (!$duplicate) 1916 { 1917 $tRNA->add_rel_intron($intron_start, $intron_end, "NCI", $intron_seq); 1918 } 1919 last; 1920 } 1921 else 1922 { 1923 $ret_value = 0; 1924 } 1925 $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 1926 } 1927 $arrayCMscanResults->close_file(); 1928 } 1929 1930 return ($ret_value, $duplicate, $clip_seq, length($intron_seq)); 1931} 1932 1933sub parse_covels_hit 1934{ 1935 my $self = shift; 1936 my ($opts, $covels_hit, $r_covels_hit_elements, $prescan_tRNA) = @_; 1937 1938 my $covels_hit_found = 0; 1939 1940 if ($covels_hit =~ /^\s*(\S+)\s+(\d+)\s+(\d+).+: (\S+)\s*/o) 1941 { 1942 $r_covels_hit_elements->{score} = $1; 1943 $r_covels_hit_elements->{subseq_start} = $2; 1944 $r_covels_hit_elements->{subseq_end} = $3; 1945 $r_covels_hit_elements->{hit_seqname} = $4; 1946 $covels_hit_found = 1; 1947 } 1948 1949 if ($covels_hit_found) 1950 { 1951 if ($prescan_tRNA->strand() eq "+") 1952 { 1953 $r_covels_hit_elements->{tRNA_start} = $prescan_tRNA->start() + $r_covels_hit_elements->{subseq_start} - 1; 1954 $r_covels_hit_elements->{tRNA_end} = $prescan_tRNA->start() + $r_covels_hit_elements->{subseq_end} - 1; 1955 } 1956 else 1957 { 1958 $r_covels_hit_elements->{tRNA_start} = $prescan_tRNA->start() - $r_covels_hit_elements->{subseq_start} + 1; 1959 $r_covels_hit_elements->{tRNA_end} = $prescan_tRNA->start() - $r_covels_hit_elements->{subseq_end} + 1; 1960 } 1961 1962 return 1; 1963 } 1964 else 1965 { 1966 return 0; 1967 } 1968} 1969 1970# Run covels, return hits in $covels_hit_list array 1971sub run_covels 1972{ 1973 my $self = shift; 1974 my ($global_vars, $r_covels_hit_list, $r_cur_cm_file, $prescan_tRNA) = @_; 1975 my $global_constants = $global_vars->{global_constants}; 1976 my $opts = $global_vars->{options}; 1977 my $stats = $global_vars->{stats}; 1978 my $log = $global_vars->{log_file}; 1979 1980 my $scan_len = 0; 1981 my %covels_hit_elements = (); 1982 1983 $self->set_search_params($opts, \$scan_len, $r_cur_cm_file, $self->{max_cove_tRNA_length}, $prescan_tRNA); 1984 1985 # set covels reporting threshold below 0 (default) if -X param is 1986 # set below 0 by user 1987 my $report_cutoff = &min(0, $self->{cm_cutoff}); 1988 1989 my $complement = "-c"; 1990 if ($opts->tscan_mode() || $opts->eufind_mode() || $opts->infernal_fp()) 1991 { 1992 $complement = ""; 1993 } 1994 1995 # run Covels 1996 my $covels_cmd = $self->covels_bin()." $complement -w$scan_len -t$report_cutoff $$r_cur_cm_file ".$global_constants->get("tmp_trnaseq_file"); 1997 $log->broadcast($covels_cmd); 1998 my $covels_output = `$covels_cmd`; 1999 2000 if ($? != 0) 2001 { 2002 $log->error("Covels-SE cannot be completed successfully for ".$prescan_tRNA->seqname().". (Exit Code: $?)"); 2003 print "Exit first loop at 1\n"; 2004 return 0; 2005 } 2006 2007 my ($junk, $allhits) = split(/----------\n\n/, $covels_output); 2008 @$r_covels_hit_list = split(/\n/, $allhits); 2009 2010 # count no. of hits over cutoff 2011 my $total_hits = 0; 2012 foreach my $covels_hit (@$r_covels_hit_list) 2013 { 2014 %covels_hit_elements = (); 2015 if (($self->parse_covels_hit($opts, $covels_hit, \%covels_hit_elements, $prescan_tRNA)) && 2016 ($covels_hit_elements{score} >= $self->{cm_cutoff})) 2017 { 2018 $total_hits++; 2019 } 2020 } 2021 2022 # if no tRNAs detected when using a selenocysteine cove model, 2023 # try main model and run again before giving up 2024 if (($total_hits == 0) && 2025 (($$r_cur_cm_file eq $self->{Pselc_cm_file_path}) || ($$r_cur_cm_file eq $self->{Eselc_cm_file_path}))) 2026 { 2027 $$r_cur_cm_file = $self->{main_cm_file_path}->{Domain}; 2028 2029 # re-run Covels with main model 2030 $covels_cmd = $self->covels_bin()." -w$scan_len -t$report_cutoff $$r_cur_cm_file ".$global_constants->get("tmp_trnaseq_file"); 2031 $covels_output = `$covels_cmd`; 2032 if ($? != 0) 2033 { 2034 $log->error("Covels-SE cannot be completed successfully for ".$prescan_tRNA->seqname().". (Exit Code: $?)"); 2035 print "Exit first loop at 2\n"; 2036 return 0; 2037 } 2038 2039 ($junk,$allhits) = split(/----------\n\n/, $covels_output); 2040 @$r_covels_hit_list = split(/\n/, $allhits); 2041 } 2042 2043 # Go thru hit list, save info for tRNA hits with sub-cutoff scores 2044 my $ct = 0; 2045 my $over_cutoff = 0; 2046 my $trna_desc = ""; 2047 2048 foreach my $covels_hit (@$r_covels_hit_list) 2049 { 2050 %covels_hit_elements = (); 2051 if ($self->parse_covels_hit($opts, $covels_hit, \%covels_hit_elements, $prescan_tRNA)) 2052 { 2053 $ct++; 2054 if ($covels_hit_elements{score} >= $self->{cm_cutoff}) 2055 { 2056 $over_cutoff++; 2057 } 2058 else 2059 { 2060 $log->broadcast("Low covels score for ".$prescan_tRNA->tRNAscan_id().".$ct: $covels_hit_elements{score}"); 2061 $trna_desc .= "(Cove Hit#$ct: $covels_hit_elements{tRNA_start}-$covels_hit_elements{tRNA_end},". 2062 " Sc: $covels_hit_elements{score}, Len: ".(abs($covels_hit_elements{tRNA_start} - $covels_hit_elements{tRNA_end}) + 1).") "; 2063 } 2064 } 2065 } 2066 2067 # report if no scores over 0 bit reporting threshold 2068 if ($over_cutoff == 0) 2069 { 2070 if ((!$opts->results_to_stdout()) && 2071 ($opts->eufind_mode() || $opts->tscan_mode() || $opts->use_prev_ts_run())) 2072 { 2073 $log->broadcast("Covels score(s) below cutoff for ".$prescan_tRNA->tRNAscan_id().". Skipping..."); 2074 } 2075 if ($opts->save_falsepos()) 2076 { 2077 my $fulltrnaDesc = "(Fp Hit: ".$prescan_tRNA->start()."-".$prescan_tRNA->end().", ". 2078 (abs($prescan_tRNA->start() - $prescan_tRNA->end()) + 1)." bp, Src: ".$prescan_tRNA->hit_source().") ".$trna_desc; 2079 2080 $stats->increment_fpos_base_ct(length($prescan_tRNA->seq())); 2081 &write_tRNA($opts->falsepos_file(), $prescan_tRNA->tRNAscan_id(), $fulltrnaDesc, $prescan_tRNA->seq(), 0); 2082 } 2083 } 2084 2085 return 1; 2086} 2087 2088sub run_coves 2089{ 2090 my $self = shift; 2091 my ($log, $tmp_trnaseq_file, $seq_name, $cm_file) = @_; 2092 2093 my ($covseq, $covss, $coves_output, $junk, @coves_lines, $sec_struct, $coves_score); 2094 2095 my $coves_cmd = $self->{coves_bin}." -s $cm_file $tmp_trnaseq_file"; 2096 $log->broadcast($coves_cmd); 2097 $coves_output = `$coves_cmd`; 2098 if ($? != 0) 2099 { 2100 $log->error("Coves-SE cannot be completed successfully for $seq_name. (Exit Code: $?)"); 2101 return ("Error", "", -1); 2102 } 2103 2104 ($junk, $sec_struct) = split(/----------\n\n/, $coves_output); 2105 @coves_lines = split(/\n/,$sec_struct); 2106 $covseq = ''; 2107 $covss = ''; 2108 $coves_score = -1000; 2109 $seq_name =~ s/(\W)/\\$1/g; 2110 2111 foreach (@coves_lines) 2112 { 2113 if (/^\s+$seq_name\s([a-zA-Z\-]{1,60})\s*/) 2114 { 2115 $covseq .= $1; 2116 } 2117 if (/^\s+$seq_name\s([\.\<\>\ ]{1,60})/) 2118 { 2119 $covss .= $1; 2120 } 2121 if (/^\s*(\S+)\sbits\s:\s$seq_name/) 2122 { 2123 $coves_score = $1; 2124 } 2125 } 2126 $covss =~ s/\s//g; # take spaces out of alignment 2127 $covseq =~ s/-//g; # take '-' gaps out of seq 2128 2129 if (($covseq eq '') || ($covss eq '')) 2130 { 2131 print STDERR "Could not complete coves successfully for $seq_name ", 2132 "because unable to parse coves secondary structure string. ", 2133 "Skipping tRNA anticodon & type prediction\n"; 2134 return ("Error", "", -1); 2135 } 2136 2137 return ($covseq, $covss, $coves_score); 2138} 2139 2140sub analyze_with_cove 2141{ 2142 my $self = shift; 2143 my ($global_vars, $prescan_tRNA, $r_curseq_trnact) = @_; 2144 my $global_constants = $global_vars->{global_constants}; 2145 my $opts = $global_vars->{options}; 2146 my $stats = $global_vars->{stats}; 2147 my $log = $global_vars->{log_file}; 2148 2149 my (@covels_hit_list, $cur_cm_file, $cove_confirmed_ct); 2150 my ($covseq, $covss, $coves_score); 2151 my %covels_hit_elements = (); 2152 my $covels_tRNA = undef; 2153 my $cov_hit = {}; 2154 my $rescore = 0; 2155 2156 $cove_confirmed_ct = 0; 2157 if (!$self->run_covels($global_vars, \@covels_hit_list, \$cur_cm_file, $prescan_tRNA)) 2158 { 2159 return 0; 2160 } 2161 2162 # Loop to parse covels tRNA hit(s) and run Coves on each tRNA 2163 foreach my $covels_hit (@covels_hit_list) 2164 { 2165 $rescore = 0; 2166 %covels_hit_elements = (); 2167 if ((!$self->parse_covels_hit($opts, $covels_hit, \%covels_hit_elements, $prescan_tRNA)) || 2168 ($covels_hit_elements{score} < $self->{cm_cutoff})) 2169 { 2170 next; 2171 } 2172 $covels_tRNA = tRNAscanSE::tRNA->new; 2173 $$r_curseq_trnact++; 2174 2175 $covels_tRNA->id($$r_curseq_trnact); 2176 $covels_tRNA->set_covels_hit(\%covels_hit_elements); 2177 $covels_tRNA->upstream($prescan_tRNA->upstream()); 2178 $covels_tRNA->downstream($prescan_tRNA->downstream()); 2179 if (($covels_hit_elements{subseq_start} == 1) && ($covels_hit_elements{subseq_end} == $prescan_tRNA->len())) 2180 { 2181 $covels_hit_elements{tRNA_len} = $prescan_tRNA->len(); 2182 } 2183 else 2184 { 2185 # get correct subseq for coves & save to file 2186 if ($covels_hit_elements{subseq_start} < $covels_hit_elements{subseq_end}) 2187 { 2188 $covels_hit_elements{tRNA_len} = $covels_hit_elements{subseq_end} - $covels_hit_elements{subseq_start} + 1; 2189 &write_tRNA($global_constants->get("tmp_trnaseq_file"), $covels_tRNA->seqname(), " ", 2190 substr($prescan_tRNA->seq(), $covels_hit_elements{subseq_start} - 1, $covels_hit_elements{tRNA_len}), 1); 2191 if ($covels_hit_elements{subseq_start} > 1) 2192 { 2193 $covels_tRNA->upstream($covels_tRNA->upstream() . substr($prescan_tRNA->seq(), 0, $covels_hit_elements{subseq_start} - 1)); 2194 } 2195 if ($covels_hit_elements{subseq_end} < $prescan_tRNA->len()) 2196 { 2197 $covels_tRNA->downstream(substr($prescan_tRNA->seq(), $covels_hit_elements{subseq_end}) . $covels_tRNA->downstream()); 2198 } 2199 } 2200 else 2201 { 2202 $covels_hit_elements{tRNA_len} = $covels_hit_elements{subseq_start} - $covels_hit_elements{subseq_end} + 1; 2203 &write_tRNA($global_constants->get("tmp_trnaseq_file"), $covels_tRNA->seqname(), " ", 2204 &rev_comp_seq(substr($prescan_tRNA->seq(), $covels_hit_elements{subseq_end} - 1, $covels_hit_elements{tRNA_len})), 1); 2205 if ($covels_hit_elements{subseq_end} > 1) 2206 { 2207 $covels_tRNA->upstream($covels_tRNA->upstream() . &rev_comp_seq(substr($prescan_tRNA->seq(), $covels_hit_elements{subseq_start} - 1, $global_constants->get("upstream_len")))); 2208 } 2209 if ($covels_hit_elements{subseq_start} < $prescan_tRNA->len()) 2210 { 2211 $covels_tRNA->downstream(&rev_comp_seq(substr($prescan_tRNA->seq(), $covels_hit_elements{subseq_end} - $global_constants->get("downstream_len"), $global_constants->get("downstream_len")) . $covels_tRNA->downstream())); 2212 } 2213 } 2214 } 2215 $stats->increment_coves_base_ct($covels_hit_elements{tRNA_len}); 2216 2217 ($covseq, $covss, $coves_score) = $self->run_coves($log, $global_constants->get("tmp_trnaseq_file"), $prescan_tRNA->seqname(), $cur_cm_file); 2218 $covels_tRNA->seq($covseq); 2219 $covels_tRNA->ss($covss); 2220 $covels_tRNA->update_domain_model("cove", $coves_score, $coves_score, 0, 0); 2221 2222 if ((uc(substr($covels_tRNA->seq(), length($covels_tRNA->seq()) - 3)) ne "CCA") && 2223 (substr($covels_tRNA->ss(), length($covels_tRNA->ss()) - 4) eq "....") && 2224 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cove_cm", "eukaryota")) && 2225 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cove_cm", "general"))) 2226 { 2227 $covels_hit_elements{subseq_end} -= 3; 2228 if ($covels_tRNA->strand() eq "+") 2229 { 2230 $covels_tRNA->end($covels_tRNA->end() - 3); 2231 } 2232 else 2233 { 2234 $covels_tRNA->start($covels_tRNA->start() + 3); 2235 } 2236 $covels_tRNA->seq(substr($covels_tRNA->seq(), 0, length($covels_tRNA->seq()) - 3)); 2237 $covels_tRNA->ss(substr($covels_tRNA->ss(), 0, length($covels_tRNA->ss()) - 3)); 2238 $rescore = 1; 2239 } 2240 elsif ((uc(substr($covels_tRNA->seq(), length($covels_tRNA->seq()) - 3)) eq "CCA") && 2241 (((substr($covels_tRNA->ss(), length($covels_tRNA->ss()) - 6) eq "<.....") && 2242 (substr($covels_tRNA->seq(), length($covels_tRNA->seq()) - 5, 2) =~ /[acgtn][ACGTN]/)) || 2243 ((substr($covels_tRNA->ss(), length($covels_tRNA->ss()) - 7) eq "<......") && 2244 (substr($covels_tRNA->seq(), length($covels_tRNA->seq()) - 6, 3) =~ /[ACGTN][acgtn][ACGTN]/)) || 2245 ((substr($covels_tRNA->ss(), length($covels_tRNA->ss()) - 7) eq "<......") && 2246 (substr($covels_tRNA->seq(), length($covels_tRNA->seq()) - 6, 3) =~ /[acgtn]{2}[ACGTN]/))) && 2247 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cove_cm", "eukaryota")) && 2248 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cove_cm", "general"))) 2249 { 2250 my $trim_len = 4; 2251 if (substr($covels_tRNA->ss(), length($covels_tRNA->ss()) - 7) eq "<......" && substr($covels_tRNA->seq(), length($covels_tRNA->seq()) - 6, 3) =~ /[acgtn]{2}[ACGTN]/) 2252 { 2253 $trim_len = 5; 2254 } 2255 2256 if ($covels_tRNA->strand() eq "+") 2257 { 2258 $covels_tRNA->end($covels_tRNA->end() - $trim_len); 2259 } 2260 else 2261 { 2262 $covels_tRNA->start($covels_tRNA->start() + $trim_len); 2263 } 2264 $covels_tRNA->seq(substr($covels_tRNA->seq(), 0, length($covels_tRNA->seq()) - $trim_len)); 2265 $covels_tRNA->seq(substr($covels_tRNA->seq(), 0, length($covels_tRNA->seq()) - 1).uc(substr($covels_tRNA->seq(), length($covels_tRNA->seq()) - 1, 1))); 2266 $covels_tRNA->ss(substr($covels_tRNA->ss(), 0, length($covels_tRNA->ss()) - $trim_len)); 2267 $rescore = 1; 2268 } 2269 2270 if ($rescore) 2271 { 2272 my $trna_file = tRNAscanSE::Sequence->new; 2273 $trna_file->open_file($global_constants->get("tmp_trnaseq_file"), "write"); 2274 $trna_file->set_seq_info($prescan_tRNA->seqname(), $prescan_tRNA->seqname(), length($covels_tRNA->seq()), $covels_tRNA->seq()); 2275 $trna_file->write_fasta(); 2276 $trna_file->close_file(); 2277 ($covseq, $covss, $coves_score) = $self->run_coves($log, $global_constants->get("tmp_trnaseq_file"), $prescan_tRNA->seqname(), $cur_cm_file); 2278 $covels_tRNA->update_domain_model("cove", $coves_score, $coves_score, 0, 0); 2279 } 2280 2281 # look for intron 2282 $self->decode_tRNA_properties($global_vars, $covels_tRNA, $prescan_tRNA, $cur_cm_file); 2283 $covels_tRNA->set_mature_tRNA(); 2284 2285 $covels_tRNA->tRNAscan_id($covels_tRNA->seqname().".tRNA".$$r_curseq_trnact."-".$covels_tRNA->isotype().$covels_tRNA->anticodon()); 2286 $covels_tRNA->src_seqlen($prescan_tRNA->src_seqlen()); 2287 $covels_tRNA->src_seqid($covels_hit_elements{hit_seqname}); 2288 $covels_tRNA->hit_source($prescan_tRNA->hit_source()); 2289 $covels_tRNA->model($cur_cm_file); 2290 $covels_tRNA->ordered_seqname($prescan_tRNA->src_seqid()); 2291 $covels_tRNA->set_default_scores(); 2292 2293 if (!$self->{CM_check_for_introns}) 2294 { 2295 $cove_confirmed_ct++; 2296 } 2297 $global_vars->{sp_int_results}->write_tRNA($covels_tRNA); 2298 } # while more covels_hits 2299 2300 return $cove_confirmed_ct; 2301} 2302 2303# Format command and run Infernal cmscan 2304sub exec_cmscan 2305{ 2306 my $self = shift; 2307 my ($cm_db, $scan_flag, $tmp_trnaseq_file, $seq_name, $cms_output_file, $cms_tab_file, $log) = @_; 2308 2309 my $cm_options = "-g --nohmm --toponly --notrunc"; 2310 if ($scan_flag == 1) 2311 { 2312 $cm_options = "-g --mid --notrunc"; 2313 } 2314 elsif ($scan_flag == 2) 2315 { 2316 $cm_options = "-g --mid --toponly --notrunc"; 2317 } 2318 elsif ($scan_flag == 3) 2319 { 2320 $cm_options = "-g --mid --notrunc -T ".$self->{infernal_fp_cutoff}; 2321 } 2322 elsif ($scan_flag == 4) 2323 { 2324 $cm_options = "-g --nohmm --notrunc"; 2325 } 2326 if ($self->{infernal_thread} != -1) 2327 { 2328 $cm_options .= " --cpu ".$self->{infernal_thread}; 2329 } 2330 2331 my $cms_cmd = "$self->{cmscan_bin} $cm_options --fmt 2 --tblout $cms_tab_file -o $cms_output_file $cm_db $tmp_trnaseq_file"; 2332 $log->broadcast($cms_cmd); 2333 system($cms_cmd); 2334 2335 if ($? != 0) 2336 { 2337 $log->error("Infernal cmscan cannot be completed successfully for ".$seq_name.". $cms_cmd (Exit Code: $? $!)"); 2338 return 0; 2339 } 2340 return 1; 2341} 2342 2343# Format command and run Infernal cmsearch 2344sub exec_cmsearch 2345{ 2346 my $self = shift; 2347 my ($cm_file, $scan_flag, $tmp_trnaseq_file, $seq_name, $cms_output_file, $log, $score_cutoff) = @_; 2348 2349 my $cm_options = "-g --nohmm --toponly --notrunc"; 2350 if ($scan_flag == 1) 2351 { 2352 $cm_options = "-g --mid --notrunc"; 2353 } 2354 elsif ($scan_flag == 2) 2355 { 2356 $cm_options = "-g --mid --toponly --notrunc"; 2357 } 2358 elsif ($scan_flag == 3) 2359 { 2360 $cm_options = "-g --mid --notrunc -T ".$self->{infernal_fp_cutoff}; 2361 } 2362 elsif ($scan_flag == 4) 2363 { 2364 $cm_options = "-g --nohmm --notrunc"; 2365 } 2366 elsif ($scan_flag == 5) 2367 { 2368 $cm_options = "-g --max --toponly --notrunc --notextw -T ".$self->{BHB_cm_cutoff}; 2369 } 2370 elsif ($scan_flag == 6) 2371 { 2372 $cm_options = "-g --toponly --notextw"; 2373 } 2374 elsif ($scan_flag == 7) 2375 { 2376 $cm_options = "-g --max --toponly --notrunc --notextw -T 0"; 2377 } 2378 2379 if ($scan_flag != 3 and $scan_flag != 5 and $scan_flag != 7) 2380 { 2381 if ($score_cutoff <= 10) 2382 { 2383 $cm_options .= " -T ".$score_cutoff; 2384 } 2385 } 2386 if ($self->{infernal_thread} != -1) 2387 { 2388 $cm_options .= " --cpu ".$self->{infernal_thread}; 2389 } 2390 2391 my $cms_cmd = "$self->{cmsearch_bin} $cm_options $cm_file $tmp_trnaseq_file > $cms_output_file"; 2392 $log->broadcast($cms_cmd); 2393 system($cms_cmd); 2394 2395 if ($? != 0) 2396 { 2397 $log->error("Infernal cmsearch cannot be completed successfully for ".$seq_name.". $cms_cmd (Exit Code: $? $!)"); 2398 return 0; 2399 } 2400 2401 return 1; 2402} 2403 2404# Run Infernal cmsearch, return results in $r_cms_hit_list array reference 2405sub run_cmsearch 2406{ 2407 my $self = shift; 2408 my ($global_vars, $seq_name, $arrayCMscanResults, $merge_range) = @_; 2409 my $global_constants = $global_vars->{global_constants}; 2410 my $opts = $global_vars->{options}; 2411 my $stats = $global_vars->{stats}; 2412 my $log = $global_vars->{log_file}; 2413 2414 my $score_cutoff = -1; 2415 my $scan_flag = -1; 2416 my $cms_output_file = ""; 2417 my $file_idx = -1; 2418 my $cms_merge_file = $global_constants->get("temp_dir")."/tscan$$"."_cm_merge.out"; 2419 2420 # run cmsearch 2421 if ($opts->tscan_mode() || $opts->eufind_mode() || $opts->infernal_fp()) 2422 { 2423 $scan_flag = 0; 2424 if ($opts->hmm_filter()) 2425 { 2426 $scan_flag = 2; 2427 } 2428 } 2429 else 2430 { 2431 $scan_flag = 4; 2432 if ($opts->hmm_filter()) 2433 { 2434 $scan_flag = 1; 2435 } 2436 } 2437 2438# if ($opts->mito_mode()) 2439# { 2440# $score_cutoff = $self->{organelle_cm_cutoff}; 2441# } 2442# else 2443# { 2444 $score_cutoff = $self->{cm_cutoff}; 2445# } 2446 2447 foreach my $cur_cm (sort keys %{$self->{main_cm_file_path}}) 2448 { 2449 $cms_output_file = $global_constants->get("temp_dir")."/tscan$$"."_cm_".$cur_cm.".out"; 2450 if ($self->exec_cmsearch($self->{main_cm_file_path}->{$cur_cm}, $scan_flag, $global_constants->get("tmp_trnaseq_file"), $seq_name, $cms_output_file, $log, $score_cutoff)) 2451 { 2452 $file_idx = $arrayCMscanResults->add_file($cms_output_file, $cur_cm); 2453 $arrayCMscanResults->merge_result_file($file_idx, $merge_range); 2454 } 2455 else 2456 { 2457 return ""; 2458 } 2459 } 2460 $arrayCMscanResults->write_merge_file($cms_merge_file, 1); 2461 my $count = $arrayCMscanResults->get_result_count(); 2462 2463 return ($cms_merge_file, $count); 2464} 2465 2466# Run Infernal for scanning truncation in predicited tRNA 2467sub truncated_tRNA_search 2468{ 2469 my $self = shift; 2470 my ($global_vars, $seq_name) = @_; 2471 my $global_constants = $global_vars->{global_constants}; 2472 my $opts = $global_vars->{options}; 2473 my $log = $global_vars->{log_file}; 2474 my $arrayCMscanResults = tRNAscanSE::ArrayCMscanResults->new; 2475 my $sp_int_results = $global_vars->{sp_int_results}; 2476 my $tmp_trnaseq_file = $global_constants->get("tmp_trnaseq_file"); 2477 my $tRNA = tRNAscanSE::tRNA->new; 2478 my $cm_tRNA = tRNAscanSE::tRNA->new; 2479 2480 my %cms_output_file = (); 2481 my $scan_flag = 6; 2482 my $file_idx = -1; 2483 my $cms_merge_file = $global_constants->get("temp_dir")."/tscan$$"."_trunc_cm_merge.out"; 2484 2485 my $sp_trunc_int_results = tRNAscanSE::IntResultFile->new; 2486 $sp_trunc_int_results->file_name($opts->truncated_int_result_file()); 2487 2488 foreach my $cur_cm (sort keys %{$self->{main_cm_file_path}}) 2489 { 2490 my $count = &write_tRNAs($tmp_trnaseq_file, $sp_int_results, 1, 1, $cur_cm); 2491 2492 if ($count > 0) 2493 { 2494 $cms_output_file{$cur_cm} = $global_constants->get("temp_dir")."/tscan$$"."_trunc_cm_".$cur_cm.".out"; 2495 if ($self->exec_cmsearch($self->{main_cm_file_path}->{$cur_cm}, $scan_flag, $global_constants->get("tmp_trnaseq_file"), $seq_name, $cms_output_file{$cur_cm}, $log, $self->{cm_cutoff})) 2496 { 2497 $file_idx = $arrayCMscanResults->add_file($cms_output_file{$cur_cm}, $cur_cm); 2498 $arrayCMscanResults->merge_result_file($file_idx, 0); 2499 } 2500 else 2501 { 2502 return ""; 2503 } 2504 } 2505 } 2506 $arrayCMscanResults->write_merge_file($cms_merge_file, 0); 2507 2508 my @sp_indexes = $sp_int_results->get_indexes(); 2509 if ($sp_int_results->open_file("read") and $sp_trunc_int_results->open_file("write")) 2510 { 2511 $arrayCMscanResults->open_file($cms_merge_file); 2512 my $cm_model = $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 2513 2514 for (my $i = 0; $i < scalar(@sp_indexes); $i++) 2515 { 2516 $sp_int_results->get_tRNA($sp_indexes[$i]->[0], $tRNA); 2517 if (($cm_tRNA->seqname() ne "") and ($cm_tRNA->seqname() eq $tRNA->seqname().".t".&pad_num($tRNA->id(), 6))) 2518 { 2519 my $trunc_label = $self->check_truncation($tRNA, $cm_tRNA, $cm_model, $global_constants); 2520 $tRNA->trunc($trunc_label); 2521 $cm_model = $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 2522 } 2523 $sp_trunc_int_results->write_tRNA($tRNA); 2524 } 2525 $sp_int_results->close_file(); 2526 $sp_trunc_int_results->close_file(); 2527 $sp_int_results->clear_index(); 2528 $global_vars->{sp_int_results} = $sp_trunc_int_results; 2529 } 2530} 2531 2532sub check_truncation 2533{ 2534 my $self = shift; 2535 my ($tRNA, $trunc_tRNA, $cm_model, $global_constants) = @_; 2536 2537 my $label = ""; 2538 if ($trunc_tRNA->trunc() eq "" or $trunc_tRNA->trunc() eq "no") 2539 {} 2540 else 2541 { 2542 if (index($trunc_tRNA->trunc(),"5'") > -1) 2543 { 2544 if ($cm_model =~/^\<\[(\d+)\]\*/) 2545 { 2546 $label = "trunc_start:".$1; 2547 } 2548 } 2549 if (index($trunc_tRNA->trunc(), "3'") > -1) 2550 { 2551 if ($cm_model =~/\*\[(\d+)\]\>$/) 2552 { 2553 my $diff = $1; 2554 if ($diff <= 3 and ((uc(substr($tRNA->mat_seq(), length($tRNA->mat_seq()) - 3)) ne "CCA" and index($trunc_tRNA->trunc(),"5'") == -1) or 2555 index($trunc_tRNA->trunc(),"5'") > -1) and 2556 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cm", "eukaryota")) and 2557 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cm", "general"))) 2558 {} 2559 else 2560 { 2561 if ($label ne "") 2562 { 2563 $label .= ","; 2564 } 2565 $label .= "trunc_end:".$diff; 2566 } 2567 } 2568 } 2569 } 2570 2571 return $label; 2572} 2573 2574# Run Infernal with isotype specific CMs for predicited tRNA 2575sub isotype_cmsearch 2576{ 2577 my $self = shift; 2578 my ($global_vars) = @_; 2579 my $global_constants = $global_vars->{global_constants}; 2580 my $opts = $global_vars->{options}; 2581 my $sp_int_results = $global_vars->{sp_int_results}; 2582 my $iso_int_results = $global_vars->{iso_int_results}; 2583 my $tmp_trnaseq_file = $global_constants->get("tmp_trnaseq_file"); 2584 my $tRNA = tRNAscanSE::tRNA->new; 2585 my $seqname = ""; 2586 2587 &write_tRNAs($tmp_trnaseq_file, $sp_int_results, 1, 1, ""); 2588 2589 my @sp_indexes = $sp_int_results->get_indexes(); 2590 if ($sp_int_results->open_file("read") && $iso_int_results->open_file("write")) 2591 { 2592 $iso_int_results->write_line(""); 2593 for (my $i = 0; $i < scalar(@sp_indexes); $i++) 2594 { 2595 $sp_int_results->get_tRNA($sp_indexes[$i]->[0], $tRNA); 2596 $seqname = $tRNA->seqname(); 2597 $iso_int_results->write_line($tRNA->seqname().".t".&pad_num($tRNA->id(), 6)); 2598 } 2599 $iso_int_results->close_file(); 2600 $sp_int_results->close_file(); 2601 } 2602 2603 # run cmscan 2604 $self->scan_isotype_cm($global_vars, $self->{isotype_cm_db_file_path}, "cyto", $seqname); 2605 if ($opts->euk_mode() and $opts->mito_model() ne "") 2606 { 2607 $self->scan_isotype_cm($global_vars, $self->{mito_isotype_cm_db_file_path}, "mito", $seqname); 2608 } 2609} 2610 2611sub scan_isotype_cm 2612{ 2613 my $self = shift; 2614 my ($global_vars, $cm_db, $type, $seq_name) = @_; 2615 my $global_constants = $global_vars->{global_constants}; 2616 my $opts = $global_vars->{options}; 2617 my $log = $global_vars->{log_file}; 2618 my $iso_int_results = $global_vars->{iso_int_results}; 2619 my $tmp_trnaseq_file = $global_constants->get("tmp_trnaseq_file"); 2620 my $cmscanResults = undef; 2621 my $tRNA = tRNAscanSE::tRNA->new; 2622 2623 my $scan_flag = 2; 2624 2625 my $iso_result_file = $global_constants->get("temp_dir")."/tscan$$"."_iso_temp.out"; 2626 my $iso_results_out = tRNAscanSE::MultiResultFile->new; 2627 my ($cur_iso_cm, $cur_iso_cm_file); 2628 my $cms_output_file = ""; 2629 my $cms_tab_file = ""; 2630 my $line = ""; 2631 my $tRNAid = ""; 2632 my $iso_index_ct = -1; 2633 my %found_models = (); 2634 2635 my $models = $self->get_models_from_db($cm_db); 2636 2637 $iso_results_out->file_name($iso_result_file); 2638 2639 $cms_output_file = $global_constants->get("temp_dir")."/tscan$$"."_iso_cm.out"; 2640 $cms_tab_file = $global_constants->get("temp_dir")."/tscan$$"."_iso_cm.tab"; 2641 if ($self->exec_cmscan($cm_db, $scan_flag, $global_constants->get("tmp_trnaseq_file"), $seq_name, $cms_output_file, $cms_tab_file, $log)) 2642 { 2643 $cmscanResults = tRNAscanSE::CMscanResultFile->new($cms_tab_file, "multi"); 2644 } 2645 else 2646 { 2647 return 0; 2648 } 2649 2650 if ($iso_int_results->open_file("read") && $iso_results_out->open_file("write")) 2651 { 2652 # header line 2653 $line = $iso_int_results->get_line(); 2654 foreach $cur_iso_cm (sort @$models) 2655 { 2656 my $model = $cur_iso_cm; 2657 if ($cur_iso_cm =~ /^arch-(\S+)/ || $cur_iso_cm =~ /^euk-(\S+)/ || $cur_iso_cm =~ /^bact-(\S+)/) 2658 { 2659 $model = $1; 2660 } 2661 2662 if ($type eq "mito") 2663 { 2664 $line = $line."\tmito_".$model; 2665 } 2666 else 2667 { 2668 $line = $line."\t".$model; 2669 } 2670 } 2671 $iso_results_out->write_line($line); 2672 2673 #content 2674 if ($cmscanResults->open_file()) 2675 { 2676 my $hits = $cmscanResults->get_next_tab_seq_hits(); 2677 ($tRNAid, $line) = $iso_int_results->get_next_record(); 2678 while ($tRNAid ne "") 2679 { 2680 %found_models = (); 2681 if (scalar(@$hits) > 0) 2682 { 2683 $iso_index_ct = 0; 2684 if ($tRNAid eq $hits->[0]->[1]) 2685 { 2686 foreach $cur_iso_cm (sort @$models) 2687 { 2688 if ($iso_index_ct < scalar(@$hits)) 2689 { 2690 if ($cur_iso_cm eq $hits->[$iso_index_ct]->[0]) 2691 { 2692 if (!defined $found_models{$cur_iso_cm}) 2693 { 2694 $line .= "\t".$hits->[$iso_index_ct]->[5]; 2695 $found_models{$cur_iso_cm} = 1; 2696 } 2697 $iso_index_ct++; 2698 } 2699 elsif ($cur_iso_cm lt $hits->[$iso_index_ct]->[0]) 2700 { 2701 $line .= "\t"; 2702 } 2703 elsif ($cur_iso_cm gt $hits->[$iso_index_ct]->[0]) 2704 { 2705 $iso_index_ct++; 2706 } 2707 } 2708 else 2709 { 2710 $line .= "\t"; 2711 } 2712 } 2713 $iso_results_out->write_line($line); 2714 2715 $hits = $cmscanResults->get_next_tab_seq_hits(); 2716 ($tRNAid, $line) = $iso_int_results->get_next_record(); 2717 } 2718 elsif ($tRNAid lt $hits->[0]->[1]) 2719 { 2720 for (my $i = 0; $i < scalar(@$models); $i++) 2721 { 2722 $line .= "\t"; 2723 } 2724 $iso_results_out->write_line($line); 2725 ($tRNAid, $line) = $iso_int_results->get_next_record(); 2726 } 2727 elsif ($tRNAid gt $hits->[0]->[1]) 2728 { 2729 $hits = $cmscanResults->get_next_tab_seq_hits(); 2730 } 2731 } 2732 else 2733 { 2734 for (my $i = 0; $i < scalar(@$models); $i++) 2735 { 2736 $line .= "\t"; 2737 } 2738 $iso_results_out->write_line($line); 2739 ($tRNAid, $line) = $iso_int_results->get_next_record(); 2740 } 2741 } 2742 $cmscanResults->close_file(); 2743 } 2744 $iso_results_out->close_file(); 2745 $iso_int_results->close_file(); 2746 copy($iso_results_out->file_name(), $iso_int_results->file_name()); 2747 } 2748} 2749 2750sub get_models_from_db 2751{ 2752 my $self = shift; 2753 my ($cm_db) = @_; 2754 my $line = ""; 2755 my @models = (); 2756 my $start = 0; 2757 2758 open(FILE_IN, "$cm_db") or die "Fail to open $cm_db\n"; 2759 while ($line = <FILE_IN>) 2760 { 2761 chomp($line); 2762 if ($line =~ /^INFERNAL/) 2763 { 2764 $start = 1; 2765 } 2766 elsif ($start and $line =~ /^NAME\s+(\S+)/) 2767 { 2768 push(@models, $1); 2769 $start = 0; 2770 } 2771 } 2772 close(FILE_IN); 2773 2774 my @sorted_models = sort @models; 2775 return \@sorted_models; 2776} 2777 2778sub rescore_tRNA_cove 2779{ 2780 my $self = shift; 2781 my ($global_vars, $sp_int_results) = @_; 2782 my $global_constants = $global_vars->{global_constants}; 2783 my $opts = $global_vars->{options}; 2784 my $log = $global_vars->{log_file}; 2785 my $tmp_trnaseq_file = $global_constants->get("tmp_trnaseq_file"); 2786 2787# $sp_int_results->sort_records("tRNAscan_id"); 2788 2789 &write_tRNAs($tmp_trnaseq_file, $sp_int_results, 1, 1, ""); 2790 2791 # run cove 2792 my $cove_output_file = $global_constants->get("temp_dir")."/tscan$$"."_cove.out"; 2793 my $coves_cmd = $self->{coves_bin}." -s ".$self->{cove_cm_file_path}." ".$tmp_trnaseq_file." > ".$cove_output_file; 2794 system($coves_cmd); 2795 if ($? != 0) 2796 { 2797 $log->error("Coves-SE for rescoring tRNAs cannot be completed successfully. (Exit Code: $?)"); 2798 return ("Error", "", -1); 2799 } 2800 2801 my $line = ""; 2802 my ($score, $trna_id); 2803 my $index = -1; 2804 my $cove_model = undef; 2805 my $new_int_result_file = $global_constants->get("temp_dir")."/tscan$$"."_sp_rescore.out"; 2806 my $int_results_out = tRNAscanSE::IntResultFile->new; 2807 my $tRNA = tRNAscanSE::tRNA->new; 2808 2809 $int_results_out->file_name($new_int_result_file); 2810 &open_for_read(\*FILE_H, $cove_output_file); 2811 if ($sp_int_results->open_file("read") && $int_results_out->open_file("write")) 2812 { 2813 while ($line = <FILE_H>) 2814 { 2815 chomp($line); 2816 if ($line =~ /^\s*([0-9\.]+) bits : (.+)$/) 2817 { 2818 $score = $1; 2819 $trna_id = $2; 2820 $index = $sp_int_results->bsearch_tRNAscan_id($trna_id); 2821 if ($index > -1) 2822 { 2823 $sp_int_results->get_tRNA($sp_int_results->get_pos($index), $tRNA); 2824 $tRNA->tRNAscan_id($tRNA->seqname().".tRNA".$tRNA->id()."-".$tRNA->isotype().$tRNA->anticodon()); 2825 if ($tRNA->model() ne $self->{Pselc_cm_file_path} and $tRNA->model() ne $self->{Eselc_cm_file_path} and 2826 $tRNA->model() ne $self->{isotype_cm_file_paths}->{SeC}) 2827 { 2828 $cove_model = $tRNA->get_domain_model("cove"); 2829 if (!defined $cove_model) 2830 { 2831 $tRNA->set_domain_model("cove", $score); 2832 } 2833 elsif ($score > $cove_model->{score}) 2834 { 2835 $tRNA->update_domain_model("cove", $score, $score, 0, 0); 2836 } 2837 } 2838 $tRNA->set_default_scores(); 2839 $int_results_out->write_tRNA($tRNA); 2840 } 2841 } 2842 } 2843 $sp_int_results->close_file(); 2844 $int_results_out->close_file(); 2845 $opts->secondpass_int_result_file($new_int_result_file); 2846 $sp_int_results->clear(); 2847 $sp_int_results->file_name($new_int_result_file); 2848 } 2849 close(FILE_H); 2850} 2851 2852sub rescore_tRNA 2853{ 2854 my $self = shift; 2855 my ($global_vars, $cm_tRNA, $prescan_tRNA) = @_; 2856 my $global_constants = $global_vars->{global_constants}; 2857 my $opts = $global_vars->{options}; 2858 my $log = $global_vars->{log_file}; 2859 2860 my $trna_file = tRNAscanSE::Sequence->new; 2861 $trna_file->open_file($global_constants->get("tmp_trnaseq_file"), "write"); 2862 $trna_file->set_seq_info($cm_tRNA->seqname().".trna".&pad_num($cm_tRNA->id(), 6), $cm_tRNA->seqname(), length($cm_tRNA->seq()), $cm_tRNA->seq()); 2863 $trna_file->write_fasta(); 2864 $trna_file->close_file(); 2865 2866 my $cm_file = $self->{main_cm_file_path}->{$cm_tRNA->model()}; 2867 my $cms_output_file = $global_constants->get("temp_dir")."/tscan$$"."_cm_rescore.out"; 2868 2869 my ($hit_score, $hit_start, $hit_end, $hit_ct) = 2870 $self->cmsearch_scoring($opts, $log, $global_constants->get("tmp_trnaseq_file"), $prescan_tRNA->tRNAscan_id(), $cm_file, $cms_output_file); 2871 $cm_tRNA->update_domain_model("infernal", $hit_score, $hit_score, 0, 0); 2872} 2873 2874# Runs Infernal cmsearch, and returns the score 2875sub cmsearch_scoring 2876{ 2877 my $self = shift; 2878 my ($opts, $log, $tmp_trnaseq_file, $seq_name, $cur_cm_file, $cms_output_file) = @_; 2879 2880 my (@cms_lines, $subseq_start, $subseq_end, $evalue, $score, $besthit_score, $line, 2881 $besthit_start, $besthit_end, $hit_ct); 2882 2883 my $scan_flag = 0; 2884 if ($opts->arch_mode() or $opts->bact_mode()) 2885 { 2886 $scan_flag = 7; 2887 } 2888 if (!$self->exec_cmsearch($cur_cm_file, $scan_flag, $tmp_trnaseq_file, $seq_name, $cms_output_file, $log, 0)) 2889 { 2890 return 0; 2891 } 2892 2893 @cms_lines = split(/\n/, `cat $cms_output_file`); 2894 $hit_ct = 0; 2895 $score = 0; 2896 $besthit_score = 0; 2897 2898 foreach $line (@cms_lines) 2899 { 2900 if ($line =~ /^\s+\(\d+\)\s+\S+\s+([e0-9.\-]+)\s+([0-9.\-]+)\s+\S+\s+\S+\s+(\d+)\s+(\d+)\s+\S+\s+(\d+)\s+(\d+)\s+([+-])\s+\S+\s+\S+\s+(\S+)\s+([0-9.]+)/) 2901 { 2902 $evalue = $1; 2903 $score = $2; 2904 $subseq_start = $5; 2905 $subseq_end = $6; 2906 $hit_ct++; 2907 2908 if ($score > $besthit_score) 2909 { 2910 $besthit_score = $score; 2911 $besthit_start = $subseq_start; 2912 $besthit_end = $subseq_end; 2913 } 2914 } 2915 } 2916 return ($besthit_score, $besthit_start, $besthit_end, $hit_ct); 2917} 2918 2919sub run_first_pass_cmsearch 2920{ 2921 my $self = shift; 2922 my ($global_vars, $seq_name) = @_; 2923 my $global_constants = $global_vars->{global_constants}; 2924 my $log = $global_vars->{log_file}; 2925 2926 my $scan_flag = 3; 2927 my $cms_output_file = ""; 2928 my $cms_merge_file = $global_constants->get("temp_dir")."/tscan$$"."_fp_cm_merge.out"; 2929 my $arrayCMscanResults = tRNAscanSE::ArrayCMscanResults->new; 2930 my $file_idx = -1; 2931 2932 foreach my $cur_cm (sort keys %{$self->{main_cm_file_path}}) 2933 { 2934 $cms_output_file = $global_constants->get("temp_dir")."/tscan$$"."_fp_cm_".$cur_cm.".out"; 2935 if ($self->exec_cmsearch($self->{main_cm_file_path}->{$cur_cm}, $scan_flag, $global_constants->get("tmp_fa"), $seq_name, $cms_output_file, $log, $self->{infernal_fp_cutoff})) 2936 { 2937 $file_idx = $arrayCMscanResults->add_file($cms_output_file, $cur_cm); 2938 $arrayCMscanResults->merge_result_file($file_idx, 0); 2939 } 2940 else 2941 { 2942 return ""; 2943 } 2944 } 2945 $arrayCMscanResults->write_merge_file($cms_merge_file, 1); 2946 my $count = $arrayCMscanResults->get_result_count(); 2947 2948 return ($cms_merge_file, $count); 2949} 2950 2951sub process_fp_cmsearch_hits 2952{ 2953 my $self = shift; 2954 my ($global_vars, $start_index, $cms_merge_file, $result_count) = @_; 2955 my $global_constants = $global_vars->{global_constants}; 2956 my $fp_tRNAs = $global_vars->{fp_tRNAs}; 2957 my $stats = $global_vars->{stats}; 2958 2959 my $line = ""; 2960 my @columns = (); 2961 my $trna = undef; 2962 my $i = 0; 2963 my $ct = 0; 2964 2965 &open_for_read(\*FILE_H, $cms_merge_file); 2966 while ($line = <FILE_H>) 2967 { 2968 $ct++; 2969 chomp($line); 2970 @columns = split(/\t/, $line); 2971 2972 $trna = tRNAscanSE::tRNA->new; 2973 $trna->seqname($columns[0]); 2974 $trna->score($columns[4]); 2975 if ($columns[3] eq "+") 2976 { 2977 $trna->start($columns[1] + $start_index); 2978 $trna->end($columns[2] + $start_index); 2979 } 2980 else 2981 { 2982 $trna->start($columns[2] + $start_index); 2983 $trna->end($columns[1] + $start_index); 2984 } 2985 $trna->strand($columns[3]); 2986 $trna->isotype("???"); 2987 $trna->anticodon("???"); 2988 $trna->ss($columns[5]); 2989 $trna->seq($columns[6]); 2990 $trna->model($columns[10]); 2991 2992 if ($trna->start() < $trna->end()) 2993 { 2994 $trna->position($trna->start()); 2995 } 2996 else 2997 { 2998 $trna->position($global_constants->get("really_big_number") - $trna->start() + 1); 2999 } 3000 3001 $trna->hit_source(0); 3002 3003 my $merge = 0; 3004 if ($ct < 3 or $ct > ($result_count - 3)) 3005 { 3006 $merge = $self->merge_repeat_hit($global_vars->{stats}, $global_vars->{fp_tRNAs}, $trna); 3007 } 3008 if (!$merge) 3009 { 3010 # insert non-redundant hit in order 3011 # 'Merge_repeat_hits' depends on list being in order 3012 $i = &max(0, $i - 10); 3013 while (($i < $fp_tRNAs->get_count()) && ($fp_tRNAs->get($i)->position() < $trna->position())) 3014 { 3015 $i++; 3016 } 3017 3018 $fp_tRNAs->insert($trna, $i); 3019 $stats->increment_trnatotal(); 3020 } 3021 } 3022 close(FILE_H); 3023} 3024 3025# check current hit for redundancy against all previous hits in hitlist 3026# 3027# if it IS a repeat, merge it with overlapping hit and return 1 3028# if it doesn't overlap with any hits, return 0 3029sub merge_repeat_hit 3030{ 3031 my $self = shift; 3032 my ($stats, $fp_tRNAs, $trna) = @_; 3033 3034 for (my $i = 0; $i < $fp_tRNAs->get_count(); $i++) 3035 { 3036 if ($trna->strand() eq "+") 3037 { 3038 if (($fp_tRNAs->get($i)->strand() eq "+") && 3039 (&seg_overlap($trna->start(), $trna->end(), $fp_tRNAs->get($i)->start(), $fp_tRNAs->get($i)->end(), 0))) 3040 { 3041 $fp_tRNAs->get($i)->start(&min($trna->start(), $fp_tRNAs->get($i)->start())); 3042 $fp_tRNAs->get($i)->end(&max($trna->end(), $fp_tRNAs->get($i)->end())); 3043 $fp_tRNAs->get($i)->hit_source($fp_tRNAs->get($i)->hit_source()); 3044 $fp_tRNAs->get($i)->isotype($trna->isotype()); 3045 $fp_tRNAs->get($i)->score($trna->score()); 3046 3047 # check to see if extended endpoint overlaps i+1 hit's start boundary 3048 # if so, combine hit[i] and hit[i+1] into one hit and delete hit[i+1] 3049 if (($i != ($fp_tRNAs->get_count() - 1)) and ($fp_tRNAs->get($i+1)->strand() eq "+") 3050 and ($fp_tRNAs->get($i)->end() >= $fp_tRNAs->get($i+1)->start())) 3051 { 3052 $fp_tRNAs->get($i)->end(&max($fp_tRNAs->get($i)->end(), $fp_tRNAs->get($i+1)->end())); 3053 $fp_tRNAs->get($i)->hit_source($fp_tRNAs->get($i)->hit_source() | $fp_tRNAs->get($i+1)->hit_source()); 3054 3055 $fp_tRNAs->remove($i+1); 3056 $stats->decrement_trnatotal(); 3057 } 3058 return 1; # exit loop immediately 3059 } 3060 } 3061 else # else (antisense) strand 3062 { 3063 if (($fp_tRNAs->get($i)->strand() eq "-") && 3064 (&seg_overlap($trna->end(), $trna->start(), $fp_tRNAs->get($i)->end(), $fp_tRNAs->get($i)->start(), 0))) 3065 { 3066 $fp_tRNAs->get($i)->start(&max($trna->start(), $fp_tRNAs->get($i)->start())); 3067 $fp_tRNAs->get($i)->end(&min($trna->end(), $fp_tRNAs->get($i)->end())); 3068 $fp_tRNAs->get($i)->hit_source($fp_tRNAs->get($i)->hit_source() | $self->{eufind_mask}); 3069 $fp_tRNAs->get($i)->isotype($trna->isotype()); 3070 $fp_tRNAs->get($i)->score($trna->score()); 3071 3072 if (($i != ($fp_tRNAs->get_count() - 1)) and 3073 ($fp_tRNAs->get($i)->end() <= $fp_tRNAs->get($i+1)->start())) 3074 { 3075 $fp_tRNAs->get($i)->end(&min($fp_tRNAs->get($i)->end(), $fp_tRNAs->get($i+1)->end())); 3076 $fp_tRNAs->get($i)->hit_source($fp_tRNAs->get($i)->hit_source() | $fp_tRNAs->get($i+1)->hit_source()); 3077 3078 $fp_tRNAs->remove($i+1); 3079 $stats->decrement_trnatotal(); 3080 } 3081 return 1; # exit loop immediately 3082 } 3083 } # else (antisense) strand 3084 3085 } # for each (hit) 3086 3087 return 0; # current hit is not a repeat 3088} 3089 3090sub first_pass_scan 3091{ 3092 my $self = shift; 3093 my ($global_vars, $start_index, $seq_name) = @_; 3094 my $opts = $global_vars->{options}; 3095 my $log = $global_vars->{log_file}; 3096 3097 if (!$opts->numt_mode()) 3098 { 3099 my ($cms_merge_file, $count) = $self->run_first_pass_cmsearch($global_vars, $seq_name); 3100 if ($cms_merge_file eq "") 3101 { 3102 $log->error("Fail to run infernal for first pass scan"); 3103 return 0; 3104 } 3105 else 3106 { 3107 $self->process_fp_cmsearch_hits($global_vars, $start_index, $cms_merge_file, $count); 3108 } 3109 } 3110 3111 return 1; 3112} 3113 3114sub analyze_alternate 3115{ 3116 my $self = shift; 3117 my ($global_vars, $seqinfo_flag, $seq_ct, $seq_name, $r_curseq_trnact) = @_; 3118 my $global_constants = $global_vars->{global_constants}; 3119 my $opts = $global_vars->{options}; 3120 my $stats = $global_vars->{stats}; 3121 my $log = $global_vars->{log_file}; 3122 my $fp_result_file = $global_vars->{fp_result_file}; 3123 my $arrayCMscanResults = tRNAscanSE::ArrayCMscanResults->new; 3124 my $prescan_tRNA = tRNAscanSE::tRNA->new; 3125 my $cm_tRNA = tRNAscanSE::tRNA->new; 3126 3127 my $over_cutoff = 0; 3128 my $cms_hit_count = 0; 3129 my ($trnaDesc, $fulltrnaDesc, $subseq_start, $subseq_end); 3130 my ($cms_confirmed_ct, $cur_cm_file); 3131 my $cm_hit = {}; 3132 3133 $cms_confirmed_ct = 0; 3134 3135 if (scalar(keys %{$self->{main_cm_file_path}}) == 0) 3136 { 3137 $log->error("No covariance models are found for scanning $seq_name. Please check the configuration file."); 3138 return 0; 3139 } 3140 3141 my ($cms_merge_file, $count) = $self->run_cmsearch($global_vars, $seq_name, $arrayCMscanResults, 0); 3142 if ($cms_merge_file eq "") 3143 { 3144 $log->error("Fail to run infernal for $seq_name"); 3145 return 0; 3146 } 3147 3148 $arrayCMscanResults->open_file($cms_merge_file); 3149 $fp_result_file->reset_current_seq(); 3150 $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $prescan_tRNA); 3151 $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 3152 while ($cm_tRNA->seqname() ne "") 3153 { 3154 $over_cutoff = 0; 3155 while (($cm_tRNA->seqname() ne "") and ($cm_tRNA->seqname() eq $prescan_tRNA->seqname().".t".&pad_num($prescan_tRNA->id(), 6))) 3156 { 3157 $cms_hit_count++; 3158 3159 $subseq_start = $cm_tRNA->start(); 3160 $subseq_end = $cm_tRNA->end(); 3161 $cm_tRNA->start($cm_tRNA->start() + $prescan_tRNA->start() - 1); 3162 $cm_tRNA->end($cm_tRNA->end() + $prescan_tRNA->start() - 1); 3163 3164 if ($cm_tRNA->score() < $self->{cm_cutoff}) 3165 { 3166 $log->broadcast("Low cmsearch score for ".$prescan_tRNA->tRNAscan_id().".$cms_hit_count: ".$cm_tRNA->score()); 3167 $trnaDesc .= "(CMSearch Hit#$cms_hit_count: ".$cm_tRNA->start()."-".$cm_tRNA->end().",". 3168 " Sc: ".$cm_tRNA->score().", Len: ".(abs($cm_tRNA->start() - $cm_tRNA->end()) + 1).") "; 3169 } 3170 else 3171 { 3172 $over_cutoff++; 3173 $$r_curseq_trnact++; 3174 3175 $cm_tRNA->id($$r_curseq_trnact); 3176 $cm_tRNA->seqname($prescan_tRNA->seqname()); 3177 $cm_tRNA->set_domain_model("infernal", $cm_tRNA->score()); 3178 if ($cm_tRNA->strand() eq "-") 3179 { 3180 my $temp = $cm_tRNA->start(); 3181 $cm_tRNA->start($cm_tRNA->end()); 3182 $cm_tRNA->end($temp); 3183 } 3184 3185 if ((length($prescan_tRNA->seq()) >= $subseq_end + 2) && 3186 (uc(substr($prescan_tRNA->seq(), $subseq_end, 3)) eq "CCA") && 3187 (substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 4) ne "....")) 3188 { 3189 $subseq_end += 3; 3190 if ($cm_tRNA->strand() eq "+") 3191 { 3192 $cm_tRNA->end($cm_tRNA->end() + 3); 3193 } 3194 else 3195 { 3196 $cm_tRNA->start($cm_tRNA->start() - 3); 3197 } 3198 $cm_tRNA->seq($cm_tRNA->seq()."CCA"); 3199 $cm_tRNA->ss($cm_tRNA->ss()."..."); 3200 } 3201 3202 $cm_tRNA->upstream($prescan_tRNA->upstream()); 3203 $cm_tRNA->downstream($prescan_tRNA->downstream()); 3204 if ($subseq_start > 1) 3205 { 3206 $cm_tRNA->upstream($cm_tRNA->upstream() . substr($prescan_tRNA->seq(), 0, $subseq_start - 1)); 3207 } 3208 if ($subseq_end < $prescan_tRNA->len()) 3209 { 3210 $cm_tRNA->downstream(substr($prescan_tRNA->seq(), $subseq_end) . $cm_tRNA->downstream()); 3211 } 3212 my $upstream_len = $global_constants->get("upstream_len"); 3213 my $downstream_len = $global_constants->get("downstream_len"); 3214 3215 $cm_tRNA->upstream(substr($cm_tRNA->upstream(), &max((length($cm_tRNA->upstream()) - $upstream_len), 0))); 3216 $cm_tRNA->downstream(substr($cm_tRNA->downstream(), 0, &min(length($cm_tRNA->downstream()), $downstream_len))); 3217 $self->decode_tRNA_properties($global_vars, $cm_tRNA, $prescan_tRNA, $self->{main_cm_file_path}->{$cm_tRNA->model()}); 3218 3219 $cm_tRNA->tRNAscan_id($cm_tRNA->seqname().".tRNA".$$r_curseq_trnact."-".$cm_tRNA->isotype().$cm_tRNA->anticodon()); 3220 $cm_tRNA->set_mature_tRNA(); 3221 $cm_tRNA->src_seqlen($prescan_tRNA->src_seqlen()); 3222 $cm_tRNA->src_seqid($cm_tRNA->seqname()); 3223 $cm_tRNA->ordered_seqname($prescan_tRNA->src_seqid()); 3224 $cm_tRNA->set_default_scores(); 3225 3226 $cms_confirmed_ct++; 3227 $global_vars->{sp_int_results}->write_tRNA($cm_tRNA); 3228 } 3229 3230 $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 3231 } 3232 3233 if ($over_cutoff == 0) 3234 { 3235 if ((!$opts->results_to_stdout()) && ($opts->eufind_mode() || $opts->tscan_mode() || $opts->use_prev_ts_run())) 3236 { 3237 $log->broadcast("CMSearch score(s) below cutoff for ".$prescan_tRNA->tRNAscan_id().". Skipping..."); 3238 } 3239 if ($opts->save_falsepos()) 3240 { 3241 $fulltrnaDesc = "(Fp Hit: ".$prescan_tRNA->start()."-".$prescan_tRNA->end().", ". 3242 (abs($prescan_tRNA->start() - $prescan_tRNA->end()) + 1)." bp, Src: ".$prescan_tRNA->hit_source().") ".$trnaDesc; 3243 3244 $stats->increment_fpos_base_ct(length($prescan_tRNA->seq())); 3245 &write_tRNA($opts->falsepos_file(), $prescan_tRNA->tRNAscan_id(), $fulltrnaDesc, $prescan_tRNA->seq(), 0); 3246 } 3247 } 3248 3249 $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $prescan_tRNA); 3250 } 3251 $arrayCMscanResults->close_file(); 3252 3253 return $cms_confirmed_ct; 3254} 3255 3256sub analyze_mito 3257{ 3258 my $self = shift; 3259 my ($global_vars, $seqinfo_flag, $seq_ct, $seq_name, $r_curseq_trnact) = @_; 3260 my $global_constants = $global_vars->{global_constants}; 3261 my $opts = $global_vars->{options}; 3262 my $stats = $global_vars->{stats}; 3263 my $log = $global_vars->{log_file}; 3264 my $fp_result_file = $global_vars->{fp_result_file}; 3265 my $arrayCMscanResults = tRNAscanSE::ArrayCMscanResults->new; 3266 my $prescan_tRNA = tRNAscanSE::tRNA->new; 3267 my $cm_tRNA = tRNAscanSE::tRNA->new; 3268 3269 my $over_cutoff = 0; 3270 my $cms_hit_count = 0; 3271 my ($trnaDesc, $fulltrnaDesc, $subseq_start, $subseq_end); 3272 my ($cms_confirmed_ct, $cur_cm_file); 3273 my $cm_hit = {}; 3274 3275 $cms_confirmed_ct = 0; 3276 3277 if (scalar(keys %{$self->{main_cm_file_path}}) == 0) 3278 { 3279 $log->error("No covariance models are found for scanning $seq_name. Please check the configuration file."); 3280 return 0; 3281 } 3282 3283 my ($cms_merge_file, $count) = $self->run_cmsearch($global_vars, $seq_name, $arrayCMscanResults, 10); 3284 if ($cms_merge_file eq "") 3285 { 3286 $log->error("Fail to run infernal for $seq_name"); 3287 return 0; 3288 } 3289 3290 $arrayCMscanResults->open_file($cms_merge_file); 3291 $fp_result_file->reset_current_seq(); 3292 $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $prescan_tRNA); 3293 $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 3294 while ($cm_tRNA->seqname() ne "") 3295 { 3296 $over_cutoff = 0; 3297 while (($cm_tRNA->seqname() ne "") and ($cm_tRNA->seqname() eq $prescan_tRNA->seqname().".t".&pad_num($prescan_tRNA->id(), 6))) 3298 { 3299 $cms_hit_count++; 3300 3301 $subseq_start = $cm_tRNA->start(); 3302 $subseq_end = $cm_tRNA->end(); 3303 $cm_tRNA->start($cm_tRNA->start() + $prescan_tRNA->start() - 1); 3304 $cm_tRNA->end($cm_tRNA->end() + $prescan_tRNA->start() - 1); 3305 3306 if ($cm_tRNA->score() < $self->{cm_cutoff}) 3307 { 3308 $log->broadcast("Low cmsearch score for ".$prescan_tRNA->tRNAscan_id().".$cms_hit_count: ".$cm_tRNA->score()); 3309 $trnaDesc .= "(CMSearch Hit#$cms_hit_count: ".$cm_tRNA->start()."-".$cm_tRNA->end().",". 3310 " Sc: ".$cm_tRNA->score().", Len: ".(abs($cm_tRNA->start() - $cm_tRNA->end()) + 1).") "; 3311 } 3312 else 3313 { 3314 $over_cutoff++; 3315 $$r_curseq_trnact++; 3316 3317 $cm_tRNA->id($$r_curseq_trnact); 3318 $cm_tRNA->seqname($prescan_tRNA->seqname()); 3319 $cm_tRNA->set_domain_model("infernal", $cm_tRNA->score()); 3320 if ($cm_tRNA->strand() eq "-") 3321 { 3322 my $temp = $cm_tRNA->start(); 3323 $cm_tRNA->start($cm_tRNA->end()); 3324 $cm_tRNA->end($temp); 3325 } 3326 3327 if ((length($prescan_tRNA->seq()) >= $subseq_end + 2) && 3328 (uc(substr($prescan_tRNA->seq(), $subseq_end, 3)) eq "CCA") && 3329 (substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 4) ne "....")) 3330 { 3331 $subseq_end += 3; 3332 if ($cm_tRNA->strand() eq "+") 3333 { 3334 $cm_tRNA->end($cm_tRNA->end() + 3); 3335 } 3336 else 3337 { 3338 $cm_tRNA->start($cm_tRNA->start() - 3); 3339 } 3340 $cm_tRNA->seq($cm_tRNA->seq()."CCA"); 3341 $cm_tRNA->ss($cm_tRNA->ss()."..."); 3342 } 3343 3344 $cm_tRNA->upstream($prescan_tRNA->upstream()); 3345 $cm_tRNA->downstream($prescan_tRNA->downstream()); 3346 if ($subseq_start > 1) 3347 { 3348 $cm_tRNA->upstream($cm_tRNA->upstream() . substr($prescan_tRNA->seq(), 0, $subseq_start - 1)); 3349 } 3350 if ($subseq_end < $prescan_tRNA->len()) 3351 { 3352 $cm_tRNA->downstream(substr($prescan_tRNA->seq(), $subseq_end) . $cm_tRNA->downstream()); 3353 } 3354 my $upstream_len = $global_constants->get("upstream_len"); 3355 my $downstream_len = $global_constants->get("downstream_len"); 3356 3357 $cm_tRNA->upstream(substr($cm_tRNA->upstream(), &max((length($cm_tRNA->upstream()) - $upstream_len), 0))); 3358 $cm_tRNA->downstream(substr($cm_tRNA->downstream(), 0, &min(length($cm_tRNA->downstream()), $downstream_len))); 3359 $self->decode_mito_tRNA_properties($global_vars, $cm_tRNA, $prescan_tRNA, $self->{main_cm_file_path}->{$cm_tRNA->model()}); 3360 3361 $cm_tRNA->tRNAscan_id($cm_tRNA->seqname().".tRNA".$$r_curseq_trnact."-".$cm_tRNA->isotype().$cm_tRNA->anticodon()); 3362 $cm_tRNA->set_mature_tRNA(); 3363 $cm_tRNA->src_seqlen($prescan_tRNA->src_seqlen()); 3364 $cm_tRNA->src_seqid($cm_tRNA->seqname()); 3365 $cm_tRNA->ordered_seqname($prescan_tRNA->src_seqid()); 3366 $cm_tRNA->set_default_scores(); 3367 3368 $cms_confirmed_ct++; 3369 $global_vars->{sp_int_results}->write_tRNA($cm_tRNA); 3370 } 3371 3372 $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 3373 } 3374 3375 if ($over_cutoff == 0) 3376 { 3377 if ((!$opts->results_to_stdout()) && ($opts->eufind_mode() || $opts->tscan_mode() || $opts->use_prev_ts_run())) 3378 { 3379 $log->broadcast("CMSearch score(s) below cutoff for ".$prescan_tRNA->tRNAscan_id().". Skipping..."); 3380 } 3381 if ($opts->save_falsepos()) 3382 { 3383 $fulltrnaDesc = "(Fp Hit: ".$prescan_tRNA->start()."-".$prescan_tRNA->end().", ". 3384 (abs($prescan_tRNA->start() - $prescan_tRNA->end()) + 1)." bp, Src: ".$prescan_tRNA->hit_source().") ".$trnaDesc; 3385 3386 $stats->increment_fpos_base_ct(length($prescan_tRNA->seq())); 3387 &write_tRNA($opts->falsepos_file(), $prescan_tRNA->tRNAscan_id(), $fulltrnaDesc, $prescan_tRNA->seq(), 0); 3388 } 3389 } 3390 3391 $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $prescan_tRNA); 3392 } 3393 $arrayCMscanResults->close_file(); 3394 3395 return $cms_confirmed_ct; 3396} 3397 3398sub analyze_with_cmsearch 3399{ 3400 my $self = shift; 3401 my ($global_vars, $seqinfo_flag, $seq_ct, $seq_name, $r_curseq_trnact) = @_; 3402 my $global_constants = $global_vars->{global_constants}; 3403 my $opts = $global_vars->{options}; 3404 my $stats = $global_vars->{stats}; 3405 my $log = $global_vars->{log_file}; 3406 my $fp_result_file = $global_vars->{fp_result_file}; 3407 my $arrayCMscanResults = tRNAscanSE::ArrayCMscanResults->new; 3408 my $prescan_tRNA = tRNAscanSE::tRNA->new; 3409 my $cm_tRNA = tRNAscanSE::tRNA->new; 3410 my $flanking_exist = 0; 3411 3412 my $over_cutoff = 0; 3413 my $cms_hit_count = 0; 3414 my $rescore = 0; 3415 my ($trnaDesc, $fulltrnaDesc, $subseq_start, $subseq_end); 3416 my ($cms_confirmed_ct, $cur_cm_file); 3417 my $cm_hit = {}; 3418 3419 $cms_confirmed_ct = 0; 3420 3421 my ($cms_merge_file, $count) = $self->run_cmsearch($global_vars, $seq_name, $arrayCMscanResults, 0); 3422 if ($cms_merge_file eq "") 3423 { 3424 $log->error("Fail to run infernal for $seq_name"); 3425 return 0; 3426 } 3427 3428 $arrayCMscanResults->open_file($cms_merge_file); 3429 $fp_result_file->reset_current_seq(); 3430 $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $prescan_tRNA); 3431 if ($fp_result_file->open_flanking("read")) 3432 { 3433 $fp_result_file->read_tRNA_flanking($prescan_tRNA); 3434 $flanking_exist = 1; 3435 } 3436 $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 3437 while ($cm_tRNA->seqname() ne "") 3438 { 3439 $over_cutoff = 0; 3440 while (($cm_tRNA->seqname() ne "") and ($cm_tRNA->seqname() eq $prescan_tRNA->seqname().".t".&pad_num($prescan_tRNA->id(), 6))) 3441 { 3442 $rescore = 0; 3443 $cms_hit_count++; 3444 if ($prescan_tRNA->hit_source() ne "") 3445 { 3446 $cm_tRNA->hit_source($prescan_tRNA->hit_source()); 3447 } 3448 $subseq_start = $cm_tRNA->start(); 3449 $subseq_end = $cm_tRNA->end(); 3450 if ($opts->infernal_fp()) 3451 { 3452 $cm_tRNA->strand($prescan_tRNA->strand()); 3453 if ($prescan_tRNA->strand() eq "+") 3454 { 3455 $cm_tRNA->start($cm_tRNA->start() + $prescan_tRNA->start() - 1); 3456 $cm_tRNA->end($cm_tRNA->end() + $prescan_tRNA->start() - 1); 3457 } 3458 else 3459 { 3460 if ($prescan_tRNA->start() > $prescan_tRNA->end()) 3461 { 3462 $cm_tRNA->start($prescan_tRNA->start() - $cm_tRNA->start() + 1); 3463 $cm_tRNA->end($prescan_tRNA->start() - $cm_tRNA->end() + 1); 3464 } 3465 else 3466 { 3467 $cm_tRNA->start($prescan_tRNA->end() - $cm_tRNA->start() + 1); 3468 $cm_tRNA->end($prescan_tRNA->end() - $cm_tRNA->end() + 1); 3469 } 3470 } 3471 } 3472 else 3473 { 3474 $cm_tRNA->start($cm_tRNA->start() + $prescan_tRNA->start() - 1); 3475 $cm_tRNA->end($cm_tRNA->end() + $prescan_tRNA->start() - 1); 3476 } 3477 3478 if ($cm_tRNA->score() < $self->{cm_cutoff}) 3479 { 3480 $log->broadcast("Low cmsearch score for ".$prescan_tRNA->tRNAscan_id().".$cms_hit_count: ".$cm_tRNA->score()); 3481 $trnaDesc .= "(CMSearch Hit#$cms_hit_count: ".$cm_tRNA->start()."-".$cm_tRNA->end().",". 3482 " Sc: ".$cm_tRNA->score().", Len: ".(abs($cm_tRNA->start() - $cm_tRNA->end()) + 1).") "; 3483 } 3484 else 3485 { 3486 $over_cutoff++; 3487 $$r_curseq_trnact++; 3488 3489 $cm_tRNA->id($$r_curseq_trnact); 3490 $cm_tRNA->seqname($prescan_tRNA->seqname()); 3491 $cm_tRNA->set_domain_model("infernal", $cm_tRNA->score()); 3492 if ($cm_tRNA->strand() eq "-") 3493 { 3494 my $temp = $cm_tRNA->start(); 3495 $cm_tRNA->start($cm_tRNA->end()); 3496 $cm_tRNA->end($temp); 3497 } 3498 3499 if ((length($prescan_tRNA->seq()) >= $subseq_end + 2) && 3500 (uc(substr($prescan_tRNA->seq(), $subseq_end, 3)) eq "CCA") && 3501 (substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 4) ne "....") && 3502 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cm", "eukaryota")) && 3503 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cm", "general"))) 3504 { 3505 $subseq_end += 3; 3506 if ($cm_tRNA->strand() eq "+") 3507 { 3508 $cm_tRNA->end($cm_tRNA->end() + 3); 3509 } 3510 else 3511 { 3512 $cm_tRNA->start($cm_tRNA->start() - 3); 3513 } 3514 $cm_tRNA->seq($cm_tRNA->seq()."CCA"); 3515 $cm_tRNA->ss($cm_tRNA->ss()."..."); 3516 $rescore = 1; 3517 } 3518 3519 if ((uc(substr($cm_tRNA->seq(), length($cm_tRNA->seq()) - 3)) ne "CCA") && 3520 (substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 4) eq "....") && 3521 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cm", "eukaryota")) && 3522 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cm", "general"))) 3523 { 3524 $subseq_end -= 3; 3525 if ($cm_tRNA->strand() eq "+") 3526 { 3527 $cm_tRNA->end($cm_tRNA->end() - 3); 3528 } 3529 else 3530 { 3531 $cm_tRNA->start($cm_tRNA->start() + 3); 3532 } 3533 $cm_tRNA->seq(substr($cm_tRNA->seq(), 0, length($cm_tRNA->seq()) - 3)); 3534 $cm_tRNA->ss(substr($cm_tRNA->ss(), 0, length($cm_tRNA->ss()) - 3)); 3535 $rescore = 1; 3536 } 3537 elsif ((uc(substr($cm_tRNA->seq(), length($cm_tRNA->seq()) - 3)) eq "CCA") && 3538 (((substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 6) eq "<.....") && 3539 (substr($cm_tRNA->seq(), length($cm_tRNA->seq()) - 5, 2) =~ /[acgtn][ACGTN]/)) || 3540 ((substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 7) eq "<......") && 3541 (substr($cm_tRNA->seq(), length($cm_tRNA->seq()) - 6, 3) =~ /[ACGTN][acgtn][ACGTN]/)) || 3542 ((substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 7) eq "<......") && 3543 (substr($cm_tRNA->seq(), length($cm_tRNA->seq()) - 6, 3) =~ /[acgtn]{2}[ACGTN]/))) && 3544 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cm", "eukaryota")) && 3545 ($self->{main_cm_file_path}->{Domain} ne $global_constants->get_subvalue("cm", "general"))) 3546 { 3547 my $trim_len = 4; 3548 if (substr($cm_tRNA->ss(), length($cm_tRNA->ss()) - 7) eq "<......" && substr($cm_tRNA->seq(), length($cm_tRNA->seq()) - 6, 3) =~ /[acgtn]{2}[ACGTN]/) 3549 { 3550 $trim_len = 5; 3551 } 3552 3553 $subseq_end -= $trim_len; 3554 if ($cm_tRNA->strand() eq "+") 3555 { 3556 $cm_tRNA->end($cm_tRNA->end() - $trim_len); 3557 } 3558 else 3559 { 3560 $cm_tRNA->start($cm_tRNA->start() + $trim_len); 3561 } 3562 $cm_tRNA->seq(substr($cm_tRNA->seq(), 0, length($cm_tRNA->seq()) - $trim_len)); 3563 $cm_tRNA->seq(substr($cm_tRNA->seq(), 0, length($cm_tRNA->seq()) - 1).uc(substr($cm_tRNA->seq(), length($cm_tRNA->seq()) - 1, 1))); 3564 $cm_tRNA->ss(substr($cm_tRNA->ss(), 0, length($cm_tRNA->ss()) - $trim_len)); 3565 $rescore = 1; 3566 } 3567 3568 if ($rescore) 3569 { 3570 $self->rescore_tRNA($global_vars, $cm_tRNA, $prescan_tRNA); 3571 } 3572 3573 $cm_tRNA->upstream($prescan_tRNA->upstream()); 3574 $cm_tRNA->downstream($prescan_tRNA->downstream()); 3575 if ($subseq_start > 1) 3576 { 3577 $cm_tRNA->upstream($cm_tRNA->upstream() . substr($prescan_tRNA->seq(), 0, $subseq_start - 1)); 3578 } 3579 if ($subseq_end < $prescan_tRNA->len()) 3580 { 3581 $cm_tRNA->downstream(substr($prescan_tRNA->seq(), $subseq_end) . $cm_tRNA->downstream()); 3582 } 3583 if (!$opts->tscan_mode() and !$opts->eufind_mode() and !$opts->infernal_fp()) 3584 { 3585 my $upstream_len = $global_constants->get("upstream_len"); 3586 my $downstream_len = $global_constants->get("downstream_len"); 3587 3588 $cm_tRNA->upstream(substr($cm_tRNA->upstream(), &max((length($cm_tRNA->upstream()) - $upstream_len), 0))); 3589 $cm_tRNA->downstream(substr($cm_tRNA->downstream(), 0, &min(length($cm_tRNA->downstream()), $downstream_len))); 3590 } 3591 $self->decode_tRNA_properties($global_vars, $cm_tRNA, $prescan_tRNA, $self->{main_cm_file_path}->{$cm_tRNA->model()}); 3592 3593 $cm_tRNA->tRNAscan_id($cm_tRNA->seqname().".tRNA".$$r_curseq_trnact."-".$cm_tRNA->isotype().$cm_tRNA->anticodon()); 3594 $cm_tRNA->src_seqlen($prescan_tRNA->src_seqlen()); 3595 $cm_tRNA->src_seqid($cm_tRNA->seqname()); 3596 $cm_tRNA->ordered_seqname($prescan_tRNA->src_seqid()); 3597 $cm_tRNA->set_default_scores(); 3598 $self->fix_fMet($global_vars, $cm_tRNA); 3599 $self->fix_His($global_vars, $cm_tRNA); 3600 $cm_tRNA->set_mature_tRNA(); 3601 3602 $cms_confirmed_ct++; 3603 $global_vars->{sp_int_results}->write_tRNA($cm_tRNA); 3604 } 3605 3606 $arrayCMscanResults->get_next_cmsearch_hit($cm_tRNA); 3607 } 3608 3609 if ($over_cutoff == 0) 3610 { 3611 if ((!$opts->results_to_stdout()) && ($opts->eufind_mode() || $opts->tscan_mode() || $opts->use_prev_ts_run())) 3612 { 3613 $log->broadcast("CMSearch score(s) below cutoff for ".$prescan_tRNA->tRNAscan_id().". Skipping..."); 3614 } 3615 if ($opts->save_falsepos()) 3616 { 3617 $fulltrnaDesc = "(Fp Hit: ".$prescan_tRNA->start()."-".$prescan_tRNA->end().", ". 3618 (abs($prescan_tRNA->start() - $prescan_tRNA->end()) + 1)." bp, Src: ".$prescan_tRNA->hit_source().") ".$trnaDesc; 3619 3620 $stats->increment_fpos_base_ct(length($prescan_tRNA->seq())); 3621 &write_tRNA($opts->falsepos_file(), $prescan_tRNA->tRNAscan_id(), $fulltrnaDesc, $prescan_tRNA->seq(), 0); 3622 } 3623 } 3624 3625 $fp_result_file->get_next_tRNA_candidate($opts, $seqinfo_flag, $seq_ct, $prescan_tRNA); 3626 if ($flanking_exist) 3627 { 3628 $fp_result_file->read_tRNA_flanking($prescan_tRNA); 3629 } 3630 } 3631 $arrayCMscanResults->close_file(); 3632 3633 return $cms_confirmed_ct; 3634} 3635 3636sub sort_cm_hits_by_start 3637{ 3638 my $self = shift; 3639 my $cms_hits = shift; 3640 3641 my $a_start = $a->{start}; 3642 my $b_start = $b->{start}; 3643 3644 if ($a->{strand} == 0) { 3645 $a_start = $a->{end}; 3646 } 3647 if ($b->{strand} == 0) { 3648 $b_start = $b->{end}; 3649 } 3650 3651 return ($a->{seqname} cmp $b->{seqname} || 3652 $a_start <=> $b_start); 3653} 3654 36551; 3656