1# tRNAscanSE/Stats.pm 2# This class describes the statistics of each run in tRNAscan-SE. 3# 4# -------------------------------------------------------------- 5# This module is part of the tRNAscan-SE program. 6# Copyright (C) 2017 Patricia Chan and Todd Lowe 7# -------------------------------------------------------------- 8# 9package tRNAscan::Stats; 10 11use strict; 12use tRNAscanSE::Utils; 13use tRNAscanSE::Options; 14use tRNAscanSE::GeneticCode; 15 16sub new 17{ 18 my $class = shift; 19 my $self = {}; 20 21 initialize($self); 22 23 bless ($self, $class); 24 return $self; 25} 26 27sub DESTROY 28{ 29 my $self = shift; 30} 31 32sub initialize 33{ 34 my $self = shift; 35 $self->{file_name} = ""; # name of log file 36 $self->{FILE_H} = undef; # file handle 37 38 $self->{fp_start_time} = +[]; # save first pass starting time 39 $self->{fp_end_time} = +[]; # save first pass ending time 40 $self->{sp_end_time} = +[]; # save second pass ending time 41 $self->{seqs_hit} = 0; # num seqs with at least one trna hit 42 $self->{numscanned} = 0; # total sequences scanned 43 $self->{trnatotal} = 0; # total trnas found by tscan 44 45 $self->{first_pass_base_ct} = 0; # no bases in all seqs in first pass scans 46 $self->{fpass_trna_base_ct} = 0; # no bases in tRNAs in first pass scans 47 $self->{fpos_base_ct} = 0; # no bases in false positive tRNAs 48 $self->{secpass_base_ct} = 0; 49 $self->{coves_base_ct} = 0; 50 $self->{total_secpass_ct} = 0; 51} 52 53sub file_name 54{ 55 my $self = shift; 56 if (@_) { $self->{file_name} = shift; } 57 return $self->{file_name}; 58} 59 60sub FILE_H 61{ 62 my $self = shift; 63 return $self->{FILE_H}; 64} 65 66sub start_fp_timer 67{ 68 my $self = shift; 69 @{$self->{fp_start_time}} = (times)[0,2,1,3]; 70 $self->{fp_end_time} = +[]; 71 $self->{sp_end_time} = +[]; 72} 73 74sub end_fp_timer 75{ 76 my $self = shift; 77 @{$self->{fp_end_time}} = (times)[0,2,1,3]; 78} 79 80sub start_sp_timer 81{ 82 my $self = shift; 83 if (!defined $self->{fp_end_time}->[0]) { 84 push (@{$self->{fp_end_time}}, @{$self->{fp_start_time}}); 85 } 86} 87 88sub end_sp_timer 89{ 90 my $self = shift; 91 @{$self->{sp_end_time}} = (times)[0,2,1,3]; 92} 93 94sub seqs_hit 95{ 96 my $self = shift; 97 if (@_) { $self->{seqs_hit} = shift; } 98 return $self->{seqs_hit}; 99} 100 101sub increment_seqs_hit 102{ 103 my $self = shift; 104 if (@_) { $self->{seqs_hit} += shift; } 105 else { $self->{seqs_hit} += 1;} 106} 107 108sub numscanned 109{ 110 my $self = shift; 111 if (@_) { $self->{numscanned} = shift; } 112 return $self->{numscanned}; 113} 114 115sub increment_numscanned 116{ 117 my $self = shift; 118 if (@_) { $self->{numscanned} += shift; } 119 else { $self->{numscanned} += 1;} 120} 121 122sub trnatotal 123{ 124 my $self = shift; 125 if (@_) { $self->{trnatotal} = shift; } 126 return $self->{trnatotal}; 127} 128 129sub increment_trnatotal 130{ 131 my $self = shift; 132 if (@_) { $self->{trnatotal} += shift; } 133 else { $self->{trnatotal} += 1;} 134} 135 136sub decrement_trnatotal 137{ 138 my $self = shift; 139 if (@_) { $self->{trnatotal} -= shift; } 140 else { $self->{trnatotal} -= 1;} 141} 142 143sub first_pass_base_ct 144{ 145 my $self = shift; 146 if (@_) { $self->{first_pass_base_ct} = shift; } 147 return $self->{first_pass_base_ct}; 148} 149 150sub increment_first_pass_base_ct 151{ 152 my $self = shift; 153 if (@_) { $self->{first_pass_base_ct} += shift; } 154 else { $self->{first_pass_base_ct} += 1;} 155} 156 157sub fpass_trna_base_ct 158{ 159 my $self = shift; 160 if (@_) { $self->{fpass_trna_base_ct} = shift; } 161 return $self->{fpass_trna_base_ct}; 162} 163 164sub fpos_base_ct 165{ 166 my $self = shift; 167 if (@_) { $self->{fpos_base_ct} = shift; } 168 return $self->{fpos_base_ct}; 169} 170 171sub increment_fpos_base_ct 172{ 173 my $self = shift; 174 if (@_) { $self->{fpos_base_ct} += shift; } 175 else { $self->{fpos_base_ct} += 1;} 176} 177 178sub secpass_base_ct 179{ 180 my $self = shift; 181 if (@_) { $self->{secpass_base_ct} = shift; } 182 return $self->{secpass_base_ct}; 183} 184 185sub increment_secpass_base_ct 186{ 187 my $self = shift; 188 if (@_) { $self->{secpass_base_ct} += shift; } 189 else { $self->{secpass_base_ct} += 1;} 190} 191 192sub coves_base_ct 193{ 194 my $self = shift; 195 if (@_) { $self->{coves_base_ct} = shift; } 196 return $self->{coves_base_ct}; 197} 198 199sub increment_coves_base_ct 200{ 201 my $self = shift; 202 if (@_) { $self->{coves_base_ct} += shift; } 203 else { $self->{coves_base_ct} += 1;} 204} 205 206sub total_secpass_ct 207{ 208 my $self = shift; 209 if (@_) { $self->{total_secpass_ct} = shift; } 210 return $self->{total_secpass_ct}; 211} 212 213sub increment_total_secpass_ct 214{ 215 my $self = shift; 216 if (@_) { $self->{total_secpass_ct} += shift; } 217 else { $self->{total_secpass_ct} += 1;} 218} 219 220sub open_file 221{ 222 my $self = shift; 223 224 my $success = 0; 225 226 if ($self->{file_name} ne "") 227 { 228 &open_for_append(\$self->{FILE_H}, $self->{file_name}); 229 $success = 1; 230 } 231 else 232 { 233 die "Statistics file name is not set.\n" 234 } 235 236 return $success; 237} 238 239sub close_file 240{ 241 my $self = shift; 242 243 if (defined $self->{FILE_H}) 244 { 245 close($self->{FILE_H}); 246 } 247} 248 249sub write_line 250{ 251 my $self = shift; 252 my $line = shift; 253 254 my $fh = $self->{FILE_H}; 255 256 print $fh $line . "\n"; 257} 258 259sub save_firstpass_stats 260{ 261 my $self = shift; 262 my $fh = $self->{FILE_H}; 263 264 print $fh "First-pass Stats:\n", 265 "---------------\n"; 266 print $fh "Sequences read: $self->{numscanned}\n"; 267 print $fh "Seqs w/at least 1 hit: $self->{seqs_hit}\n"; 268 print $fh "Bases read: $self->{first_pass_base_ct} (x2 for both strands)\n"; 269 print $fh "Bases in tRNAs: $self->{fpass_trna_base_ct}\n"; 270 print $fh "tRNAs predicted: $self->{trnatotal}\n"; 271 printf $fh "Av. tRNA length: %d\n", 272 int($self->{fpass_trna_base_ct} / &max(1, $self->{trnatotal})); 273 printf $fh "Script CPU time: %.2f s\n", 274 $self->{fp_end_time}->[0] - $self->{fp_start_time}->[0]; 275 printf $fh "Scan CPU time: %.2f s\n", 276 $self->{fp_end_time}->[1] - $self->{fp_start_time}->[1]; 277 printf $fh "Scan speed: %.1f Kbp/sec\n", $self->{first_pass_base_ct}*2/ 278 (&max(0.001, $self->{fp_end_time}->[1] - $self->{fp_start_time}->[1]))/1000; 279 print $fh "\nFirst pass search(es) ended: ",`date`,"\n"; 280} 281 282sub save_final_stats 283{ 284 my $self = shift; 285 my $opts = shift; 286 my $gc = shift; 287 my $prescan_count = shift; 288 my $r_tab_results = shift; 289 my $fh = $self->{FILE_H}; 290 my $second_pass_label = $opts->second_pass_label(); 291 292 if ($opts->CM_mode() ne "") 293 { 294 print $fh "$second_pass_label Stats:\n-----------\n"; 295 296 if ($opts->tscan_mode() || $opts->eufind_mode() || $opts->infernal_fp()) 297 { 298 print $fh "Candidate tRNAs read: ", $prescan_count,"\n"; 299 } 300 else 301 { 302 print $fh "Sequences read: $self->{numscanned}\n"; 303 } 304 print $fh "$second_pass_label","-confirmed tRNAs: $self->{total_secpass_ct}\n"; 305 print $fh "Bases scanned by $second_pass_label: $self->{secpass_base_ct}\n"; 306 printf $fh "%% seq scanned by $second_pass_label: %2.1f %%\n", 307 &min(($self->{secpass_base_ct} / &max(1, $self->{first_pass_base_ct} * 2)) * 100,100); 308 printf $fh "Script CPU time: %2.2f s\n", $self->{sp_end_time}->[0] - $self->{fp_end_time}->[0]; 309 printf $fh "$second_pass_label CPU time: %2.2f s\n", $self->{sp_end_time}->[1] - $self->{fp_end_time}->[1]; 310 printf $fh "Scan speed: %.1f bp/sec\n", $self->{secpass_base_ct}/ 311 &max(0.001, $self->{sp_end_time}->[1] - $self->{fp_end_time}->[1]); 312 print $fh "\n$second_pass_label analysis of tRNAs ended: ",`date`,"\n"; 313 if ($opts->tscan_mode() || $opts->eufind_mode()) 314 { 315 print $fh "Summary\n--------\n"; 316 } 317 } 318 my $total_time = ($self->{sp_end_time}->[0] - $self->{fp_start_time}->[0]) + 319 ($self->{sp_end_time}->[1] - $self->{fp_start_time}->[1]); 320 printf $fh "Overall scan speed: %.1f bp/sec\n", 321 &max($self->{first_pass_base_ct} * 2, $self->{secpass_base_ct}) / &max(0.001, $total_time); 322 323 $self->output_summary($opts, $gc, $r_tab_results); 324} 325 326sub output_summary 327{ 328 my $self = shift; 329 my $opts = shift; 330 my $gc = shift; 331 my $r_tab_results = shift; 332 my $fh = $self->{FILE_H}; 333 334 my ($trna_ct, $selcys_ct, $stop_sup_ct, $undet_ct, $pseudo_ct, $mismatch_ct, 335 $total, $intron_ct, $line); 336 my (%iso_AR, %ac_AR, %intron_ac_AR, %iso_cm_AR); 337 my ($iso, $ac, $acset, $iso_count, $iso_cm_count, $istart, $aa, $iso_cm); 338 my @columns = (); 339 my $note_col = -1; 340 my $iso_cm_col = -1; 341 342 $trna_ct = 0; 343 $selcys_ct = 0; 344 $pseudo_ct = 0; 345 $mismatch_ct = 0; 346 $undet_ct = 0; 347 $intron_ct = 0; 348 $stop_sup_ct = 0; 349 $total = 0; 350 351 $line = shift(@$r_tab_results); 352 353 while ($line ne '') 354 { 355 if ($line =~ /^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+([0-9\,]+)\s+([0-9\,]+)\s+(\S+)/) 356 { 357 @columns = split(/\t/, $line, -1); 358 359 $iso = $columns[4]; 360 $ac = $columns[5]; 361 $istart = &trim($columns[6]); 362 363 if ($note_col == -1) 364 { 365 $note_col = scalar(@columns) - 1; 366 } 367 if ($iso_cm_col == -1) 368 { 369 if ((($opts->euk_mode() or $opts->bact_mode() or $opts->arch_mode()) and !$opts->no_isotype() and $opts->detail()) or $opts->metagenome_mode()) 370 { 371 $iso_cm_col = $note_col - 2; 372 } 373 if ($opts->euk_mode() and $opts->mito_model() ne "") 374 { 375 $iso_cm_col--; 376 } 377 } 378 if ($iso_cm_col > -1) 379 { 380 $iso_cm = $columns[$iso_cm_col]; 381 } 382 383 if ($note_col > -1 and index($columns[$note_col], "pseudo") > -1) 384 { 385 $pseudo_ct++; 386 $iso_AR{"Pseudo"}++; 387 } 388 elsif ($note_col > -1 and index($columns[$note_col], "IPD") > -1) 389 { 390 $mismatch_ct++; 391 } 392 elsif ($iso eq $gc->undef_isotype()) 393 { 394 $undet_ct++; 395 } 396 elsif ($iso =~ /SeC/) 397 { 398 $selcys_ct++; 399 } 400 elsif ($iso eq "Sup") 401 { 402 $stop_sup_ct++; 403 } 404 else 405 { 406 $trna_ct++; 407 } 408 409 if ($iso =~ /SeC/) 410 { 411 $iso_AR{"SelCys"}++; 412 $ac_AR{"SelCys"}->{$ac}++; 413 } 414 elsif ($iso eq "Sup") 415 { 416 $iso_AR{"Supres"}++; 417 $ac_AR{"Supres"}->{$ac}++; 418 } 419 else 420 { 421 $iso_AR{$iso}++; 422 $ac_AR{$iso}->{$ac}++; 423 } 424 if (defined $iso_cm) 425 { 426 $iso_cm_AR{$iso_cm}++; 427 } 428 429 if ($istart ne "0") 430 { 431 my @introns = split(/\,/, $istart); 432 $intron_ct += scalar(@introns); 433 my $tag = $iso; 434 if ($iso =~ /SeC/) 435 { 436 $tag = "SelCys"; 437 } 438 elsif ($iso eq "Sup") 439 { 440 $tag = "Supres"; 441 } 442 $intron_ac_AR{$tag}->{$ac} += scalar(@introns); 443 } 444 } 445 $line = shift(@$r_tab_results); 446 } 447 448 $total = $trna_ct + $selcys_ct + $pseudo_ct + $mismatch_ct + $undet_ct + $stop_sup_ct; 449 450 451 print $fh "\n", 452 "tRNAs decoding Standard 20 AA: $trna_ct\n", 453 "Selenocysteine tRNAs (TCA): $selcys_ct\n", 454 "Possible suppressor tRNAs (CTA,TTA,TCA): $stop_sup_ct\n", 455 "tRNAs with undetermined/unknown isotypes: $undet_ct\n"; 456 457 if ((($opts->euk_mode() or $opts->bact_mode() or $opts->arch_mode()) and !$opts->no_isotype() and $opts->detail()) or $opts->metagenome_mode()) 458 { 459 print $fh "tRNAs with mismatch isotypes: $mismatch_ct\n"; 460 } 461 462 print $fh "Predicted pseudogenes: $pseudo_ct\n", 463 " -------\n", 464 "Total tRNAs: $total\n\n", 465 466 "tRNAs with introns: \t$intron_ct\n\n"; 467 468 foreach $aa (@{$gc->isotypes()}) 469 { 470 foreach $acset ($gc->ac_list()->{$aa}) 471 { 472 foreach $ac (@$acset) 473 { 474 if (defined($intron_ac_AR{$aa}->{$ac})) 475 { 476 print $fh "| $aa-$ac: $intron_ac_AR{$aa}->{$ac} "; 477 } 478 } 479 } 480 } 481 if (scalar(keys %intron_ac_AR) > 0) 482 { 483 print $fh "|\n\n"; 484 } 485 print $fh "Isotype / Anticodon Counts:\n\n"; 486 487 foreach $aa (@{$gc->isotypes()}) 488 { 489 my $label = $aa; 490 $iso_count = $iso_AR{$aa} + 0; 491 $iso_cm_count = $iso_cm_AR{$aa} + 0; 492 if ($aa eq "Met") 493 { 494 if (defined $iso_AR{iMet}) 495 { 496 $label = "Met/iMet"; 497 $iso_count += $iso_AR{iMet}; 498 } 499 elsif (defined $iso_AR{fMet}) 500 { 501 $label = "Met/fMet"; 502 $iso_count += $iso_AR{fMet}; 503 } 504 if (defined $iso_cm_AR{iMet}) 505 { 506 $iso_cm_count += $iso_cm_AR{iMet}; 507 } 508 elsif (defined $iso_cm_AR{fMet}) 509 { 510 $iso_cm_count += $iso_cm_AR{fMet}; 511 } 512 } 513 elsif ($aa eq "Ile") 514 { 515 if (defined $iso_AR{Ile2}) 516 { 517 $iso_count += $iso_AR{Ile2}; 518 } 519 if (defined $iso_cm_AR{Ile2}) 520 { 521 $iso_cm_count += $iso_cm_AR{Ile2}; 522 } 523 } 524 525 if ((($opts->euk_mode() or $opts->bact_mode() or $opts->arch_mode()) and !$opts->no_isotype() and $opts->detail()) or $opts->metagenome_mode()) 526 { 527 printf $fh ("%-8s: %d (%d)\t", $label, $iso_count, $iso_cm_count); 528 } 529 else 530 { 531 printf $fh ("%-8s: %d\t", $label, $iso_count); 532 } 533 534 foreach $acset ($gc->ac_list()->{$aa}) 535 { 536 foreach $ac (@$acset) 537 { 538 if ($ac eq " ") 539 { 540 print $fh " "; 541 } 542 else 543 { 544 my $count = $ac_AR{$aa}->{$ac}; 545 if ($aa eq "Met") 546 { 547 if (defined $ac_AR{iMet}->{$ac}) 548 { 549 $count += $ac_AR{iMet}->{$ac}; 550 } 551 elsif (defined $ac_AR{fMet}->{$ac}) 552 { 553 $count += $ac_AR{fMet}->{$ac}; 554 } 555 } 556 elsif ($aa eq "Ile") 557 { 558 if (defined $ac_AR{Ile2}->{$ac}) 559 { 560 $count += $ac_AR{Ile2}->{$ac}; 561 } 562 } 563 printf $fh ("%5s: %-6s",$ac,$count); 564 } 565 } 566 } 567 568 print $fh "\n"; 569 570 } 571 print $fh "\n"; 572} 573 5741; 575