1# tRNAscanSE/Stats.pm
2# This class describes the statistics of each run in tRNAscan-SE.
3#
4# --------------------------------------------------------------
5# This module is part of the tRNAscan-SE program.
6# Copyright (C) 2017 Patricia Chan and Todd Lowe
7# --------------------------------------------------------------
8#
9package tRNAscan::Stats;
10
11use strict;
12use tRNAscanSE::Utils;
13use tRNAscanSE::Options;
14use tRNAscanSE::GeneticCode;
15
16sub new
17{
18    my $class = shift;
19    my $self = {};
20
21    initialize($self);
22
23    bless ($self, $class);
24    return $self;
25}
26
27sub DESTROY
28{
29    my $self = shift;
30}
31
32sub initialize
33{
34    my $self = shift;
35    $self->{file_name} = "";             # name of log file
36    $self->{FILE_H} = undef;             # file handle
37
38    $self->{fp_start_time} = +[];        # save first pass starting time
39    $self->{fp_end_time} = +[];          # save first pass ending time
40    $self->{sp_end_time} = +[];          # save second pass ending time
41    $self->{seqs_hit} = 0;               # num seqs with at least one trna hit
42    $self->{numscanned} = 0;             # total sequences scanned
43    $self->{trnatotal} = 0;              # total trnas found by tscan
44
45    $self->{first_pass_base_ct} = 0;     # no bases in all seqs in first pass scans
46    $self->{fpass_trna_base_ct} = 0;     # no bases in tRNAs in first pass scans
47    $self->{fpos_base_ct} = 0;           # no bases in false positive tRNAs
48    $self->{secpass_base_ct} = 0;
49    $self->{coves_base_ct} = 0;
50    $self->{total_secpass_ct} = 0;
51}
52
53sub file_name
54{
55    my $self = shift;
56    if (@_) { $self->{file_name} = shift; }
57    return $self->{file_name};
58}
59
60sub FILE_H
61{
62    my $self = shift;
63    return $self->{FILE_H};
64}
65
66sub start_fp_timer
67{
68    my $self = shift;
69    @{$self->{fp_start_time}} = (times)[0,2,1,3];
70    $self->{fp_end_time} = +[];
71    $self->{sp_end_time} = +[];
72}
73
74sub end_fp_timer
75{
76    my $self = shift;
77    @{$self->{fp_end_time}} = (times)[0,2,1,3];
78}
79
80sub start_sp_timer
81{
82    my $self = shift;
83    if (!defined $self->{fp_end_time}->[0]) {
84        push (@{$self->{fp_end_time}}, @{$self->{fp_start_time}});
85    }
86}
87
88sub end_sp_timer
89{
90    my $self = shift;
91    @{$self->{sp_end_time}} = (times)[0,2,1,3];
92}
93
94sub seqs_hit
95{
96    my $self = shift;
97    if (@_) { $self->{seqs_hit} = shift; }
98    return $self->{seqs_hit};
99}
100
101sub increment_seqs_hit
102{
103    my $self = shift;
104    if (@_) { $self->{seqs_hit} += shift; }
105    else { $self->{seqs_hit} += 1;}
106}
107
108sub numscanned
109{
110    my $self = shift;
111    if (@_) { $self->{numscanned} = shift; }
112    return $self->{numscanned};
113}
114
115sub increment_numscanned
116{
117    my $self = shift;
118    if (@_) { $self->{numscanned} += shift; }
119    else { $self->{numscanned} += 1;}
120}
121
122sub trnatotal
123{
124    my $self = shift;
125    if (@_) { $self->{trnatotal} = shift; }
126    return $self->{trnatotal};
127}
128
129sub increment_trnatotal
130{
131    my $self = shift;
132    if (@_) { $self->{trnatotal} += shift; }
133    else { $self->{trnatotal} += 1;}
134}
135
136sub decrement_trnatotal
137{
138    my $self = shift;
139    if (@_) { $self->{trnatotal} -= shift; }
140    else { $self->{trnatotal} -= 1;}
141}
142
143sub first_pass_base_ct
144{
145    my $self = shift;
146    if (@_) { $self->{first_pass_base_ct} = shift; }
147    return $self->{first_pass_base_ct};
148}
149
150sub increment_first_pass_base_ct
151{
152    my $self = shift;
153    if (@_) { $self->{first_pass_base_ct} += shift; }
154    else { $self->{first_pass_base_ct} += 1;}
155}
156
157sub fpass_trna_base_ct
158{
159    my $self = shift;
160    if (@_) { $self->{fpass_trna_base_ct} = shift; }
161    return $self->{fpass_trna_base_ct};
162}
163
164sub fpos_base_ct
165{
166    my $self = shift;
167    if (@_) { $self->{fpos_base_ct} = shift; }
168    return $self->{fpos_base_ct};
169}
170
171sub increment_fpos_base_ct
172{
173    my $self = shift;
174    if (@_) { $self->{fpos_base_ct} += shift; }
175    else { $self->{fpos_base_ct} += 1;}
176}
177
178sub secpass_base_ct
179{
180    my $self = shift;
181    if (@_) { $self->{secpass_base_ct} = shift; }
182    return $self->{secpass_base_ct};
183}
184
185sub increment_secpass_base_ct
186{
187    my $self = shift;
188    if (@_) { $self->{secpass_base_ct} += shift; }
189    else { $self->{secpass_base_ct} += 1;}
190}
191
192sub coves_base_ct
193{
194    my $self = shift;
195    if (@_) { $self->{coves_base_ct} = shift; }
196    return $self->{coves_base_ct};
197}
198
199sub increment_coves_base_ct
200{
201    my $self = shift;
202    if (@_) { $self->{coves_base_ct} += shift; }
203    else { $self->{coves_base_ct} += 1;}
204}
205
206sub total_secpass_ct
207{
208    my $self = shift;
209    if (@_) { $self->{total_secpass_ct} = shift; }
210    return $self->{total_secpass_ct};
211}
212
213sub increment_total_secpass_ct
214{
215    my $self = shift;
216    if (@_) { $self->{total_secpass_ct} += shift; }
217    else { $self->{total_secpass_ct} += 1;}
218}
219
220sub open_file
221{
222    my $self = shift;
223
224    my $success = 0;
225
226    if ($self->{file_name} ne "")
227    {
228        &open_for_append(\$self->{FILE_H}, $self->{file_name});
229        $success = 1;
230    }
231    else
232    {
233        die "Statistics file name is not set.\n"
234    }
235
236    return $success;
237}
238
239sub close_file
240{
241    my $self = shift;
242
243    if (defined $self->{FILE_H})
244    {
245        close($self->{FILE_H});
246    }
247}
248
249sub write_line
250{
251    my $self = shift;
252    my $line = shift;
253
254    my $fh = $self->{FILE_H};
255
256    print $fh $line . "\n";
257}
258
259sub save_firstpass_stats
260{
261    my $self = shift;
262    my $fh = $self->{FILE_H};
263
264    print $fh "First-pass Stats:\n",
265        "---------------\n";
266    print $fh  "Sequences read:         $self->{numscanned}\n";
267    print $fh  "Seqs w/at least 1 hit:  $self->{seqs_hit}\n";
268    print $fh  "Bases read:             $self->{first_pass_base_ct} (x2 for both strands)\n";
269    print $fh  "Bases in tRNAs:         $self->{fpass_trna_base_ct}\n";
270    print $fh  "tRNAs predicted:        $self->{trnatotal}\n";
271    printf $fh "Av. tRNA length:        %d\n",
272        int($self->{fpass_trna_base_ct} / &max(1, $self->{trnatotal}));
273    printf $fh "Script CPU time:        %.2f s\n",
274        $self->{fp_end_time}->[0] - $self->{fp_start_time}->[0];
275    printf $fh "Scan CPU time:          %.2f s\n",
276        $self->{fp_end_time}->[1] - $self->{fp_start_time}->[1];
277    printf $fh "Scan speed:             %.1f Kbp/sec\n", $self->{first_pass_base_ct}*2/
278         (&max(0.001, $self->{fp_end_time}->[1] - $self->{fp_start_time}->[1]))/1000;
279    print $fh "\nFirst pass search(es) ended: ",`date`,"\n";
280}
281
282sub save_final_stats
283{
284    my $self = shift;
285    my $opts = shift;
286    my $gc = shift;
287    my $prescan_count = shift;
288    my $r_tab_results = shift;
289    my $fh = $self->{FILE_H};
290    my $second_pass_label = $opts->second_pass_label();
291
292    if ($opts->CM_mode() ne "")
293    {
294        print $fh "$second_pass_label Stats:\n-----------\n";
295
296        if ($opts->tscan_mode() || $opts->eufind_mode() || $opts->infernal_fp())
297        {
298            print $fh "Candidate tRNAs read:     ", $prescan_count,"\n";
299        }
300        else
301        {
302            print $fh "Sequences read:           $self->{numscanned}\n";
303        }
304        print $fh  "$second_pass_label","-confirmed tRNAs:     $self->{total_secpass_ct}\n";
305        print $fh  "Bases scanned by $second_pass_label:  $self->{secpass_base_ct}\n";
306        printf $fh "%% seq scanned by $second_pass_label:  %2.1f %%\n",
307            &min(($self->{secpass_base_ct} / &max(1, $self->{first_pass_base_ct} * 2)) * 100,100);
308        printf $fh "Script CPU time:          %2.2f s\n", $self->{sp_end_time}->[0] - $self->{fp_end_time}->[0];
309        printf $fh "$second_pass_label CPU time:            %2.2f s\n", $self->{sp_end_time}->[1] - $self->{fp_end_time}->[1];
310        printf $fh "Scan speed:               %.1f bp/sec\n", $self->{secpass_base_ct}/
311            &max(0.001, $self->{sp_end_time}->[1] - $self->{fp_end_time}->[1]);
312        print $fh "\n$second_pass_label analysis of tRNAs ended: ",`date`,"\n";
313        if ($opts->tscan_mode() || $opts->eufind_mode())
314        {
315            print $fh "Summary\n--------\n";
316        }
317    }
318    my $total_time = ($self->{sp_end_time}->[0] - $self->{fp_start_time}->[0]) +
319        ($self->{sp_end_time}->[1] - $self->{fp_start_time}->[1]);
320    printf $fh "Overall scan speed: %.1f bp/sec\n",
321        &max($self->{first_pass_base_ct} * 2, $self->{secpass_base_ct}) / &max(0.001, $total_time);
322
323    $self->output_summary($opts, $gc, $r_tab_results);
324}
325
326sub output_summary
327{
328    my $self = shift;
329    my $opts = shift;
330    my $gc = shift;
331    my $r_tab_results = shift;
332    my $fh = $self->{FILE_H};
333
334    my ($trna_ct, $selcys_ct, $stop_sup_ct, $undet_ct, $pseudo_ct, $mismatch_ct,
335           $total, $intron_ct, $line);
336    my (%iso_AR, %ac_AR, %intron_ac_AR, %iso_cm_AR);
337    my ($iso, $ac, $acset, $iso_count, $iso_cm_count, $istart, $aa, $iso_cm);
338    my @columns = ();
339    my $note_col = -1;
340    my $iso_cm_col = -1;
341
342    $trna_ct   = 0;
343    $selcys_ct = 0;
344    $pseudo_ct = 0;
345    $mismatch_ct = 0;
346    $undet_ct  = 0;
347    $intron_ct = 0;
348    $stop_sup_ct = 0;
349    $total = 0;
350
351    $line = shift(@$r_tab_results);
352
353    while ($line ne '')
354    {
355        if ($line =~ /^(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+([0-9\,]+)\s+([0-9\,]+)\s+(\S+)/)
356        {
357            @columns = split(/\t/, $line, -1);
358
359            $iso     = $columns[4];
360            $ac      = $columns[5];
361            $istart  = &trim($columns[6]);
362
363            if ($note_col == -1)
364            {
365                $note_col = scalar(@columns) - 1;
366            }
367            if ($iso_cm_col == -1)
368            {
369                if ((($opts->euk_mode() or $opts->bact_mode() or $opts->arch_mode()) and !$opts->no_isotype() and $opts->detail()) or $opts->metagenome_mode())
370                {
371					$iso_cm_col = $note_col - 2;
372				}
373                if ($opts->euk_mode() and $opts->mito_model() ne "")
374                {
375                    $iso_cm_col--;
376                }
377			}
378			if ($iso_cm_col > -1)
379            {
380				$iso_cm = $columns[$iso_cm_col];
381			}
382
383            if ($note_col > -1 and index($columns[$note_col], "pseudo") > -1)
384            {
385                $pseudo_ct++;
386                $iso_AR{"Pseudo"}++;
387            }
388            elsif ($note_col > -1 and index($columns[$note_col], "IPD") > -1)
389            {
390                $mismatch_ct++;
391            }
392            elsif ($iso eq $gc->undef_isotype())
393            {
394                $undet_ct++;
395            }
396            elsif ($iso =~ /SeC/)
397            {
398                $selcys_ct++;
399            }
400            elsif ($iso eq "Sup")
401            {
402                $stop_sup_ct++;
403            }
404            else
405            {
406                $trna_ct++;
407            }
408
409            if ($iso =~ /SeC/)
410            {
411                $iso_AR{"SelCys"}++;
412                $ac_AR{"SelCys"}->{$ac}++;
413            }
414            elsif ($iso eq "Sup")
415            {
416                $iso_AR{"Supres"}++;
417                $ac_AR{"Supres"}->{$ac}++;
418            }
419            else
420            {
421                $iso_AR{$iso}++;
422                $ac_AR{$iso}->{$ac}++;
423            }
424            if (defined $iso_cm)
425            {
426				$iso_cm_AR{$iso_cm}++;
427			}
428
429            if ($istart ne "0")
430            {
431                my @introns = split(/\,/, $istart);
432                $intron_ct += scalar(@introns);
433                my $tag = $iso;
434                if ($iso =~ /SeC/)
435                {
436                    $tag = "SelCys";
437                }
438                elsif ($iso eq "Sup")
439                {
440                    $tag = "Supres";
441                }
442                $intron_ac_AR{$tag}->{$ac} += scalar(@introns);
443            }
444        }
445        $line = shift(@$r_tab_results);
446    }
447
448    $total = $trna_ct + $selcys_ct + $pseudo_ct + $mismatch_ct + $undet_ct + $stop_sup_ct;
449
450
451    print $fh "\n",
452    "tRNAs decoding Standard 20 AA:              $trna_ct\n",
453    "Selenocysteine tRNAs (TCA):                 $selcys_ct\n",
454    "Possible suppressor tRNAs (CTA,TTA,TCA):    $stop_sup_ct\n",
455    "tRNAs with undetermined/unknown isotypes:   $undet_ct\n";
456
457    if ((($opts->euk_mode() or $opts->bact_mode() or $opts->arch_mode()) and !$opts->no_isotype() and $opts->detail()) or $opts->metagenome_mode())
458    {
459        print $fh "tRNAs with mismatch isotypes:               $mismatch_ct\n";
460    }
461
462    print $fh "Predicted pseudogenes:                      $pseudo_ct\n",
463    "                                            -------\n",
464    "Total tRNAs:                                $total\n\n",
465
466    "tRNAs with introns:     \t$intron_ct\n\n";
467
468    foreach $aa (@{$gc->isotypes()})
469    {
470        foreach $acset ($gc->ac_list()->{$aa})
471        {
472            foreach $ac (@$acset)
473            {
474                if (defined($intron_ac_AR{$aa}->{$ac}))
475                {
476                    print $fh "| $aa-$ac: $intron_ac_AR{$aa}->{$ac} ";
477                }
478            }
479        }
480    }
481    if (scalar(keys %intron_ac_AR) > 0)
482    {
483        print $fh "|\n\n";
484    }
485    print $fh "Isotype / Anticodon Counts:\n\n";
486
487    foreach $aa (@{$gc->isotypes()})
488    {
489        my $label = $aa;
490        $iso_count = $iso_AR{$aa} + 0;
491        $iso_cm_count = $iso_cm_AR{$aa} + 0;
492        if ($aa eq "Met")
493        {
494            if (defined $iso_AR{iMet})
495            {
496				$label = "Met/iMet";
497                $iso_count += $iso_AR{iMet};
498			}
499			elsif (defined $iso_AR{fMet})
500            {
501				$label = "Met/fMet";
502                $iso_count += $iso_AR{fMet};
503			}
504            if (defined $iso_cm_AR{iMet})
505            {
506				$iso_cm_count += $iso_cm_AR{iMet};
507			}
508            elsif (defined $iso_cm_AR{fMet})
509            {
510				$iso_cm_count += $iso_cm_AR{fMet};
511			}
512		}
513        elsif ($aa eq "Ile")
514        {
515            if (defined $iso_AR{Ile2})
516            {
517                $iso_count += $iso_AR{Ile2};
518			}
519            if (defined $iso_cm_AR{Ile2})
520            {
521				$iso_cm_count += $iso_cm_AR{Ile2};
522			}
523		}
524
525        if ((($opts->euk_mode() or $opts->bact_mode() or $opts->arch_mode()) and !$opts->no_isotype() and $opts->detail()) or $opts->metagenome_mode())
526        {
527            printf $fh ("%-8s: %d (%d)\t", $label, $iso_count, $iso_cm_count);
528        }
529        else
530        {
531            printf $fh ("%-8s: %d\t", $label, $iso_count);
532        }
533
534        foreach $acset ($gc->ac_list()->{$aa})
535        {
536            foreach $ac (@$acset)
537            {
538                if ($ac eq "&nbsp")
539                {
540                    print $fh "             ";
541                }
542                else
543                {
544                    my $count = $ac_AR{$aa}->{$ac};
545                    if ($aa eq "Met")
546                    {
547						if (defined $ac_AR{iMet}->{$ac})
548                        {
549                            $count += $ac_AR{iMet}->{$ac};
550                        }
551                        elsif (defined $ac_AR{fMet}->{$ac})
552                        {
553                            $count += $ac_AR{fMet}->{$ac};
554                        }
555					}
556					elsif ($aa eq "Ile")
557                    {
558						if (defined $ac_AR{Ile2}->{$ac})
559                        {
560                            $count += $ac_AR{Ile2}->{$ac};
561                        }
562                    }
563                    printf $fh ("%5s: %-6s",$ac,$count);
564                }
565            }
566        }
567
568        print $fh "\n";
569
570    }
571    print $fh "\n";
572}
573
5741;
575