1#! @PERL@ -w
2#
3# --------------------------------------------------------------------
4# EukHighConfidenceFilter v1.0
5#
6# Annotate eukaryotic tRNA high confidence set from tRNAscan-SE 2.0 predictions
7#
8# Copyright (C) 2017 Patricia Chan and Todd Lowe
9#
10# Baskin School of Engineering, University of California, Santa Cruz
11# lowe@soe.ucsc.edu
12# http://lowelab.ucsc.edu/
13# --------------------------------------------------------------------
14
15use strict;
16use lib "@libdir@/tRNAscan-SE";
17use Getopt::Long;
18
19use tRNAscanSE::tRNA;
20use tRNAscanSE::ArraytRNA;
21
22our @isotypes = ('Ala', 'Gly', 'Pro', 'Thr', 'Val',
23	     'Ser', 'Arg', 'Leu',
24	     'Phe','Asn', 'Lys', 'Asp', 'Glu', 'His', 'Gln',
25	     'Ile', 'Met', 'Tyr', 'Sup', 'Cys', 'Trp',  'SeC');
26
27our %ac_list = (
28	   'Ala' => [qw/AGC GGC CGC TGC/],
29	   'Gly' => [qw/ACC GCC CCC TCC/],
30	   'Pro' => [qw/AGG GGG CGG TGG/],
31	   'Thr' => [qw/AGT GGT CGT TGT/],
32	   'Val' => [qw/AAC GAC CAC TAC/],
33
34	   'Ser' => [qw/AGA GGA CGA TGA ACT GCT &nbsp &nbsp/],
35	   'Arg' => [qw/ACG GCG CCG TCG &nbsp &nbsp CCT TCT/],
36	   'Leu' => [qw/AAG GAG CAG TAG &nbsp &nbsp CAA TAA/],
37
38	   'Phe' => [qw/AAA GAA &nbsp &nbsp/],
39
40	   'Asn' => [qw/ATT GTT &nbsp &nbsp/],
41	   'Lys' => [qw/&nbsp &nbsp CTT TTT/],
42
43	   'Asp' => [qw/ATC GTC &nbsp &nbsp /],
44	   'Glu' => [qw/&nbsp &nbsp CTC TTC/],
45
46	   'His' => [qw/ATG GTG &nbsp &nbsp /],
47	   'Gln' => [qw/&nbsp &nbsp CTG TTG/],
48
49	   'Tyr' => [qw/ATA GTA &nbsp &nbsp /],
50	   'Sup' => [qw/&nbsp &nbsp CTA TTA/],
51
52	   'Ile' => [qw/AAT GAT &nbsp TAT/],
53	   'Met' => [qw/&nbsp &nbsp CAT &nbsp/],
54
55	   'Cys' => [qw/ACA GCA &nbsp &nbsp /],
56	   'Trp' => [qw/&nbsp &nbsp CCA &nbsp/],
57	   'SeC' => [qw/&nbsp &nbsp &nbsp TCA/]
58           );
59
60our %aa_list = (
61		   'AGC'=>'Ala', 'GGC'=>'Ala', 'CGC'=>'Ala', 'TGC'=>'Ala',
62		   'ACC'=>'Gly', 'GCC'=>'Gly', 'CCC'=>'Gly', 'TCC'=>'Gly',
63		   'AGG'=>'Pro', 'GGG'=>'Pro', 'CGG'=>'Pro', 'TGG'=>'Pro',
64		   'AGT'=>'Thr', 'GGT'=>'Thr', 'CGT'=>'Thr', 'TGT'=>'Thr',
65		   'AAC'=>'Val', 'GAC'=>'Val', 'CAC'=>'Val', 'TAC'=>'Val',
66
67		   'AGA'=>'Ser', 'GGA'=>'Ser', 'CGA'=>'Ser', 'TGA'=>'Ser', 'ACT'=>'Ser', 'GCT'=>'Ser',
68		   'ACG'=>'Arg', 'GCG'=>'Arg', 'CCG'=>'Arg', 'TCG'=>'Arg', 'CCT'=>'Arg', 'TCT'=>'Arg',
69		   'AAG'=>'Leu', 'GAG'=>'Leu', 'CAG'=>'Leu', 'TAG'=>'Leu', 'CAA'=>'Leu', 'TAA'=>'Leu',
70
71		   'AAA'=>'Phe', 'GAA'=>'Phe',
72
73		   'ATT'=>'Asn', 'GTT'=>'Asn',
74		   'CTT'=>'Lys', 'TTT'=>'Lys',
75
76		   'ATC'=>'Asp', 'GTC'=>'Asp',
77		   'CTC'=>'Glu', 'TTC'=>'Glu',
78
79		   'ATG'=>'His', 'GTG'=>'His',
80		   'CTG'=>'Gln', 'TTG'=>'Gln',
81
82		   'ATA'=>'Tyr', 'GTA'=>'Tyr',
83		   'CTA'=>'Sup', 'TTA'=>'Sup',
84
85		   'AAT'=>'Ile', 'GAT'=>'Ile', 'TAT'=>'Ile',
86		   'CAT'=>'Met',
87
88		   'ACA'=>'Cys', 'GCA'=>'Cys',
89		   'CCA'=>'Trp',
90		   'TCA'=>'SeC',
91		   );
92
93
94our %euk_aa_list = (
95		   'AGC'=>'Ala', 'CGC'=>'Ala', 'TGC'=>'Ala',
96		   'GCC'=>'Gly', 'CCC'=>'Gly', 'TCC'=>'Gly',
97		   'AGG'=>'Pro', 'CGG'=>'Pro', 'TGG'=>'Pro',
98		   'AGT'=>'Thr', 'CGT'=>'Thr', 'TGT'=>'Thr',
99		   'AAC'=>'Val', 'CAC'=>'Val', 'TAC'=>'Val',
100
101		   'AGA'=>'Ser', 'CGA'=>'Ser', 'TGA'=>'Ser', 'GCT'=>'Ser',
102		   'ACG'=>'Arg', 'CCG'=>'Arg', 'TCG'=>'Arg', 'CCT'=>'Arg', 'TCT'=>'Arg',
103		   'AAG'=>'Leu', 'CAG'=>'Leu', 'TAG'=>'Leu', 'CAA'=>'Leu', 'TAA'=>'Leu',
104
105		   'GAA'=>'Phe',
106
107		   'GTT'=>'Asn',
108		   'CTT'=>'Lys', 'TTT'=>'Lys',
109
110		   'GTC'=>'Asp',
111		   'CTC'=>'Glu', 'TTC'=>'Glu',
112
113		   'GTG'=>'His',
114		   'CTG'=>'Gln', 'TTG'=>'Gln',
115
116		   'GTA'=>'Tyr',
117		   'CTA'=>'Sup', 'TTA'=>'Sup',
118
119		   'AAT'=>'Ile', 'GAT'=>'Ile', 'TAT'=>'Ile',
120		   'CAT'=>'Met',
121
122		   'GCA'=>'Cys',
123		   'CCA'=>'Trp',
124		   'TCA'=>'SeC',
125		   );
126
127our ($opt_result, $opt_ss, $opt_output, $opt_prefix, $opt_remove, $opt_cmscore1, $opt_ssscore1, $opt_isoscore1,
128	 $opt_isoscore2, $opt_isomaxscore2, $opt_help);
129
130our $ANTICODON_COUNT_CUTOFF = 40;
131
132our %file_names = ();
133our $tRNAs = tRNAscanSE::ArraytRNA->new();
134our %tRNA_counts = ();
135
136&set_options();
137&set_file_names();
138my ($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count) = &filtering();
139&print_results($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count);
140
141exit;
142
143sub set_options
144{
145	$opt_result = "";
146	$opt_ss = "";
147	$opt_output = "";
148	$opt_prefix = "";
149	$opt_remove = 0;
150	$opt_cmscore1 = 50;
151	$opt_ssscore1 = 10;
152	$opt_isoscore1 = 70;
153	$opt_isoscore2 = 70;
154	$opt_isomaxscore2 = 95;
155
156	Getopt::Long::GetOptions("result|i=s", "ss|s=s", "output|o=s", "prefix|p=s", "remove|r",
157							 "cmscore1|c1=f", "ssscore1|m1=f", "isoscore1|e1=f",
158							 "isoscore2|e2=f", "isomaxscore2|x=f", "help|h");
159
160	if ($opt_help || $opt_result eq "" || $opt_ss eq "" || $opt_output eq "" || $opt_prefix eq "" ||
161		$opt_cmscore1 < -1 || $opt_ssscore1 < -1 || $opt_isoscore1 < -1 || $opt_isoscore2 < -1 || $opt_isomaxscore2 < -1)
162	{
163		die "Usage: EukQualityFilter [options]\n",
164			"Options\n",
165			"--result -i <file>         tRNAscan-SE output file used as input\n",
166			"--ss -s <file>             tRNAscan-SE secondary structure file used as input\n",
167			"--output -o <file path>    Directory where output files will be written\n",
168			"--prefix -p <name>         Prefix for output file name\n",
169			"--remove -r                Remove filtered tRNA hits (default: filtered tRNA hits are only tagged)\n",
170			"--cmscore1 -c1 <num>       Domain-specific model score cutoff for secondary filtering (default = 50; -1 if not used for filtering)\n",
171			"--ssscore1 -m1 <num>       Secondary structure score cutoff for secondary filtering (default = 10; -1 if not used for filtering)\n",
172			"--isoscore1 -e1 <num>      Isotype-specific model score cutoff for secondary filtering (default = 70; -1 if not used for filtering)\n",
173			"--isoscore2 -e2 <num>      Isotype-specific model starting score cutoff for tertiary filtering (default = 70; -1 if not used for filtering)\n",
174			"--isomaxscore2 -x <num>    Maximum isotype-specific model score cutoff for tertiary filtering (default = 95)\n",
175			"--help -h                  Print this help\n\n";
176	}
177}
178
179sub set_file_names
180{
181	system("mkdir -p ".$opt_output);
182	$file_names{tRNAscan_out} = $opt_result;
183	$file_names{tRNAscan_ss} = $opt_ss;
184	$file_names{output_tRNAscan_out} = $opt_output."/".$opt_prefix.".out";
185	$file_names{output_tRNAscan_ss} = $opt_output."/".$opt_prefix.".ss";
186	$file_names{log} = $opt_output."/".$opt_prefix.".log";
187}
188
189sub filtering
190{
191	my $sec_pass_filtered_ac = {};
192	my $iso_score_cutoff = {};
193
194	&pseudogene_filter();
195	&secondary_filter();
196	if ($opt_isoscore2 != -1)
197	{
198		($sec_pass_filtered_ac, $iso_score_cutoff) = &tertiary_filter();
199	}
200	$tRNAs->sort_array("tRNAscan_id");
201	my $ac_count = &get_ac_count();
202
203	return ($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count);
204}
205
206sub print_results
207{
208	my ($sec_pass_filtered_ac, $iso_score_cutoff, $ac_count) = @_;
209	&write_out_file($ac_count);
210	&write_ss_file();
211	&write_summary($sec_pass_filtered_ac, $iso_score_cutoff);
212}
213
214sub pseudogene_filter
215{
216	my $line = "";
217	my $tRNA = undef;
218	my %header = ();
219	my ($startpos, $endpos);
220	my @columns = ();
221	my $ct = 0;
222
223	$tRNA_counts{total} = 0;
224	$tRNA_counts{pseudo_filter} = 0;
225
226	print "Status: Filtering pseudogenes\n";
227
228	open(FILE_IN, "$file_names{tRNAscan_out}") or die "Fail to open $file_names{tRNAscan_out}\n";
229
230	while ($line = <FILE_IN>)
231	{
232		$ct++;
233
234		print STDERR "." if ($ct % 1000 == 0);
235		print STDERR "\n" if ($ct % 50000 == 0);
236
237		chomp($line);
238		if ($line =~ /^Name/)
239		{
240			$line =~ s/tRNA #/tRNA#/;
241		}
242		if ($line =~ /^Sequence/)
243		{
244			$line =~ s/Intron Bounds/Intron\tBound/;
245		}
246
247		@columns = split(/\t/, $line, -1);
248		for (my $i = 0; $i < scalar(@columns); $i++)
249		{
250			$columns[$i] = &trim($columns[$i]);
251		}
252
253		if ($columns[0] =~ /^Sequence/ || $columns[0] =~ /^Name/ || $columns[0] =~ /^-----/)
254		{
255			if ($columns[0] =~ /^Sequence/)
256			{
257				for (my $i = 0; $i < scalar(@columns); $i++)
258				{
259					if ($columns[$i] eq "Sequence")
260					{
261						$header{seqname} = $i;
262					}
263					elsif ($columns[$i] eq "Anti")
264					{
265						$header{anticodon} = $i;
266					}
267					elsif ($columns[$i] eq "Intron")
268					{
269						$header{intron_start} = $i;
270						$header{intron_end} = $i+1;
271					}
272					elsif ($columns[$i] eq "Inf")
273					{
274						$header{score} = $i;
275					}
276					elsif ($columns[$i] eq "Cove")
277					{
278						$header{score} = $i;
279					}
280					elsif ($columns[$i] eq "HMM")
281					{
282						$header{hmm_score} = $i;
283					}
284					elsif ($columns[$i] eq "2'Str")
285					{
286						$header{ss_score} = $i;
287					}
288					elsif ($columns[$i] eq "Hit")
289					{
290						$header{hit_origin} = $i;
291					}
292					elsif ($columns[$i] eq "HMM")
293					{
294						$header{hmm_score} = $i;
295					}
296					elsif ($columns[$i] eq "Type")
297					{
298						$header{isotype_type} = $i;
299					}
300				}
301			}
302			elsif ($columns[0] =~ /^Name/)
303			{
304				for (my $i = 0; $i < scalar(@columns); $i++)
305				{
306					if ($columns[$i] eq "tRNA#")
307					{
308						$header{trna_id} = $i;
309					}
310					elsif ($columns[$i] eq "Begin" and !defined $header{start})
311					{
312						$header{start} = $i;
313						$header{end} = $i+1;
314					}
315					elsif ($columns[$i] eq "Type")
316					{
317						$header{isotype} = $i;
318					}
319					elsif ($columns[$i] eq "CM")
320					{
321						$header{isotype_cm} = $i;
322						$header{isotype_score} = $i+1;
323					}
324					elsif ($columns[$i] eq "Note")
325					{
326						$header{note} = $i;
327					}
328					elsif ($columns[$i] eq "Count")
329					{
330						$header{intron_count} = $i;
331					}
332				}
333			}
334		}
335		else
336		{
337			if (!defined $header{isotype_cm})
338			{
339				die "Error: This filter requires isotype-specific model scan result in the tRNAscan-SE v2 output file.\n";
340			}
341			if (!defined $header{note})
342			{
343				die "Error: This filter requires tRNAscan-SE v2 output file.\n";
344			}
345
346			$tRNA_counts{total}++;
347
348			if ($columns[$header{seqname}] eq "chrM" or $columns[$header{seqname}] eq "M" or $columns[$header{seqname}] eq "chrMT" or $columns[$header{seqname}] eq "MT")
349			{
350				$tRNA_counts{total}--;
351				next;
352			}
353			elsif ($columns[$header{note}] =~ /pseudo/)
354			{
355				$tRNA_counts{pseudo_filter}++;
356				next;
357			}
358			else
359			{
360				$tRNA = tRNAscanSE::tRNA->new;
361				$tRNA->seqname($columns[$header{seqname}]);
362				$tRNA->tRNAscan_id($columns[$header{seqname}].".trna".$columns[$header{trna_id}]);
363				$startpos = $columns[$header{start}];
364				$endpos = $columns[$header{end}];
365				if ($startpos < $endpos)
366				{
367					$tRNA->start($startpos);
368					$tRNA->end($endpos);
369					$tRNA->strand("+");
370				}
371				else
372				{
373					$tRNA->end($startpos);
374					$tRNA->start($endpos);
375					$tRNA->strand("-");
376				}
377				$tRNA->category("cyto");
378				$tRNA->isotype($columns[$header{isotype}]);
379				$tRNA->anticodon($columns[$header{anticodon}]);
380				$tRNA->score($columns[$header{score}]);
381				$tRNA->hmm_score($columns[$header{hmm_score}]);
382				$tRNA->ss_score($columns[$header{ss_score}]);
383				$tRNA->tRNAscan_id($tRNA->tRNAscan_id()."-".$tRNA->isotype().$tRNA->anticodon());
384				my $isotype_type = "cytosolic";
385				if (defined $header{isotype_type})
386				{
387					$isotype_type = $columns[$header{isotype_type}];
388				}
389				$tRNA->add_model_hit($isotype_type, $columns[$header{isotype_cm}], $columns[$header{isotype_score}], "");
390				$tRNAs->put($tRNA);
391			}
392		}
393	}
394
395	close(FILE_IN);
396
397	print STDERR "\n";
398}
399
400sub euk_anticodon_filter
401{
402	my ($ac_count, $tRNA) = @_;
403	my $tag = "";
404
405	if (!$tRNA->is_pseudo())
406	{
407		if ($tRNA->isotype() eq "Sup")
408		{
409			$tag = "unexpected anticodon";
410			$tRNA_counts{ac_filter}++;
411		}
412		elsif (!defined $euk_aa_list{$tRNA->anticodon()})
413		{
414			my $alt_anticodon = $tRNA->anticodon();
415			if (substr($tRNA->anticodon(), 0, 1) eq "A")
416			{
417				$alt_anticodon = "G".substr($tRNA->anticodon(), 1);
418			}
419			elsif (substr($tRNA->anticodon(), 0, 1) eq "G")
420			{
421				$alt_anticodon = "A".substr($tRNA->anticodon(), 1);
422			}
423
424			if (defined $aa_list{$tRNA->anticodon()} and defined $aa_list{$alt_anticodon} and defined $euk_aa_list{$alt_anticodon} and
425				$ac_count->{$tRNA->anticodon()} > $ac_count->{$alt_anticodon})
426			{}
427			else
428			{
429				$tag = "unexpected anticodon";
430				$tRNA_counts{ac_filter}++;
431			}
432		}
433	}
434
435	return $tag;
436}
437
438sub secondary_filter
439{
440	print "Status: Secondary filtering\n";
441
442	$tRNA_counts{secondary_filter} = 0;
443	for (my $i = 0; $i < $tRNAs->get_count(); $i++)
444	{
445		my $tRNA = $tRNAs->get($i);
446		my ($type, $model, $iso_score, $iso_ss) = $tRNA->get_highest_score_model();
447		if ($opt_isoscore1 > -1 and !$tRNA->is_pseudo() and ($type eq "mito" or ($type eq "cytosolic" and $iso_score < $opt_isoscore1)))
448		{
449			$tRNA->pseudo(1);
450			$tRNA_counts{secondary_filter}++;
451		}
452		elsif ($opt_cmscore1 > -1 and !$tRNA->is_pseudo() and $tRNA->score() < $opt_cmscore1)
453		{
454			$tRNA->pseudo(1);
455			$tRNA_counts{secondary_filter}++;
456		}
457		elsif ($opt_ssscore1 > -1 and !$tRNA->is_pseudo() and $tRNA->ss_score() < $opt_ssscore1 and $tRNA->isotype() ne "SeC")
458		{
459			$tRNA->pseudo(1);
460			$tRNA_counts{secondary_filter}++;
461		}
462	}
463}
464
465sub get_ac_count
466{
467	my %ac_count = ();
468
469	for my $ac (sort keys %aa_list)
470	{
471		$ac_count{$ac} = 0;
472	}
473
474	for (my $i = 0; $i < $tRNAs->get_count(); $i++)
475	{
476		my $tRNA = $tRNAs->get($i);
477		if ($tRNA->is_cytosolic() and !$tRNA->is_pseudo())
478		{
479			$ac_count{$tRNA->anticodon()} += 1;
480		}
481	}
482
483	return \%ac_count;
484}
485
486sub get_isotype_count
487{
488	my %isotype_count = ();
489	my $iso = "";
490
491	for my $isotype (sort @isotypes)
492	{
493		$isotype_count{$isotype} = 0;
494	}
495
496	for (my $i = 0; $i < $tRNAs->get_count(); $i++)
497	{
498		my $tRNA = $tRNAs->get($i);
499		if ($tRNA->is_cytosolic() and !$tRNA->is_pseudo())
500		{
501			$isotype_count{$tRNA->isotype()} += 1;
502		}
503	}
504
505	return \%isotype_count;
506}
507
508sub tertiary_filter
509{
510	print "Status: Tertiary filtering\n";
511
512	$tRNA_counts{tertiary_filter} = 0;
513	my %sec_pass_filtered_ac = ();
514	my %iso_score_cutoff = ();
515	my $count_pair = [];
516	my %filtering_isotypes = ();
517	my $tRNA_index = 0;
518	my $start_index = -1;
519	my $end_index = -1;
520	my $isotype_tRNAs = tRNAscanSE::ArraytRNA->new();
521	my $trna = undef;
522	my $find = 0;
523
524	my $ac_count = &get_ac_count();
525
526	foreach my $ac (sort {$ac_count->{$b} <=> $ac_count->{$a}} keys %$ac_count)
527	{
528		if ($ac_count->{$ac} > $ANTICODON_COUNT_CUTOFF)
529		{
530			if (!defined $filtering_isotypes{$aa_list{$ac}})
531			{
532				$filtering_isotypes{$aa_list{$ac}} = 1;
533			}
534		}
535		else
536		{
537			last;
538		}
539	}
540
541	$tRNAs->sort_array("isotype");
542
543	foreach my $isotype (sort keys %filtering_isotypes)
544	{
545		$iso_score_cutoff{$isotype} = $opt_isoscore2;
546		$find = 0;
547		$isotype_tRNAs->clear();
548		$start_index = -1;
549		$end_index = -1;
550
551		while (!$find and $tRNA_index < $tRNAs->get_count())
552		{
553			if ($tRNAs->get($tRNA_index)->isotype() lt $isotype)
554			{
555				$tRNA_index++;
556			}
557			elsif ($tRNAs->get($tRNA_index)->isotype() eq $isotype)
558			{
559				if ($start_index == -1)
560				{
561					$start_index = $tRNA_index;
562				}
563				$end_index = $tRNA_index;
564				my $tRNA = $tRNAs->get($tRNA_index);
565				if (!$tRNA->is_pseudo())
566				{
567					my ($type, $model, $iso_score, $iso_ss) = $tRNA->get_highest_score_model();
568					$trna = tRNAscanSE::tRNA->new;
569					$trna->tRNAscan_id($tRNA->tRNAscan_id());
570					$trna->anticodon($tRNA->anticodon());
571					$trna->score($iso_score);
572					$isotype_tRNAs->put($trna);
573				}
574				$tRNA_index++;
575			}
576			else
577			{
578				$find = 1;
579			}
580		}
581
582		if ($find)
583		{
584			foreach my $ac (@{$ac_list{$isotype}})
585			{
586				if ($ac ne "&nbsp")
587				{
588					my $local_ac_count = $ac_count->{$ac};
589					while ($local_ac_count > $ANTICODON_COUNT_CUTOFF and $iso_score_cutoff{$isotype} <= $opt_isomaxscore2)
590					{
591						$local_ac_count = &iso_score_filter($isotype_tRNAs, $ac, $iso_score_cutoff{$isotype}, $local_ac_count);
592						$iso_score_cutoff{$isotype}++;
593					}
594					if ($ac_count->{$ac} > $local_ac_count)
595					{
596						$iso_score_cutoff{$isotype}--;
597					}
598				}
599			}
600
601			foreach my $ac (@{$ac_list{$isotype}})
602			{
603				if ($ac ne "&nbsp")
604				{
605					$count_pair = [];
606					$count_pair->[0] = $ac_count->{$ac};
607					$sec_pass_filtered_ac{$ac} = $count_pair;
608					for (my $i = $start_index; $i <= $end_index; $i++)
609					{
610						my $tRNA = $tRNAs->get($i);
611						my ($type, $model, $iso_score, $iso_ss) = $tRNA->get_highest_score_model();
612						if ($tRNA->isotype() eq $isotype and !$tRNA->is_pseudo())
613						{
614							if ($type eq "mito" or ($type eq "cytosolic" and $iso_score < $iso_score_cutoff{$isotype}))
615							{
616								$tRNA->pseudo(2);
617								$tRNA_counts{tertiary_filter}++;
618							}
619						}
620					}
621				}
622			}
623		}
624	}
625
626	$ac_count = &get_ac_count();
627	foreach my $ac (sort keys %sec_pass_filtered_ac)
628	{
629		$sec_pass_filtered_ac{$ac}->[1] = $ac_count->{$ac};
630	}
631	return (\%sec_pass_filtered_ac, \%iso_score_cutoff);
632}
633
634sub iso_score_filter
635{
636	my ($isotype_tRNAs, $ac, $isotype_score_cutoff, $local_ac_count) = @_;
637
638	my $count = $local_ac_count;
639	for (my $i = 0; $i < $isotype_tRNAs->get_count(); $i++)
640	{
641		my $tRNA = $isotype_tRNAs->get($i);
642		if ($tRNA->anticodon() eq $ac and !$tRNA->is_pseudo())
643		{
644			if ($tRNA->score() < $isotype_score_cutoff)
645			{
646				$tRNA->pseudo(1);
647				$count--;
648			}
649		}
650	}
651	return $count;
652}
653
654sub write_out_file
655{
656	my ($ac_count) = @_;
657	my $line = "";
658	my $tRNA = undef;
659	my %header = ();
660	my @columns = ();
661	my $tRNAscan_id = "";
662	my $index = -1;
663	my $include = 0;
664	my $tag = "";
665	my $ct = 0;
666
667	$tRNA_counts{iso_filter} = 0;
668	$tRNA_counts{undet_filter} = 0;
669	$tRNA_counts{ac_filter} = 0;
670
671	print "Status: Writing output file $file_names{output_tRNAscan_out}\n";
672
673	open(FILE_IN, "$file_names{tRNAscan_out}") or die "Fail to open $file_names{tRNAscan_out}\n";
674	open(FILE_OUT, ">$file_names{output_tRNAscan_out}") or die "Fail to open $file_names{output_tRNAscan_out}\n";
675
676	while ($line = <FILE_IN>)
677	{
678		$ct++;
679
680		print STDERR "." if ($ct % 1000 == 0);
681		print STDERR "\n" if ($ct % 50000 == 0);
682
683		chomp($line);
684		if ($line =~ /^Name/)
685		{
686			$line =~ s/tRNA #/tRNA#/;
687		}
688
689		@columns = split(/\t/, $line, -1);
690		for (my $i = 0; $i < scalar(@columns); $i++)
691		{
692			$columns[$i] = &trim($columns[$i]);
693		}
694
695		if ($columns[0] =~ /^Sequence/ || $columns[0] =~ /^Name/ || $columns[0] =~ /^-----/)
696		{
697			print FILE_OUT $line."\n";
698			if ($columns[0] =~ /^Sequence/)
699			{
700				for (my $i = 0; $i < scalar(@columns); $i++)
701				{
702					if ($columns[$i] eq "Sequence")
703					{
704						$header{seqname} = $i;
705					}
706					elsif ($columns[$i] eq "Anti")
707					{
708						$header{anticodon} = $i;
709					}
710					elsif ($columns[$i] eq "Type")
711					{
712						$header{isotype_type} = $i;
713					}
714				}
715			}
716			elsif ($columns[0] =~ /^Name/)
717			{
718				for (my $i = 0; $i < scalar(@columns); $i++)
719				{
720					if ($columns[$i] eq "tRNA#")
721					{
722						$header{trna_id} = $i;
723					}
724					elsif ($columns[$i] eq "Type")
725					{
726						$header{isotype} = $i;
727					}
728					elsif ($columns[$i] eq "CM")
729					{
730						$header{isotype_cm} = $i;
731						$header{isotype_score} = $i+1;
732					}
733					elsif ($columns[$i] eq "Note")
734					{
735						$header{note} = $i;
736					}
737				}
738			}
739		}
740		else
741		{
742			$include = 0;
743			$tag = "";
744			$tRNAscan_id = $columns[$header{seqname}].".trna".$columns[$header{trna_id}]."-".$columns[$header{isotype}].$columns[$header{anticodon}];
745			$index = $tRNAs->bsearch_id($tRNAscan_id, "tRNAscan_id");
746			if ($index == -1)
747			{
748				if ($columns[$header{seqname}] eq "chrM" or $columns[$header{seqname}] eq "M" or $columns[$header{seqname}] eq "chrMT" or $columns[$header{seqname}] eq "MT")
749				{}
750				elsif (!$opt_remove)
751				{
752					$include = 1;
753				}
754			}
755			else
756			{
757				$tRNA = $tRNAs->get($index);
758				if ($tRNA->is_pseudo() and !$opt_remove)
759				{
760					$include = 1;
761					if ($tRNA->pseudo() == 1)
762					{
763						$tag = "secondary filtered";
764					}
765					elsif ($tRNA->pseudo() == 2)
766					{
767						$tag = "tertiary filtered";
768					}
769				}
770				elsif (!$tRNA->is_pseudo())
771				{
772					$include = 1;
773					if ($columns[$header{note}] =~ /IPD/ and $columns[$header{isotype}] ne "Sup")
774					{
775						$tRNA_counts{iso_filter}++;
776						$tag = "isotype mismatch";
777					}
778					elsif ($columns[$header{isotype}] eq "Undet")
779					{
780						$tRNA_counts{undet_filter}++;
781						$tag = "undetermined isotype";
782					}
783					else
784					{
785						$tag = &euk_anticodon_filter($ac_count, $tRNA);
786						if ($tag eq "")
787						{
788							$tag = "high confidence set";
789						}
790					}
791				}
792			}
793			if ($include)
794			{
795				for (my $i = 0; $i < $header{note}; $i++)
796				{
797					print FILE_OUT $columns[$i]."\t";
798				}
799				if ($columns[$header{note}] ne "")
800				{
801					print FILE_OUT $columns[$header{note}];
802					if ($tag ne "")
803					{
804						print FILE_OUT ",".$tag;
805					}
806				}
807				else
808				{
809					print FILE_OUT $tag;
810				}
811				for (my $i = $header{note} + 1; $i < scalar(@columns); $i++)
812				{
813					print FILE_OUT "\t".$columns[$i];
814				}
815				print FILE_OUT "\n";
816			}
817		}
818	}
819	close(FILE_IN);
820	close(FILE_OUT);
821
822	print STDERR "\n";
823}
824
825sub write_ss_file
826{
827	my $tRNA = undef;
828	my $line = "";
829	my $tRNAscan_id = "";
830	my $seqname = "";
831	my $index = -1;
832	my $print_line = "";
833	my $include = 0;
834	my $tag = "";
835	my $ct = 0;
836
837	print "Status: Writing secondary structure file $file_names{output_tRNAscan_ss}\n";
838
839	open(FILE_IN, "$file_names{tRNAscan_ss}") || die "Error: Fail to open $file_names{tRNAscan_ss}\n";
840	open(FILE_OUT, ">$file_names{output_tRNAscan_ss}") or die "Fail to open $file_names{output_tRNAscan_ss}\n";
841	while ($line = <FILE_IN>)
842	{
843		if ($line =~ /^(\S+)\s+\(\d+\-\d+\)\s+Length:\s\d+\sbp/)
844		{
845			$tRNAscan_id = $1;
846			$seqname = substr($tRNAscan_id, 0, rindex($tRNAscan_id, "."));
847			$print_line = $line;
848			$include = 0;
849			$tag = "";
850
851			$ct++;
852			print STDERR "." if ($ct % 1000 == 0);
853			print STDERR "\n" if ($ct % 50000 == 0);
854		}
855		elsif ($line =~ /^Type:\s(\S+)\s+Anticodon:\s(\S+)\sat\s.+\s\(.+\)\s+Score:\s\S+/)
856		{
857			$tRNAscan_id .= "-".$1.$2;
858			$print_line .= $line;
859
860			$index = $tRNAs->bsearch_id($tRNAscan_id, "tRNAscan_id");
861			if ($index == -1)
862			{
863				if (!$opt_remove)
864				{
865					$include = 1;
866					$tag = "Filtered";
867				}
868			}
869			else
870			{
871				$tRNA = $tRNAs->get($index);
872				if ($tRNA->is_pseudo() and !$opt_remove)
873				{
874					$include = 1;
875					$tag = "Filtered";
876				}
877				elsif (!$tRNA->is_pseudo())
878				{
879					$include = 1;
880				}
881			}
882		}
883		elsif ($line =~ /^HMM Sc=\S+\s+Sec struct Sc=\S+/)
884		{
885			if ($tag ne "")
886			{
887				$print_line .= $tag.":  ".$line;
888			}
889			else
890			{
891				$print_line .= $line;
892			}
893		}
894		elsif (($line =~ /^Possible pseudogene:\s+HMM Sc=\S+\s+Sec struct Sc=\S+/)
895			or ($line =~ /^Possible truncation, pseudogene:\s+HMM Sc=\S+\s+Sec struct Sc=\S+/))
896		{
897			if ($tag ne "")
898			{
899				$print_line .= $tag.", ".$line;
900			}
901			else
902			{
903				$print_line .= $line;
904			}
905		}
906		elsif ($line =~ /^Possible intron: \d+-\d+ \(\d+-\d+\)/)
907		{
908			$print_line .= $line;
909		}
910		elsif (index($line, "     *    |    *    |    *    |") > -1)
911		{
912			$print_line .= $line;
913		}
914		elsif ($line =~ /^Seq:\s\S+$/)
915		{
916			$print_line .= $line;
917		}
918		elsif ($line =~ /^Str:\s\S+$/)
919		{
920			$print_line .= $line;
921			if ($seqname eq "chrM" or$seqname eq "M" or $seqname eq "chrMT" or $seqname eq "MT")
922			{}
923			elsif ($include)
924			{
925				print FILE_OUT $print_line."\n";
926			}
927		}
928	}
929	close(FILE_IN);
930	close(FILE_OUT);
931
932	print STDERR "\n";
933}
934
935sub write_summary
936{
937	my ($sec_pass_filtered_ac, $iso_score_cutoff) = @_;
938
939	print "Status: Writing summary file $file_names{log}\n";
940
941	open(FILE_OUT, ">$file_names{log}") or die "Fail to open $file_names{log}\n";
942
943	print FILE_OUT "EukHighConfidenceFilter v1.0 Summary\n",
944		"Completed Time: ".localtime()."\n\n";
945	print FILE_OUT "Inputs\n",
946		"------------------------------------------------------\n",
947		"tRNAscan-SE output file: ".$file_names{tRNAscan_out}."\n",
948		"tRNAscan-SE ss file: ".$file_names{tRNAscan_ss}."\n";
949	if ($opt_cmscore1 == -1)
950	{
951		print FILE_OUT "Secondary filtering domain-specific model score: Disabled\n";
952	}
953	else
954	{
955		print FILE_OUT "Secondary filtering domain-specific model score cutoff: $opt_cmscore1\n";
956	}
957	if ($opt_ssscore1 == -1)
958	{
959		print FILE_OUT "Secondary filtering secondary structure score: Disabled\n";
960	}
961	else
962	{
963		print FILE_OUT "Secondary filtering secondary structure score cutoff: $opt_ssscore1\n";
964	}
965	if ($opt_isoscore1 == -1)
966	{
967		print FILE_OUT "Secondary filtering isotype-specific model score: Disabled\n";
968	}
969	else
970	{
971		print FILE_OUT "Secondary filtering isotype-specific model score cutoff: $opt_isoscore1\n";
972	}
973	if ($opt_isoscore2 == -1)
974	{
975		print FILE_OUT "Tertiary filtering isotype-specific model score: Disabled\n";
976	}
977	else
978	{
979		print FILE_OUT "Tertiary filtering isotype-specific model starting score cutoff: $opt_isoscore2\n";
980		print FILE_OUT "Tertiary filtering maximum isotype-specific model score cutoff: $opt_isomaxscore2\n";
981	}
982	if ($opt_remove)
983	{
984		print FILE_OUT "Remove filtered hits: Yes\n";
985	}
986	else
987	{
988		print FILE_OUT "Remove filtered hits: No\n";
989	}
990	print FILE_OUT "\n";
991	print FILE_OUT "Outputs\n",
992		"------------------------------------------------------\n",
993		"tRNAscan-SE output file: ".$file_names{output_tRNAscan_out}."\n",
994		"tRNAscan-SE ss file: ".$file_names{output_tRNAscan_ss}."\n",
995		"Summary file: ".$file_names{log}."\n\n";
996
997	my $remaining_hits = $tRNA_counts{total} - $tRNA_counts{pseudo_filter};
998	print FILE_OUT "Summary statistics\n",
999		"------------------------------------------------------\n";
1000	print FILE_OUT "Total tRNA predictions:                                ".$tRNA_counts{total}."\n";
1001	print FILE_OUT "Possible pseudogenes:                                  ".$tRNA_counts{pseudo_filter}."\n";
1002	if ($opt_isoscore1 > -1 or $opt_ssscore1 > -1 or $opt_cmscore1 > -1)
1003	{
1004		print FILE_OUT "tRNAs with scores lower than secondary cutoffs:        ".$tRNA_counts{secondary_filter}."\n";
1005
1006		$remaining_hits -= $tRNA_counts{secondary_filter};
1007	}
1008	if ($opt_isoscore2 > -1)
1009	{
1010		$remaining_hits -= $tRNA_counts{tertiary_filter};
1011		print FILE_OUT "tRNAs failed tertiary filter:                          ".$tRNA_counts{tertiary_filter}."\n\n";
1012	}
1013	print FILE_OUT "tRNAs after filtering:                                 ".$remaining_hits."\n\n";
1014	print FILE_OUT "tRNAs with mismatched isotype:                         ".$tRNA_counts{iso_filter}."\n";
1015	print FILE_OUT "tRNAs with undetermined isotype:                       ".$tRNA_counts{undet_filter}."\n";
1016	print FILE_OUT "tRNAs with unexpected anticodon:                       ".$tRNA_counts{ac_filter}."\n\n";
1017	print FILE_OUT "High confidence set:                                   ".($remaining_hits - $tRNA_counts{iso_filter} - $tRNA_counts{undet_filter} - $tRNA_counts{ac_filter})."\n\n";
1018
1019	if ($opt_isoscore2 > -1 and $tRNA_counts{tertiary_filter} > 0)
1020	{
1021		print FILE_OUT "Tertiary filter - anticodon counts\n",
1022			"------------------------------------------------------\n\n";
1023		print FILE_OUT "Isotype  Anticodon  BeforeFiltering  AfterFiltering  FinalScoreCutoff\n";
1024		foreach my $isotype (sort keys %$iso_score_cutoff)
1025		{
1026			foreach my $ac (@{$ac_list{$isotype}})
1027			{
1028				if ($ac ne "&nbsp" and defined $euk_aa_list{$ac})
1029				{
1030					printf FILE_OUT "%-7s  %-9s  %-15d  %-14d  %-16d\n",
1031						$isotype, $ac, $sec_pass_filtered_ac->{$ac}->[0], $sec_pass_filtered_ac->{$ac}->[1], $iso_score_cutoff->{$isotype};
1032				}
1033			}
1034		}
1035		print FILE_OUT "\n";
1036	}
1037
1038	my $ac_count = &get_ac_count();
1039	my %isotype_count = ();
1040	foreach my $ac (sort keys %aa_list)
1041	{
1042		if (!defined $isotype_count{$aa_list{$ac}})
1043		{
1044			$isotype_count{$aa_list{$ac}} = $ac_count->{$ac};
1045		}
1046		else
1047		{
1048			$isotype_count{$aa_list{$ac}} += $ac_count->{$ac};
1049		}
1050	}
1051
1052	print FILE_OUT "Isotype / Anticodon Counts After Filtering\n",
1053		"------------------------------------------------------\n\n";
1054
1055    foreach my $aa (@isotypes)
1056    {
1057        my $iso_count = 0;
1058        if (defined $isotype_count{$aa})
1059        {
1060            $iso_count = $isotype_count{$aa};
1061        }
1062		if ($aa eq "SeC")
1063		{
1064			printf FILE_OUT ("%-8s: %d\t", "SelCys", $iso_count);
1065		}
1066		elsif ($aa eq "Sup")
1067		{
1068			printf FILE_OUT ("%-8s: %d\t", "Supres", $iso_count);
1069		}
1070		else
1071		{
1072			printf FILE_OUT ("%-8s: %d\t", $aa, $iso_count);
1073		}
1074
1075	    foreach my $ac (@{$ac_list{$aa}})
1076        {
1077			if ($ac eq "&nbsp")
1078			{
1079				print FILE_OUT "             ";
1080			}
1081			else
1082			{
1083				if (defined $ac_count->{$ac})
1084				{
1085					printf FILE_OUT ("%5s: %-6s", $ac, $ac_count->{$ac});
1086				}
1087				else
1088				{
1089					printf FILE_OUT ("%5s: %-6s", $ac, "");
1090				}
1091			}
1092	    }
1093	    print FILE_OUT "\n";
1094	}
1095    print FILE_OUT "\n";
1096
1097	close(FILE_OUT);
1098}
1099
1100sub trim
1101{
1102	my $string = shift;
1103	$string =~ s/^\s+//;
1104	$string =~ s/\s+$//;
1105	return $string;
1106}
1107