1#!/usr/bin/perl -w
2
3################################################################
4# copyright (c) 2014 by William R. Pearson and The Rector &
5# Visitors of the University of Virginia */
6################################################################
7# Licensed under the Apache License, Version 2.0 (the "License");
8# you may not use this file except in compliance with the License.
9# You may obtain a copy of the License at
10#
11# http://www.apache.org/licenses/LICENSE-2.0
12#
13# Unless required by applicable law or agreed to in writing,
14# software distributed under this License is distributed on an "AS
15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
16# express or implied.  See the License for the specific language
17# governing permissions and limitations under the License.
18################################################################
19
20# ann_feats_up_www.pl gets an annotation file from fasta36 -V with a line of the form:
21
22# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n  (at least from pir1.lseg)
23#
24# it must:
25# (1) read in the line
26# (2) parse it to get the up_acc
27# (3) return the tab delimited features
28#
29
30# this version can read feature2 uniprot features (acc/pos/end/label/value), but returns sorted start/end domains
31
32use strict;
33
34use Getopt::Long;
35use Pod::Usage;
36use LWP::Simple;
37## use IO::String;
38
39my $up_base = 'http://www.uniprot.org/uniprot';
40my $gff_post = "gff";
41
42my %domains = ();
43my $domain_cnt = 0;
44
45my $hostname = `/bin/hostname`;
46
47my ($sstr, $lav, $neg_doms, $no_doms, $no_feats, $shelp, $help) = (0,0,0,0,0,0,0);
48my ($min_nodom) = (10);
49
50GetOptions(
51    "lav" => \$lav,
52    "no_doms" => \$no_doms,
53    "no-doms" => \$no_doms,
54    "nodoms" => \$no_doms,
55    "neg" => \$neg_doms,
56    "neg_doms" => \$neg_doms,
57    "neg-doms" => \$neg_doms,
58    "negdoms" => \$neg_doms,
59    "min_nodom=i" => \$min_nodom,
60    "no_feats" => \$no_feats,
61    "no-feats" => \$no_feats,
62    "nofeats" => \$no_feats,
63    "sstr" => \$sstr,
64    "h|?" => \$shelp,
65    "help" => \$help,
66    );
67
68pod2usage(1) if $shelp;
69pod2usage(exitstatus => 0, verbose => 2) if $help;
70pod2usage(1) unless @ARGV;
71
72my @feat_keys = ('Active site','Modified residue', 'Binding', 'Metal', 'Site');
73
74my @feat_vals = ( '=','*','#','^','!');
75
76my @dom_keys = qw( Domain Repeat );
77my @dom_vals = ( [ '[', ']'],[ '[', ']']);
78
79my @ssr_keys = ('Beta strand', 'Helix');
80my @ssr_vals = ( [ '[', ']']);
81
82my %annot_types = ();
83
84my $get_annot_sub = \&gff2_annots;
85if ($lav) {
86  $no_feats = 1;
87  $get_annot_sub = \&gff2_annots;
88}
89
90if ($sstr) {@annot_types{@ssr_keys} = @ssr_vals;}
91else {
92  @annot_types{@feat_keys} = @feat_vals unless ($no_feats);
93  @annot_types{@dom_keys} = @dom_vals unless ($no_doms);
94}
95
96if ($neg_doms) {
97  $domains{'NODOM'}=0;
98}
99
100my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
101
102unless ($no_feats || $sstr) {
103  for my $i ( 0 .. $#feat_keys) {
104    print "=",$feat_vals[$i],":",$feat_keys[$i],"\n";
105  }
106  # print "=*:phosphorylation\n";
107  # print "==".":active site\n";
108  # print "=@".":site\n";
109  # print "=^:binding\n";
110  # print "=!:metal binding\n";
111}
112
113# get the query
114my ($query, $seq_len) =  @ARGV;
115$seq_len = 0 unless defined($seq_len);
116
117$query =~ s/^>//;
118
119my $ANN_F;
120
121my @annots = ();
122
123#if it's a file I can open, read and parse it
124if ($query !~ m/\|/ && open($ANN_F, $query)) {
125
126  while (my $a_line = <$ANN_F>) {
127    $a_line =~ s/^>//;
128    chomp $a_line;
129    push @annots, lwp_annots($a_line, $get_annot_sub);
130  }
131}
132else {
133  push @annots, lwp_annots("$query\t$seq_len", $get_annot_sub);
134}
135
136for my $seq_annot (@annots) {
137  print ">",$seq_annot->{seq_info},"\n";
138  for my $annot (@{$seq_annot->{list}}) {
139    if (!$lav && defined($domains{$annot->[-1]})) {
140      $annot->[-1] .= " :".$domains{$annot->[-1]};
141    }
142    print join("\t",@$annot),"\n";
143  }
144}
145
146exit(0);
147
148sub lwp_annots {
149  my ($query_len, $get_annot_sub) = @_;
150
151  my ($annot_line, $seq_len) = split(/\t/,$query_len);
152
153  my %annot_data = (seq_info=>$annot_line);
154
155  if ($annot_line =~ m/^gi\|/) {
156    ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
157  }
158  elsif ($annot_line =~ m/^(SP|TR):(\w+)\s(\w+)/) {
159    $sdb = lc($1);
160    $id = $2;
161    $acc = $3;
162  }
163  else {
164    ($sdb, $acc, $id) = split(/\|/,$annot_line);
165  }
166
167  $annot_data{list} = [];
168  my $lwp_features = "";
169
170  if ($acc && ($acc =~ m/^[A-Z][0-9][A-Z0-9]{3}[0-9]/)) {
171    $lwp_features = get("$up_base/$acc.$gff_post");
172  }
173
174  if ($lwp_features && ($lwp_features !~ /ERROR/)) {
175    $annot_data{list} = $get_annot_sub->(\%annot_types, $lwp_features, $seq_len);
176  }
177  else {
178    $annot_data{list} = [];
179  }
180
181  return \%annot_data;
182}
183
184# parses www.uniprot.org gff feature table
185sub gff2_annots {
186  my ($annot_types, $annot_data, $seq_len) = @_;
187
188  my ($acc, $pos, $end, $label, $value, $comment, $len);
189  my ($seq_acc, $seq_start, $seq_end, $tmp);
190
191  $seq_len = 0;
192
193  my @feats2 = ();	# features with start/stop, for checking overlap, adding negative
194  my @sites = ();	# sites with one position
195
196#  my $io = IO::String->new($annot_data);
197
198  my @gff_lines = split(/\n/m,$annot_data);
199
200  my $gff_line = shift @gff_lines; # skip ##gff
201  $gff_line = shift @gff_lines;	# get sequence-region
202  ($tmp, $seq_acc, $seq_start, $seq_end) = split(/\s+/,$gff_line);
203  $seq_len = $seq_end if ($seq_end > $seq_len);
204
205  while ($gff_line = shift(@gff_lines)) {
206    chomp($gff_line);
207
208    my @gff_line_arr = split(/\t/,$gff_line);
209    ($acc, $label, $pos, $end, $comment) = @gff_line_arr[(0,2,3,4,-1)];
210
211    # combine different binding sites
212    if ($label =~ /^Metal/) { $label = 'Metal';}
213    elsif ($label =~ /binding/i) { $label = 'Binding';}
214
215    if ($annot_types->{$label}) {
216
217      my @comments = ();
218      if ($comment =~ m/;/) {
219	@comments = split(/;/,$comment);
220      } else {
221	$comments[0] = $comment;
222      }
223
224      # select first comment with 'Note='
225      ($value) = grep {/Note=/} @comments;
226      if ($value) {
227	$value =~ s/^Note=//;
228	$value =~ s/\%3B//g;
229      }
230      else { $value = "";}
231
232      if ($label =~ m/Domain/ || $label =~ m/Repeat/) {
233	$value = domain_name($label,$value);
234	push @feats2, [$pos, "-", $end, $value];
235      } elsif ($label =~ m/Helix/) {
236	push @feats2, [$pos, "-", $end, $value];
237      } elsif ($label =~ m/Beta/) {
238	push @feats2, [$pos, "-", $end, $value];
239      }
240      else {
241#	print join("\t",($pos, $annot_types->{$label})),"\n";
242#	print join("\t",($pos, $annot_types->{$label}, "-", "$label: $value")),"\n";
243	push @sites, [$pos, $annot_types->{$label}, "-", "$label: $value"];
244      }
245    }
246  }
247
248  @feats2 = sort { $a->[0] <=> $b->[0] } @feats2;
249
250  # check for containment
251  my $have_contained = 0;
252  my $last_container = 0;
253  for (my $i=1; $i < scalar(@feats2); $i++) {
254    if ($feats2[$i]->[0] >= $feats2[$last_container]->[0] && $feats2[$i]->[2] <= $feats2[$last_container]->[2]) {
255      $feats2[$i]->[1] = 'Delete';
256      $have_contained = 1;
257    }
258    else {
259      $last_container=$i;
260    }
261  }
262
263  if ($have_contained) {
264    @feats2 = grep { $_->[1] !~ /Delete/ } @feats2;
265  }
266
267  # ensure that domains do not overlap
268  for (my $i=1; $i < scalar(@feats2); $i++) {
269    my $diff = $feats2[$i-1]->[2] - $feats2[$i]->[0];
270    if ($diff >= 0) {
271      $feats2[$i-1]->[2] = $feats2[$i]->[0]+ int($diff/2);
272      $feats2[$i]->[0] = $feats2[$i-1]->[2] + 1;
273    }
274  }
275
276  my @n_feats2 = ();
277
278  if ($neg_doms) {
279    my $last_end = 0;
280    for my $feat ( @feats2 ) {
281      if ($feat->[0] - $last_end > $min_nodom) {
282	push @n_feats2, [$last_end+1, "-", $feat->[0]-1, "NODOM"];
283      }
284      $last_end = $feat->[2];
285    }
286    if ($seq_len - $last_end > $min_nodom) {
287      push @n_feats2, [$last_end+1, "-", $seq_len, "NODOM"];
288    }
289  }
290
291  my @feats = ();
292  for my $feat (@feats2, @n_feats2) {
293    if (!$lav) {
294      push @feats, [$feat->[0], '[', '-', $feat->[-1] ];
295      push @feats, [$feat->[2], ']', '-', ""];
296    }
297    else {
298      push @feats, [$feat->[0], $feat->[2], $feat->[-1]];
299    }
300  }
301
302  @feats = sort { $a->[0] <=> $b->[0] } (@sites, @feats);
303
304  return \@feats;
305}
306
307sub get_lav_annots {
308  my ($annot_types, $get_annots_sql, $seq_len) = @_;
309
310  my ($pos, $end, $label, $value, $comment);
311
312  my @feats = ();
313
314  my %annot = ();
315  while (($acc, $pos, $end, $label, $value) = $get_annots_sql->fetchrow_array()) {
316    next unless ($label =~ m/^DOMAIN/ || $label =~ m/^REPEAT/);
317    $value = domain_name($label,$value);
318    push @feats, [$pos, $end, $value];
319  }
320
321  return \@feats;
322}
323
324# domain name takes a uniprot domain label, removes comments ( ;
325# truncated) and numbers and returns a canonical form. Thus:
326# Cortactin 6.
327# Cortactin 7; truncated.
328# becomes "Cortactin"
329#
330
331sub domain_name {
332
333  my ($label, $value) = @_;
334
335  if ($label =~ /Domain|Repeat/i) {
336    $value =~ s/;.*$//;
337    $value =~ s/\.\s*$//;
338    $value =~ s/\s+\d+$//;
339    if (!defined($domains{$value})) {
340      $domain_cnt++;
341      $domains{$value} = $domain_cnt;
342    }
343    return $value;
344  }
345  else {
346    return $value;
347  }
348}
349
350
351
352__END__
353
354=pod
355
356=head1 NAME
357
358ann_feats_up_www.pl
359
360=head1 SYNOPSIS
361
362 ann_feats_up_www.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_NUMAN' | accession.file
363
364=head1 OPTIONS
365
366 -h	short help
367 --help include description
368 --no-doms  do not show domain boundaries (domains are always shown with --lav)
369 --no-feats do not show feature (variants, active sites, phospho-sites)
370 --lav  produce lav2plt.pl annotation format, only show domains/repeats
371
372 --neg-doms,  -- report domains between annotated domains as NODOM
373                 (also --neg, --neg_doms)
374 --min_nodom=10  -- minimum length between domains for NODOM
375
376 --host, --user, --password, --port --db -- info for mysql database
377
378=head1 DESCRIPTION
379
380C<ann_feats_up_www.pl> extracts feature, domain, and repeat
381binformation from the Uniprot GFF server at:
382http://www.uniprot.org/uniprot/ACC.gff2 This server provides GFF
383descriptions of Uniprot entries, with much of the information provided
384in UniProt feature tables.  Currently, the Uniprot's GFF server used
385by C<ann_feats_up_www.pl> does not provide the changes associated with
386variation and mutagenesis features.  Fortunately, the Uniprot DAS
387server does provide this information, which is available using the
388C<ann_feats_up_www2.pl> script.
389
390C<ann_feats_up_www.pl> is an alternative to C<ann_feats2l.pl> and
391C<ann_feats2ipr.pl> that does not require a MySQL database with
392Uniprot Feature table information.
393
394Given a command line argument that contains a sequence accession
395(P09488), the program looks up the features available for that
396sequence and returns them in a tab-delimited format:
397
398>sp|P09488|GSTM1_HUMAN
3992	[	-	GST N-terminal :1
4007	V	F	Mutagen: Reduces catalytic activity 100- fold.
40123	*	-	MOD_RES: Phosphotyrosine (By similarity).
40233	*	-	MOD_RES: Phosphotyrosine (By similarity).
40334	*	-	MOD_RES: Phosphothreonine (By similarity).
40488	]	-
40590	[	-	GST C-terminal :2
406108	V	Q	Mutagen: Reduces catalytic activity by half.
407108	V	S	Mutagen: Changes the properties of the enzyme toward some substrates.
408109	V	I	Mutagen: Reduces catalytic activity by half.
409116	#	-	BINDING: Substrate.
410116	V	A	Mutagen: Reduces catalytic activity 10-fold.
411116	V	F	Mutagen: Slight increase of catalytic activity.
412173	V	N	in allele GSTM1B; dbSNP:rs1065411.
413208	]	-
414210	V	T	in dbSNP:rs449856.
415
416If features are provided, then a legend of feature symbols is provided
417as well:
418
419 =*:phosphorylation
420 ==:active site
421 =@:site
422 =^:binding
423 =!:metal binding
424
425If the C<--lav> option is specified, domain and repeat features are
426presented in a different format for the C<lav2plt.pl> program:
427
428  >sp|P09488|GSTM1_HUMAN
429  2	88	GST N-terminal.
430  90	208	GST C-terminal.
431
432C<ann_feats_up_www.pl> is designed to be used by the B<FASTA> programs with
433the C<-V \!ann_feats_up_www.pl> option.  It can also be used with the lav2plt.pl
434program with the C<--xA "\!ann_feats_up_www.pl --lav"> or C<--yA "\!ann_feats_up_www.pl --lav"> options.
435
436=head1 AUTHOR
437
438William R. Pearson, wrp@virginia.edu
439
440=cut
441