1#!/usr/bin/perl -w
2
3################################################################
4# copyright (c) 2014 by William R. Pearson and The Rector &
5# Visitors of the University of Virginia */
6################################################################
7# Licensed under the Apache License, Version 2.0 (the "License");
8# you may not use this file except in compliance with the License.
9# You may obtain a copy of the License at
10#
11# http://www.apache.org/licenses/LICENSE-2.0
12#
13# Unless required by applicable law or agreed to in writing,
14# software distributed under this License is distributed on an "AS
15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
16# express or implied.  See the License for the specific language
17# governing permissions and limitations under the License.
18################################################################
19
20# ann_feats_up_www2.pl gets an annotation file from fasta36 -V with a line of the form:
21
22# SP:GSTM1_HUMAN P09488 218
23#
24# it must:
25# (1) read in the line
26# (2) parse it to get the up_acc
27# (3) return the tab delimited features
28#
29
30# this version can read feature2 uniprot features (acc/pos/end/label/value), but returns sorted start/end domains
31
32use strict;
33
34use Getopt::Long;
35use Pod::Usage;
36use LWP::Simple;
37## use IO::String;
38
39my $up_base = 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/uniprotkb';
40my $gff_post = "gff2";
41
42my %domains = ();
43my $domain_cnt = 0;
44
45my $hostname = `/bin/hostname`;
46
47my ($sstr, $lav, $neg_doms, $no_doms, $no_feats, $no_over, $data_file, $shelp, $help) = (0,0,0,0,0,0,0,0,0);
48my ($min_nodom) = (10);
49
50GetOptions(
51    "lav" => \$lav,
52    "no-over" => \$no_over,
53	   "no_doms" => \$no_doms,
54	   "no-doms" => \$no_doms,
55	   "nodoms" => \$no_doms,
56	   "neg" => \$neg_doms,
57	   "neg_doms" => \$neg_doms,
58	   "neg-doms" => \$neg_doms,
59	   "negdoms" => \$neg_doms,
60	   "min_nodom=i" => \$min_nodom,
61	   "no_feats" => \$no_feats,
62	   "no-feats" => \$no_feats,
63	   "nofeats" => \$no_feats,
64	   "data:s" => \$data_file,
65	   "sstr" => \$sstr,
66	   "h|?" => \$shelp,
67	   "help" => \$help,
68	  );
69
70pod2usage(1) if $shelp;
71pod2usage(exitstatus => 0, verbose => 2) if $help;
72pod2usage(1) unless @ARGV || $data_file;
73
74#my @feat_keys = ('Acive site','Modified residue', 'Binding', 'Metal', 'Site');
75
76my @feat_keys = qw(catalytic_residue posttranslation_modification binding_motif metal_contact
77		   polypeptide_region mutated_variant_site natural_variant_site);
78
79my %feats_text = ();
80@feats_text{@feat_keys} = ('Active site', '', 'Substrate binding', 'Metal binding', 'Site', '','');
81
82my %feats_label;
83@feats_label{@feat_keys} = ('Active site', 'Modified', 'Substrate binding', 'Metal binding', 'Site', '','');
84
85my @feat_vals = ( '=','*','#','^','@','V','V');
86
87
88my @dom_keys = qw( polypeptide_domain polypeptide_repeat );
89my @dom_vals = ( [ '[', ']'],[ '[', ']']);
90
91my @ssr_keys = qw(beta_strand alpha_helix);
92my @ssr_vals = ( [ '[', ']']);
93
94my %annot_types = ();
95
96my $get_annot_sub = \&gff2_annots;
97if ($lav) {
98  $no_feats = 1;
99}
100
101if ($sstr) {
102  @annot_types{@ssr_keys} = @ssr_vals;
103} else {
104  @annot_types{@feat_keys} = @feat_vals unless ($no_feats);
105  @annot_types{@dom_keys} = @dom_vals unless ($no_doms);
106}
107
108if ($neg_doms) {
109  $domains{'NODOM'}=0;
110}
111
112my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
113
114unless ($no_feats || $sstr) {
115  for my $i ( 0 .. $#feat_keys) {
116    next unless $feats_label{$feat_keys[$i]};
117    print "=",$feat_vals[$i],":",$feats_label{$feat_keys[$i]},"\n";
118  }
119}
120
121# get the query
122my ($query, $seq_len) =  @ARGV;
123$seq_len = 0 unless defined($seq_len);
124
125$query =~ s/^>//;
126
127my $ANN_F;
128
129my @annots = ();
130
131#if it's a file I can open, read and parse it
132
133unless ($data_file) {
134  if ($query !~ m/\|/ && open($ANN_F, $query)) {
135
136    while (my $a_line = <$ANN_F>) {
137      $a_line =~ s/^>//;
138      chomp $a_line;
139      push @annots, lwp_annots($a_line, $get_annot_sub);
140    }
141  } else {
142    push @annots, lwp_annots("$query\t$seq_len", $get_annot_sub);
143  }
144} else {   # just read the data from a file, give to $get_annot_sub().
145  my %annot_data = (seq_info => ">$data_file DATA");
146
147  open(DATA_IN, $data_file) || die "Cannot read $data_file";
148
149  my $lwp_data = "";
150  while (<DATA_IN>) {
151    $lwp_data .= $_;
152  }
153
154  $annot_data{list} = $get_annot_sub->(\%annot_types, $lwp_data,0);
155
156  push @annots, \%annot_data;
157}
158
159
160for my $seq_annot (@annots) {
161  print ">",$seq_annot->{seq_info},"\n";
162  for my $annot (@{$seq_annot->{list}}) {
163    if (!$lav && defined($domains{$annot->[-1]})) {
164      $annot->[-1] .= " :".$domains{$annot->[-1]};
165    }
166    print join("\t",@$annot),"\n";
167  }
168}
169
170exit(0);
171
172sub lwp_annots {
173  my ($query_len, $get_annot_sub) = @_;
174
175  my ($annot_line, $seq_len) = split(/\t/,$query_len);
176
177  my %annot_data = (seq_info=>$annot_line);
178
179  if ($annot_line =~ m/^gi\|/) {
180    ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
181  } elsif ($annot_line =~ m/^(SP|TR):(\w+)/) {
182    $sdb = lc($1);
183    $id = $2;
184#     $acc = $2;
185  } elsif ($annot_line =~ m/^(UR\d{3}:UniRef\d{2})_(\w+)/) {
186    $sdb = lc($1);
187    $id = $2;
188#    $acc = $2;
189  } else {
190    ($sdb, $acc, $id) = split(/\|/,$annot_line);
191  }
192
193  $acc =~ s/\.\d+// if ($acc);
194
195  $annot_data{list} = [];
196  my $lwp_features = "";
197
198  if ($acc && ($acc =~ m/^[A-Z][0-9][A-Z0-9]{3}[0-9]/)) {
199    $lwp_features = get("$up_base/$acc/$gff_post");
200  } elsif ($id && ($id =~ m/^\w+$/)) {
201    $lwp_features = get("$up_base/$id/$gff_post");
202  }
203
204  if ($lwp_features && ($lwp_features !~ /ERROR/)) {
205    $annot_data{list} = $get_annot_sub->(\%annot_types, $lwp_features, $seq_len);
206  }
207
208  return \%annot_data;
209}
210
211# parses www.uniprot.org gff feature table
212sub gff2_annots {
213  my ($annot_types, $annot_data, $seq_len) = @_;
214
215  my ($acc, $pos, $end, $label, $value, $comment, $len);
216  my ($seq_acc, $seq_start, $seq_end, $tmp);
217
218  $seq_len = 0;
219
220  my @feats2 = (); # features with start/stop, for checking overlap, adding negative
221  my @sites = ();  # sites with one position
222
223  my @gff_lines = split(/\n/m,$annot_data);
224
225  my $gff_line = shift @gff_lines; # skip ##gff
226  shift @gff_lines;		   # ##Type Protein
227  shift @gff_lines;		   # ''
228  $gff_line = shift @gff_lines;
229  ($tmp, $seq_acc, $seq_start, $seq_end) = split(/\s+/,$gff_line);
230  $seq_len = $seq_end if ($seq_end > $seq_len);
231
232  while ($gff_line = shift(@gff_lines)) {
233    next if ($gff_line =~ m/^#/);
234    chomp($gff_line);
235
236    my @gff_line_arr = split(/\t/,$gff_line);
237    ($acc, $label, $pos, $end, $comment) = @gff_line_arr[(0,2,3,4,-1)];
238
239    # combine different binding sites
240    if ($annot_types->{$label}) {
241
242      my @comments = ();
243      if ($comment =~ m/;/) {
244	@comments = split(/\s*;\s*/,$comment);
245      } else {
246	$comments[0] = $comment;
247      }
248
249      # select comments with 'Note='
250      @comments = grep {/Note /} @comments;
251      for my $comment ( @comments) {
252	$comment =~ s/^Note\s+//;
253	$comment =~ s/"//g;
254      }
255
256      # select first comment
257      $value = $comments[0];
258
259      if ($label =~ m/polypeptide_domain/ || $label =~ m/polypeptide_repeat/) {
260	$value = domain_name($label,$value);
261	push @feats2, [$pos, "-", $end, $value];
262      } elsif ($label =~ m/Helix/) {
263	push @feats2, [$pos, "-", $end, $value];
264      } elsif ($label =~ m/Beta/) {
265	push @feats2, [$pos, "-", $end, $value];
266      } elsif ($label =~ m/mutated_variant_site/ || $label =~ m/natural_variant_site/) {
267	next unless $value;
268	my ($mutant) = ($value =~ m/->\s(\w)/);
269	next unless $mutant;
270	my $info = $comments[1];
271	$info = '' unless $info;
272	if ($label =~ m/mutated_variant_site/) {
273	  $info = "Mutant: $info";
274	}
275	push @sites, [$pos, $annot_types->{$label}, $mutant, $info];
276      } else {
277	$value = '' unless $value;
278	#	print join("\t",($pos, $annot_types->{$label})),"\n";
279	#	print join("\t",($pos, $annot_types->{$label}, "-", "$label: $value")),"\n";
280	if ($feats_text{$label}) {
281	  my $info = $feats_text{$label};
282	  if ($value) {
283	    $info .= ": $value";
284	  }
285	  push @sites, [$pos, $annot_types->{$label}, "-", $info];
286	} else {
287	  push @sites, [$pos, $annot_types->{$label}, "-", $value];
288	}
289      }
290    }
291  }
292
293  @feats2 = sort { $a->[0] <=> $b->[0] } @feats2;
294
295  if ($no_over) {
296    # check for containment
297    my $have_contained = 0;
298    my $last_container = 0;
299    for (my $i=1; $i < scalar(@feats2); $i++) {
300      if ($feats2[$i]->[0] >= $feats2[$last_container]->[0] && $feats2[$i]->[2] <= $feats2[$last_container]->[2]) {
301	$feats2[$i]->[1] = 'Delete';
302	$have_contained = 1;
303      } else {
304	$last_container=$i;
305      }
306    }
307
308    if ($have_contained) {
309      @feats2 = grep { $_->[1] !~ /Delete/ } @feats2;
310    }
311
312    # ensure that domains do not overlap
313    for (my $i=1; $i < scalar(@feats2); $i++) {
314      my $diff = $feats2[$i-1]->[2] - $feats2[$i]->[0];
315      if ($diff >= 0) {
316	$feats2[$i-1]->[2] = $feats2[$i]->[0]+ int($diff/2);
317	$feats2[$i]->[0] = $feats2[$i-1]->[2] + 1;
318      }
319    }
320  }
321
322  my @n_feats2 = ();
323
324  if ($neg_doms) {
325    my $last_end = 0;
326    for my $feat ( @feats2 ) {
327      if ($feat->[0] - $last_end > $min_nodom) {
328	push @n_feats2, [$last_end+1, "-", $feat->[0]-1, "NODOM"];
329      }
330      $last_end = $feat->[2];
331    }
332    if ($seq_len - $last_end > $min_nodom) {
333      push @n_feats2, [$last_end+1, "-", $seq_len, "NODOM"];
334    }
335  }
336
337  my @feats = ();
338  for my $feat (@feats2, @n_feats2) {
339    if (!$lav)  {
340      push @feats, [$feat->[0], '-', $feat->[2], $feat->[-1] ];
341#      push @feats, [$feat->[2], ']', '-', ""];
342    }
343    else {
344      push @feats, [$feat->[0], $feat->[2], $feat->[-1]];
345    }
346  }
347
348  @feats = sort { $a->[0] <=> $b->[0] } (@sites, @feats);
349
350  return \@feats;
351}
352
353# domain name takes a uniprot domain label, removes comments ( ;
354# truncated) and numbers and returns a canonical form. Thus:
355# Cortactin 6.
356# Cortactin 7; truncated.
357# becomes "Cortactin"
358#
359
360sub domain_name {
361
362  my ($label, $value) = @_;
363
364  $value = 'UnDef' unless $value;
365
366  if ($label =~ /Domain|Repeat/i) {
367    $value =~ s/;.*$//;
368    $value =~ s/\.\s*$//;
369    $value =~ s/\s+\d+$//;
370    if (!defined($domains{$value})) {
371      $domain_cnt++;
372      $domains{$value} = $domain_cnt;
373    }
374    return $value;
375  }
376  else {
377    return $value;
378  }
379}
380
381
382
383__END__
384
385=pod
386
387=head1 NAME
388
389ann_feats_up_www2.pl
390
391=head1 SYNOPSIS
392
393 ann_feats_up_www2.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_NUMAN' | accession.file
394
395=head1 OPTIONS
396
397 -h	short help
398 --help include description
399 --no-doms  do not show domain boundaries (domains are always shown with --lav)
400 --no-feats do not show feature (variants, active sites, phospho-sites)
401 --lav  produce lav2plt.pl annotation format, only show domains/repeats
402
403 --neg-doms,  -- report domains between annotated domains as NODOM
404                 (also --neg, --neg_doms)
405 --min_nodom=10  -- minimum length between domains for NODOM
406
407 --host, --user, --password, --port --db -- info for mysql database
408
409=head1 DESCRIPTION
410
411C<ann_feats_up_www2.pl> extracts feature, domain, and repeat
412information from the Uniprot DAS server through an XSLT transation
413provided by http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/uniprotkb.
414This server provides GFF descriptions of Uniprot entries, with most of
415the information provided in UniProt feature tables.
416
417C<ann_feats_up_www2.pl> is an alternative to C<ann_pfam.pl> and
418C<ann_pfam.pl> that does not require a local MySQL copy of Pfam.
419
420Given a command line argument that contains a sequence accession
421(P09488), the program looks up the features available for that
422sequence and returns them in a tab-delimited format:
423
424>sp|P09488|GSTM1_HUMAN
4252	[	-	GST N-terminal :1
4267	V	F	Mutagen: Reduces catalytic activity 100- fold.
42723	*	-	MOD_RES: Phosphotyrosine (By similarity).
42833	*	-	MOD_RES: Phosphotyrosine (By similarity).
42934	*	-	MOD_RES: Phosphothreonine (By similarity).
43088	]	-
43190	[	-	GST C-terminal :2
432108	V	Q	Mutagen: Reduces catalytic activity by half.
433108	V	S	Mutagen: Changes the properties of the enzyme toward some substrates.
434109	V	I	Mutagen: Reduces catalytic activity by half.
435116	#	-	BINDING: Substrate.
436116	V	A	Mutagen: Reduces catalytic activity 10-fold.
437116	V	F	Mutagen: Slight increase of catalytic activity.
438173	V	N	in allele GSTM1B; dbSNP:rs1065411.
439208	]	-
440210	V	T	in dbSNP:rs449856.
441
442If features are provided, then a legend of feature symbols is provided
443as well:
444
445 =*:phosphorylation
446 ==:active site
447 =@:site
448 =^:binding
449 =!:metal binding
450
451If the C<--lav> option is specified, domain and repeat features are
452presented in a different format for the C<lav2plt.pl> program:
453
454  >sp|P09488|GSTM1_HUMAN
455  2	88	GST N-terminal.
456  90	208	GST C-terminal.
457
458C<ann_feats_up_www2.pl> is designed to be used by the B<FASTA> programs with
459the C<-V \!ann_feats_up_www2.pl> option.  It can also be used with the lav2plt.pl
460program with the C<--xA "\!ann_feats_up_www2.pl --lav"> or C<--yA "\!ann_feats_up_www2.pl --lav"> options.
461
462=head1 AUTHOR
463
464William R. Pearson, wrp@virginia.edu
465
466=cut
467