1#!/usr/bin/perl -w 2 3################################################################ 4# copyright (c) 2014 by William R. Pearson and The Rector & 5# Visitors of the University of Virginia */ 6################################################################ 7# Licensed under the Apache License, Version 2.0 (the "License"); 8# you may not use this file except in compliance with the License. 9# You may obtain a copy of the License at 10# 11# http://www.apache.org/licenses/LICENSE-2.0 12# 13# Unless required by applicable law or agreed to in writing, 14# software distributed under this License is distributed on an "AS 15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either 16# express or implied. See the License for the specific language 17# governing permissions and limitations under the License. 18################################################################ 19 20# ann_feats_up_www.pl gets an annotation file from fasta36 -V with a line of the form: 21 22# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg) 23# 24# it must: 25# (1) read in the line 26# (2) parse it to get the up_acc 27# (3) return the tab delimited features 28# 29 30# this version can read feature2 uniprot features (acc/pos/end/label/value), but returns sorted start/end domains 31 32use strict; 33 34use Getopt::Long; 35use Pod::Usage; 36use LWP::Simple; 37## use IO::String; 38 39my $up_base = 'http://www.uniprot.org/uniprot'; 40my $gff_post = "gff"; 41 42my %domains = (); 43my $domain_cnt = 0; 44 45my $hostname = `/bin/hostname`; 46 47my ($sstr, $lav, $neg_doms, $no_doms, $no_feats, $shelp, $help) = (0,0,0,0,0,0,0); 48my ($min_nodom) = (10); 49 50GetOptions( 51 "lav" => \$lav, 52 "no_doms" => \$no_doms, 53 "no-doms" => \$no_doms, 54 "nodoms" => \$no_doms, 55 "neg" => \$neg_doms, 56 "neg_doms" => \$neg_doms, 57 "neg-doms" => \$neg_doms, 58 "negdoms" => \$neg_doms, 59 "min_nodom=i" => \$min_nodom, 60 "no_feats" => \$no_feats, 61 "no-feats" => \$no_feats, 62 "nofeats" => \$no_feats, 63 "sstr" => \$sstr, 64 "h|?" => \$shelp, 65 "help" => \$help, 66 ); 67 68pod2usage(1) if $shelp; 69pod2usage(exitstatus => 0, verbose => 2) if $help; 70pod2usage(1) unless @ARGV; 71 72my @feat_keys = ('Active site','Modified residue', 'Binding', 'Metal', 'Site'); 73 74my @feat_vals = ( '=','*','#','^','!'); 75 76my @dom_keys = qw( Domain Repeat ); 77my @dom_vals = ( [ '[', ']'],[ '[', ']']); 78 79my @ssr_keys = ('Beta strand', 'Helix'); 80my @ssr_vals = ( [ '[', ']']); 81 82my %annot_types = (); 83 84my $get_annot_sub = \&gff2_annots; 85if ($lav) { 86 $no_feats = 1; 87 $get_annot_sub = \&gff2_annots; 88} 89 90if ($sstr) {@annot_types{@ssr_keys} = @ssr_vals;} 91else { 92 @annot_types{@feat_keys} = @feat_vals unless ($no_feats); 93 @annot_types{@dom_keys} = @dom_vals unless ($no_doms); 94} 95 96if ($neg_doms) { 97 $domains{'NODOM'}=0; 98} 99 100my ($tmp, $gi, $sdb, $acc, $id, $use_acc); 101 102unless ($no_feats || $sstr) { 103 for my $i ( 0 .. $#feat_keys) { 104 print "=",$feat_vals[$i],":",$feat_keys[$i],"\n"; 105 } 106 # print "=*:phosphorylation\n"; 107 # print "==".":active site\n"; 108 # print "=@".":site\n"; 109 # print "=^:binding\n"; 110 # print "=!:metal binding\n"; 111} 112 113# get the query 114my ($query, $seq_len) = @ARGV; 115$seq_len = 0 unless defined($seq_len); 116 117$query =~ s/^>//; 118 119my $ANN_F; 120 121my @annots = (); 122 123#if it's a file I can open, read and parse it 124if ($query !~ m/\|/ && open($ANN_F, $query)) { 125 126 while (my $a_line = <$ANN_F>) { 127 $a_line =~ s/^>//; 128 chomp $a_line; 129 push @annots, lwp_annots($a_line, $get_annot_sub); 130 } 131} 132else { 133 push @annots, lwp_annots("$query\t$seq_len", $get_annot_sub); 134} 135 136for my $seq_annot (@annots) { 137 print ">",$seq_annot->{seq_info},"\n"; 138 for my $annot (@{$seq_annot->{list}}) { 139 if (!$lav && defined($domains{$annot->[-1]})) { 140 $annot->[-1] .= " :".$domains{$annot->[-1]}; 141 } 142 print join("\t",@$annot),"\n"; 143 } 144} 145 146exit(0); 147 148sub lwp_annots { 149 my ($query_len, $get_annot_sub) = @_; 150 151 my ($annot_line, $seq_len) = split(/\t/,$query_len); 152 153 my %annot_data = (seq_info=>$annot_line); 154 155 if ($annot_line =~ m/^gi\|/) { 156 ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line); 157 } 158 elsif ($annot_line =~ m/^(SP|TR):(\w+)\s(\w+)/) { 159 $sdb = lc($1); 160 $id = $2; 161 $acc = $3; 162 } 163 else { 164 ($sdb, $acc, $id) = split(/\|/,$annot_line); 165 } 166 167 $annot_data{list} = []; 168 my $lwp_features = ""; 169 170 if ($acc && ($acc =~ m/^[A-Z][0-9][A-Z0-9]{3}[0-9]/)) { 171 $lwp_features = get("$up_base/$acc.$gff_post"); 172 } 173 174 if ($lwp_features && ($lwp_features !~ /ERROR/)) { 175 $annot_data{list} = $get_annot_sub->(\%annot_types, $lwp_features, $seq_len); 176 } 177 else { 178 $annot_data{list} = []; 179 } 180 181 return \%annot_data; 182} 183 184# parses www.uniprot.org gff feature table 185sub gff2_annots { 186 my ($annot_types, $annot_data, $seq_len) = @_; 187 188 my ($acc, $pos, $end, $label, $value, $comment, $len); 189 my ($seq_acc, $seq_start, $seq_end, $tmp); 190 191 $seq_len = 0; 192 193 my @feats2 = (); # features with start/stop, for checking overlap, adding negative 194 my @sites = (); # sites with one position 195 196# my $io = IO::String->new($annot_data); 197 198 my @gff_lines = split(/\n/m,$annot_data); 199 200 my $gff_line = shift @gff_lines; # skip ##gff 201 $gff_line = shift @gff_lines; # get sequence-region 202 ($tmp, $seq_acc, $seq_start, $seq_end) = split(/\s+/,$gff_line); 203 $seq_len = $seq_end if ($seq_end > $seq_len); 204 205 while ($gff_line = shift(@gff_lines)) { 206 chomp($gff_line); 207 208 my @gff_line_arr = split(/\t/,$gff_line); 209 ($acc, $label, $pos, $end, $comment) = @gff_line_arr[(0,2,3,4,-1)]; 210 211 # combine different binding sites 212 if ($label =~ /^Metal/) { $label = 'Metal';} 213 elsif ($label =~ /binding/i) { $label = 'Binding';} 214 215 if ($annot_types->{$label}) { 216 217 my @comments = (); 218 if ($comment =~ m/;/) { 219 @comments = split(/;/,$comment); 220 } else { 221 $comments[0] = $comment; 222 } 223 224 # select first comment with 'Note=' 225 ($value) = grep {/Note=/} @comments; 226 if ($value) { 227 $value =~ s/^Note=//; 228 $value =~ s/\%3B//g; 229 } 230 else { $value = "";} 231 232 if ($label =~ m/Domain/ || $label =~ m/Repeat/) { 233 $value = domain_name($label,$value); 234 push @feats2, [$pos, "-", $end, $value]; 235 } elsif ($label =~ m/Helix/) { 236 push @feats2, [$pos, "-", $end, $value]; 237 } elsif ($label =~ m/Beta/) { 238 push @feats2, [$pos, "-", $end, $value]; 239 } 240 else { 241# print join("\t",($pos, $annot_types->{$label})),"\n"; 242# print join("\t",($pos, $annot_types->{$label}, "-", "$label: $value")),"\n"; 243 push @sites, [$pos, $annot_types->{$label}, "-", "$label: $value"]; 244 } 245 } 246 } 247 248 @feats2 = sort { $a->[0] <=> $b->[0] } @feats2; 249 250 # check for containment 251 my $have_contained = 0; 252 my $last_container = 0; 253 for (my $i=1; $i < scalar(@feats2); $i++) { 254 if ($feats2[$i]->[0] >= $feats2[$last_container]->[0] && $feats2[$i]->[2] <= $feats2[$last_container]->[2]) { 255 $feats2[$i]->[1] = 'Delete'; 256 $have_contained = 1; 257 } 258 else { 259 $last_container=$i; 260 } 261 } 262 263 if ($have_contained) { 264 @feats2 = grep { $_->[1] !~ /Delete/ } @feats2; 265 } 266 267 # ensure that domains do not overlap 268 for (my $i=1; $i < scalar(@feats2); $i++) { 269 my $diff = $feats2[$i-1]->[2] - $feats2[$i]->[0]; 270 if ($diff >= 0) { 271 $feats2[$i-1]->[2] = $feats2[$i]->[0]+ int($diff/2); 272 $feats2[$i]->[0] = $feats2[$i-1]->[2] + 1; 273 } 274 } 275 276 my @n_feats2 = (); 277 278 if ($neg_doms) { 279 my $last_end = 0; 280 for my $feat ( @feats2 ) { 281 if ($feat->[0] - $last_end > $min_nodom) { 282 push @n_feats2, [$last_end+1, "-", $feat->[0]-1, "NODOM"]; 283 } 284 $last_end = $feat->[2]; 285 } 286 if ($seq_len - $last_end > $min_nodom) { 287 push @n_feats2, [$last_end+1, "-", $seq_len, "NODOM"]; 288 } 289 } 290 291 my @feats = (); 292 for my $feat (@feats2, @n_feats2) { 293 if (!$lav) { 294 push @feats, [$feat->[0], '[', '-', $feat->[-1] ]; 295 push @feats, [$feat->[2], ']', '-', ""]; 296 } 297 else { 298 push @feats, [$feat->[0], $feat->[2], $feat->[-1]]; 299 } 300 } 301 302 @feats = sort { $a->[0] <=> $b->[0] } (@sites, @feats); 303 304 return \@feats; 305} 306 307sub get_lav_annots { 308 my ($annot_types, $get_annots_sql, $seq_len) = @_; 309 310 my ($pos, $end, $label, $value, $comment); 311 312 my @feats = (); 313 314 my %annot = (); 315 while (($acc, $pos, $end, $label, $value) = $get_annots_sql->fetchrow_array()) { 316 next unless ($label =~ m/^DOMAIN/ || $label =~ m/^REPEAT/); 317 $value = domain_name($label,$value); 318 push @feats, [$pos, $end, $value]; 319 } 320 321 return \@feats; 322} 323 324# domain name takes a uniprot domain label, removes comments ( ; 325# truncated) and numbers and returns a canonical form. Thus: 326# Cortactin 6. 327# Cortactin 7; truncated. 328# becomes "Cortactin" 329# 330 331sub domain_name { 332 333 my ($label, $value) = @_; 334 335 if ($label =~ /Domain|Repeat/i) { 336 $value =~ s/;.*$//; 337 $value =~ s/\.\s*$//; 338 $value =~ s/\s+\d+$//; 339 if (!defined($domains{$value})) { 340 $domain_cnt++; 341 $domains{$value} = $domain_cnt; 342 } 343 return $value; 344 } 345 else { 346 return $value; 347 } 348} 349 350 351 352__END__ 353 354=pod 355 356=head1 NAME 357 358ann_feats_up_www.pl 359 360=head1 SYNOPSIS 361 362 ann_feats_up_www.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_NUMAN' | accession.file 363 364=head1 OPTIONS 365 366 -h short help 367 --help include description 368 --no-doms do not show domain boundaries (domains are always shown with --lav) 369 --no-feats do not show feature (variants, active sites, phospho-sites) 370 --lav produce lav2plt.pl annotation format, only show domains/repeats 371 372 --neg-doms, -- report domains between annotated domains as NODOM 373 (also --neg, --neg_doms) 374 --min_nodom=10 -- minimum length between domains for NODOM 375 376 --host, --user, --password, --port --db -- info for mysql database 377 378=head1 DESCRIPTION 379 380C<ann_feats_up_www.pl> extracts feature, domain, and repeat 381binformation from the Uniprot GFF server at: 382http://www.uniprot.org/uniprot/ACC.gff2 This server provides GFF 383descriptions of Uniprot entries, with much of the information provided 384in UniProt feature tables. Currently, the Uniprot's GFF server used 385by C<ann_feats_up_www.pl> does not provide the changes associated with 386variation and mutagenesis features. Fortunately, the Uniprot DAS 387server does provide this information, which is available using the 388C<ann_feats_up_www2.pl> script. 389 390C<ann_feats_up_www.pl> is an alternative to C<ann_feats2l.pl> and 391C<ann_feats2ipr.pl> that does not require a MySQL database with 392Uniprot Feature table information. 393 394Given a command line argument that contains a sequence accession 395(P09488), the program looks up the features available for that 396sequence and returns them in a tab-delimited format: 397 398>sp|P09488|GSTM1_HUMAN 3992 [ - GST N-terminal :1 4007 V F Mutagen: Reduces catalytic activity 100- fold. 40123 * - MOD_RES: Phosphotyrosine (By similarity). 40233 * - MOD_RES: Phosphotyrosine (By similarity). 40334 * - MOD_RES: Phosphothreonine (By similarity). 40488 ] - 40590 [ - GST C-terminal :2 406108 V Q Mutagen: Reduces catalytic activity by half. 407108 V S Mutagen: Changes the properties of the enzyme toward some substrates. 408109 V I Mutagen: Reduces catalytic activity by half. 409116 # - BINDING: Substrate. 410116 V A Mutagen: Reduces catalytic activity 10-fold. 411116 V F Mutagen: Slight increase of catalytic activity. 412173 V N in allele GSTM1B; dbSNP:rs1065411. 413208 ] - 414210 V T in dbSNP:rs449856. 415 416If features are provided, then a legend of feature symbols is provided 417as well: 418 419 =*:phosphorylation 420 ==:active site 421 =@:site 422 =^:binding 423 =!:metal binding 424 425If the C<--lav> option is specified, domain and repeat features are 426presented in a different format for the C<lav2plt.pl> program: 427 428 >sp|P09488|GSTM1_HUMAN 429 2 88 GST N-terminal. 430 90 208 GST C-terminal. 431 432C<ann_feats_up_www.pl> is designed to be used by the B<FASTA> programs with 433the C<-V \!ann_feats_up_www.pl> option. It can also be used with the lav2plt.pl 434program with the C<--xA "\!ann_feats_up_www.pl --lav"> or C<--yA "\!ann_feats_up_www.pl --lav"> options. 435 436=head1 AUTHOR 437 438William R. Pearson, wrp@virginia.edu 439 440=cut 441