1#!/usr/bin/perl -w 2 3################################################################ 4# copyright (c) 2014 by William R. Pearson and The Rector & 5# Visitors of the University of Virginia */ 6################################################################ 7# Licensed under the Apache License, Version 2.0 (the "License"); 8# you may not use this file except in compliance with the License. 9# You may obtain a copy of the License at 10# 11# http://www.apache.org/licenses/LICENSE-2.0 12# 13# Unless required by applicable law or agreed to in writing, 14# software distributed under this License is distributed on an "AS 15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either 16# express or implied. See the License for the specific language 17# governing permissions and limitations under the License. 18################################################################ 19 20# ann_feats_up_www2.pl gets an annotation file from fasta36 -V with a line of the form: 21 22# SP:GSTM1_HUMAN P09488 218 23# 24# it must: 25# (1) read in the line 26# (2) parse it to get the up_acc 27# (3) return the tab delimited features 28# 29 30# this version can read feature2 uniprot features (acc/pos/end/label/value), but returns sorted start/end domains 31 32use strict; 33 34use Getopt::Long; 35use Pod::Usage; 36use LWP::Simple; 37## use IO::String; 38 39my $up_base = 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/uniprotkb'; 40my $gff_post = "gff2"; 41 42my %domains = (); 43my $domain_cnt = 0; 44 45my $hostname = `/bin/hostname`; 46 47my ($sstr, $lav, $neg_doms, $no_doms, $no_feats, $no_over, $data_file, $shelp, $help) = (0,0,0,0,0,0,0,0,0); 48my ($min_nodom) = (10); 49 50GetOptions( 51 "lav" => \$lav, 52 "no-over" => \$no_over, 53 "no_doms" => \$no_doms, 54 "no-doms" => \$no_doms, 55 "nodoms" => \$no_doms, 56 "neg" => \$neg_doms, 57 "neg_doms" => \$neg_doms, 58 "neg-doms" => \$neg_doms, 59 "negdoms" => \$neg_doms, 60 "min_nodom=i" => \$min_nodom, 61 "no_feats" => \$no_feats, 62 "no-feats" => \$no_feats, 63 "nofeats" => \$no_feats, 64 "data:s" => \$data_file, 65 "sstr" => \$sstr, 66 "h|?" => \$shelp, 67 "help" => \$help, 68 ); 69 70pod2usage(1) if $shelp; 71pod2usage(exitstatus => 0, verbose => 2) if $help; 72pod2usage(1) unless @ARGV || $data_file; 73 74#my @feat_keys = ('Acive site','Modified residue', 'Binding', 'Metal', 'Site'); 75 76my @feat_keys = qw(catalytic_residue posttranslation_modification binding_motif metal_contact 77 polypeptide_region mutated_variant_site natural_variant_site); 78 79my %feats_text = (); 80@feats_text{@feat_keys} = ('Active site', '', 'Substrate binding', 'Metal binding', 'Site', '',''); 81 82my %feats_label; 83@feats_label{@feat_keys} = ('Active site', 'Modified', 'Substrate binding', 'Metal binding', 'Site', '',''); 84 85my @feat_vals = ( '=','*','#','^','@','V','V'); 86 87 88my @dom_keys = qw( polypeptide_domain polypeptide_repeat ); 89my @dom_vals = ( [ '[', ']'],[ '[', ']']); 90 91my @ssr_keys = qw(beta_strand alpha_helix); 92my @ssr_vals = ( [ '[', ']']); 93 94my %annot_types = (); 95 96my $get_annot_sub = \&gff2_annots; 97if ($lav) { 98 $no_feats = 1; 99} 100 101if ($sstr) { 102 @annot_types{@ssr_keys} = @ssr_vals; 103} else { 104 @annot_types{@feat_keys} = @feat_vals unless ($no_feats); 105 @annot_types{@dom_keys} = @dom_vals unless ($no_doms); 106} 107 108if ($neg_doms) { 109 $domains{'NODOM'}=0; 110} 111 112my ($tmp, $gi, $sdb, $acc, $id, $use_acc); 113 114unless ($no_feats || $sstr) { 115 for my $i ( 0 .. $#feat_keys) { 116 next unless $feats_label{$feat_keys[$i]}; 117 print "=",$feat_vals[$i],":",$feats_label{$feat_keys[$i]},"\n"; 118 } 119} 120 121# get the query 122my ($query, $seq_len) = @ARGV; 123$seq_len = 0 unless defined($seq_len); 124 125$query =~ s/^>//; 126 127my $ANN_F; 128 129my @annots = (); 130 131#if it's a file I can open, read and parse it 132 133unless ($data_file) { 134 if ($query !~ m/\|/ && open($ANN_F, $query)) { 135 136 while (my $a_line = <$ANN_F>) { 137 $a_line =~ s/^>//; 138 chomp $a_line; 139 push @annots, lwp_annots($a_line, $get_annot_sub); 140 } 141 } else { 142 push @annots, lwp_annots("$query\t$seq_len", $get_annot_sub); 143 } 144} else { # just read the data from a file, give to $get_annot_sub(). 145 my %annot_data = (seq_info => ">$data_file DATA"); 146 147 open(DATA_IN, $data_file) || die "Cannot read $data_file"; 148 149 my $lwp_data = ""; 150 while (<DATA_IN>) { 151 $lwp_data .= $_; 152 } 153 154 $annot_data{list} = $get_annot_sub->(\%annot_types, $lwp_data,0); 155 156 push @annots, \%annot_data; 157} 158 159 160for my $seq_annot (@annots) { 161 print ">",$seq_annot->{seq_info},"\n"; 162 for my $annot (@{$seq_annot->{list}}) { 163 if (!$lav && defined($domains{$annot->[-1]})) { 164 $annot->[-1] .= " :".$domains{$annot->[-1]}; 165 } 166 print join("\t",@$annot),"\n"; 167 } 168} 169 170exit(0); 171 172sub lwp_annots { 173 my ($query_len, $get_annot_sub) = @_; 174 175 my ($annot_line, $seq_len) = split(/\t/,$query_len); 176 177 my %annot_data = (seq_info=>$annot_line); 178 179 if ($annot_line =~ m/^gi\|/) { 180 ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line); 181 } elsif ($annot_line =~ m/^(SP|TR):(\w+)/) { 182 $sdb = lc($1); 183 $id = $2; 184# $acc = $2; 185 } elsif ($annot_line =~ m/^(UR\d{3}:UniRef\d{2})_(\w+)/) { 186 $sdb = lc($1); 187 $id = $2; 188# $acc = $2; 189 } else { 190 ($sdb, $acc, $id) = split(/\|/,$annot_line); 191 } 192 193 $acc =~ s/\.\d+// if ($acc); 194 195 $annot_data{list} = []; 196 my $lwp_features = ""; 197 198 if ($acc && ($acc =~ m/^[A-Z][0-9][A-Z0-9]{3}[0-9]/)) { 199 $lwp_features = get("$up_base/$acc/$gff_post"); 200 } elsif ($id && ($id =~ m/^\w+$/)) { 201 $lwp_features = get("$up_base/$id/$gff_post"); 202 } 203 204 if ($lwp_features && ($lwp_features !~ /ERROR/)) { 205 $annot_data{list} = $get_annot_sub->(\%annot_types, $lwp_features, $seq_len); 206 } 207 208 return \%annot_data; 209} 210 211# parses www.uniprot.org gff feature table 212sub gff2_annots { 213 my ($annot_types, $annot_data, $seq_len) = @_; 214 215 my ($acc, $pos, $end, $label, $value, $comment, $len); 216 my ($seq_acc, $seq_start, $seq_end, $tmp); 217 218 $seq_len = 0; 219 220 my @feats2 = (); # features with start/stop, for checking overlap, adding negative 221 my @sites = (); # sites with one position 222 223 my @gff_lines = split(/\n/m,$annot_data); 224 225 my $gff_line = shift @gff_lines; # skip ##gff 226 shift @gff_lines; # ##Type Protein 227 shift @gff_lines; # '' 228 $gff_line = shift @gff_lines; 229 ($tmp, $seq_acc, $seq_start, $seq_end) = split(/\s+/,$gff_line); 230 $seq_len = $seq_end if ($seq_end > $seq_len); 231 232 while ($gff_line = shift(@gff_lines)) { 233 next if ($gff_line =~ m/^#/); 234 chomp($gff_line); 235 236 my @gff_line_arr = split(/\t/,$gff_line); 237 ($acc, $label, $pos, $end, $comment) = @gff_line_arr[(0,2,3,4,-1)]; 238 239 # combine different binding sites 240 if ($annot_types->{$label}) { 241 242 my @comments = (); 243 if ($comment =~ m/;/) { 244 @comments = split(/\s*;\s*/,$comment); 245 } else { 246 $comments[0] = $comment; 247 } 248 249 # select comments with 'Note=' 250 @comments = grep {/Note /} @comments; 251 for my $comment ( @comments) { 252 $comment =~ s/^Note\s+//; 253 $comment =~ s/"//g; 254 } 255 256 # select first comment 257 $value = $comments[0]; 258 259 if ($label =~ m/polypeptide_domain/ || $label =~ m/polypeptide_repeat/) { 260 $value = domain_name($label,$value); 261 push @feats2, [$pos, "-", $end, $value]; 262 } elsif ($label =~ m/Helix/) { 263 push @feats2, [$pos, "-", $end, $value]; 264 } elsif ($label =~ m/Beta/) { 265 push @feats2, [$pos, "-", $end, $value]; 266 } elsif ($label =~ m/mutated_variant_site/ || $label =~ m/natural_variant_site/) { 267 next unless $value; 268 my ($mutant) = ($value =~ m/->\s(\w)/); 269 next unless $mutant; 270 my $info = $comments[1]; 271 $info = '' unless $info; 272 if ($label =~ m/mutated_variant_site/) { 273 $info = "Mutant: $info"; 274 } 275 push @sites, [$pos, $annot_types->{$label}, $mutant, $info]; 276 } else { 277 $value = '' unless $value; 278 # print join("\t",($pos, $annot_types->{$label})),"\n"; 279 # print join("\t",($pos, $annot_types->{$label}, "-", "$label: $value")),"\n"; 280 if ($feats_text{$label}) { 281 my $info = $feats_text{$label}; 282 if ($value) { 283 $info .= ": $value"; 284 } 285 push @sites, [$pos, $annot_types->{$label}, "-", $info]; 286 } else { 287 push @sites, [$pos, $annot_types->{$label}, "-", $value]; 288 } 289 } 290 } 291 } 292 293 @feats2 = sort { $a->[0] <=> $b->[0] } @feats2; 294 295 if ($no_over) { 296 # check for containment 297 my $have_contained = 0; 298 my $last_container = 0; 299 for (my $i=1; $i < scalar(@feats2); $i++) { 300 if ($feats2[$i]->[0] >= $feats2[$last_container]->[0] && $feats2[$i]->[2] <= $feats2[$last_container]->[2]) { 301 $feats2[$i]->[1] = 'Delete'; 302 $have_contained = 1; 303 } else { 304 $last_container=$i; 305 } 306 } 307 308 if ($have_contained) { 309 @feats2 = grep { $_->[1] !~ /Delete/ } @feats2; 310 } 311 312 # ensure that domains do not overlap 313 for (my $i=1; $i < scalar(@feats2); $i++) { 314 my $diff = $feats2[$i-1]->[2] - $feats2[$i]->[0]; 315 if ($diff >= 0) { 316 $feats2[$i-1]->[2] = $feats2[$i]->[0]+ int($diff/2); 317 $feats2[$i]->[0] = $feats2[$i-1]->[2] + 1; 318 } 319 } 320 } 321 322 my @n_feats2 = (); 323 324 if ($neg_doms) { 325 my $last_end = 0; 326 for my $feat ( @feats2 ) { 327 if ($feat->[0] - $last_end > $min_nodom) { 328 push @n_feats2, [$last_end+1, "-", $feat->[0]-1, "NODOM"]; 329 } 330 $last_end = $feat->[2]; 331 } 332 if ($seq_len - $last_end > $min_nodom) { 333 push @n_feats2, [$last_end+1, "-", $seq_len, "NODOM"]; 334 } 335 } 336 337 my @feats = (); 338 for my $feat (@feats2, @n_feats2) { 339 if (!$lav) { 340 push @feats, [$feat->[0], '-', $feat->[2], $feat->[-1] ]; 341# push @feats, [$feat->[2], ']', '-', ""]; 342 } 343 else { 344 push @feats, [$feat->[0], $feat->[2], $feat->[-1]]; 345 } 346 } 347 348 @feats = sort { $a->[0] <=> $b->[0] } (@sites, @feats); 349 350 return \@feats; 351} 352 353# domain name takes a uniprot domain label, removes comments ( ; 354# truncated) and numbers and returns a canonical form. Thus: 355# Cortactin 6. 356# Cortactin 7; truncated. 357# becomes "Cortactin" 358# 359 360sub domain_name { 361 362 my ($label, $value) = @_; 363 364 $value = 'UnDef' unless $value; 365 366 if ($label =~ /Domain|Repeat/i) { 367 $value =~ s/;.*$//; 368 $value =~ s/\.\s*$//; 369 $value =~ s/\s+\d+$//; 370 if (!defined($domains{$value})) { 371 $domain_cnt++; 372 $domains{$value} = $domain_cnt; 373 } 374 return $value; 375 } 376 else { 377 return $value; 378 } 379} 380 381 382 383__END__ 384 385=pod 386 387=head1 NAME 388 389ann_feats_up_www2.pl 390 391=head1 SYNOPSIS 392 393 ann_feats_up_www2.pl --no_doms --no_feats --lav 'sp|P09488|GSTM1_NUMAN' | accession.file 394 395=head1 OPTIONS 396 397 -h short help 398 --help include description 399 --no-doms do not show domain boundaries (domains are always shown with --lav) 400 --no-feats do not show feature (variants, active sites, phospho-sites) 401 --lav produce lav2plt.pl annotation format, only show domains/repeats 402 403 --neg-doms, -- report domains between annotated domains as NODOM 404 (also --neg, --neg_doms) 405 --min_nodom=10 -- minimum length between domains for NODOM 406 407 --host, --user, --password, --port --db -- info for mysql database 408 409=head1 DESCRIPTION 410 411C<ann_feats_up_www2.pl> extracts feature, domain, and repeat 412information from the Uniprot DAS server through an XSLT transation 413provided by http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/uniprotkb. 414This server provides GFF descriptions of Uniprot entries, with most of 415the information provided in UniProt feature tables. 416 417C<ann_feats_up_www2.pl> is an alternative to C<ann_pfam.pl> and 418C<ann_pfam.pl> that does not require a local MySQL copy of Pfam. 419 420Given a command line argument that contains a sequence accession 421(P09488), the program looks up the features available for that 422sequence and returns them in a tab-delimited format: 423 424>sp|P09488|GSTM1_HUMAN 4252 [ - GST N-terminal :1 4267 V F Mutagen: Reduces catalytic activity 100- fold. 42723 * - MOD_RES: Phosphotyrosine (By similarity). 42833 * - MOD_RES: Phosphotyrosine (By similarity). 42934 * - MOD_RES: Phosphothreonine (By similarity). 43088 ] - 43190 [ - GST C-terminal :2 432108 V Q Mutagen: Reduces catalytic activity by half. 433108 V S Mutagen: Changes the properties of the enzyme toward some substrates. 434109 V I Mutagen: Reduces catalytic activity by half. 435116 # - BINDING: Substrate. 436116 V A Mutagen: Reduces catalytic activity 10-fold. 437116 V F Mutagen: Slight increase of catalytic activity. 438173 V N in allele GSTM1B; dbSNP:rs1065411. 439208 ] - 440210 V T in dbSNP:rs449856. 441 442If features are provided, then a legend of feature symbols is provided 443as well: 444 445 =*:phosphorylation 446 ==:active site 447 =@:site 448 =^:binding 449 =!:metal binding 450 451If the C<--lav> option is specified, domain and repeat features are 452presented in a different format for the C<lav2plt.pl> program: 453 454 >sp|P09488|GSTM1_HUMAN 455 2 88 GST N-terminal. 456 90 208 GST C-terminal. 457 458C<ann_feats_up_www2.pl> is designed to be used by the B<FASTA> programs with 459the C<-V \!ann_feats_up_www2.pl> option. It can also be used with the lav2plt.pl 460program with the C<--xA "\!ann_feats_up_www2.pl --lav"> or C<--yA "\!ann_feats_up_www2.pl --lav"> options. 461 462=head1 AUTHOR 463 464William R. Pearson, wrp@virginia.edu 465 466=cut 467