1#!/usr/bin/perl -w 2 3################################################################ 4# copyright (c) 2014 by William R. Pearson and The Rector & 5# Visitors of the University of Virginia */ 6################################################################ 7# Licensed under the Apache License, Version 2.0 (the "License"); 8# you may not use this file except in compliance with the License. 9# You may obtain a copy of the License at 10# 11# http://www.apache.org/licenses/LICENSE-2.0 12# 13# Unless required by applicable law or agreed to in writing, 14# software distributed under this License is distributed on an "AS 15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either 16# express or implied. See the License for the specific language 17# governing permissions and limitations under the License. 18################################################################ 19 20# ann_feats.pl gets an annotation file from fasta36 -V with a line of the form: 21 22# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg) 23# 24# it must: 25# (1) read in the line 26# (2) parse it to get the up_acc 27# (3) return the tab delimited features 28# 29 30# this version is designed for various formats of the pdbaa/pdbaa_off NCBI files with the lines: 31# >gi|4388890|pdb|1GTUA|sp|P09488 or 32# >gi|4388890|pdb|1GTU|A 33# if I can find |sp|P09488, I will use that, otherwise I will use 34# |pdb|1GTU|A (concatenated) and a different part of the cath_dom 35# database 36# 37 38use strict; 39 40use DBI; 41use Getopt::Long; 42use Pod::Usage; 43 44use vars qw($host $db $port $user $pass); 45 46my $hostname = `/bin/hostname`; 47 48($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "uniprot", 0, "web_user", "fasta_www"); 49 50my ($neg_doms, $lav, $shelp, $help, $class) = (0, 0, 0, 0, 0); 51my ($min_nodom) = (10); 52 53GetOptions( 54 "host=s" => \$host, 55 "db=s" => \$db, 56 "user=s" => \$user, 57 "password=s" => \$pass, 58 "port=i" => \$port, 59 "lav" => \$lav, 60 "neg" => \$neg_doms, 61 "neg_doms" => \$neg_doms, 62 "neg-doms" => \$neg_doms, 63 "min_nodom=i" => \$min_nodom, 64 "class" => \$class, 65 "h|?" => \$shelp, 66 "help" => \$help, 67 ); 68 69pod2usage(1) if $shelp; 70pod2usage(exitstatus => 0, verbose => 2) if $help; 71pod2usage(1) unless @ARGV; 72 73my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db"; 74$connect .= ";host=$host" if $host; 75$connect .= ";port=$port" if $port; 76 77my $dbh = DBI->connect($connect, 78 $user, 79 $pass 80 ) or die $DBI::errstr; 81 82my %domains = (NODOM=>0); 83my $domain_cnt = 0; 84 85my $get_offsets_pdb = $dbh->prepare(<<EOSQL); 86SELECT res_beg, pdb_beg, pdb_end, sp_beg, sp_end 87FROM pdb_chain_up 88WHERE pdb_acc=? 89ORDER BY res_beg 90EOSQL 91 92my $get_cathdoms_pdb = $dbh->prepare(<<EOSQL); 93SELECT s_start, s_stop, p_start, p_stop, cath_class, s_descr as info 94FROM cath_doms 95JOIN cath_names using(cath_class) 96WHERE pdb_acc=? 97ORDER BY s_start 98EOSQL 99 100my ($tmp, $gi, $sdb, $acc, $id, $use_acc); 101 102# get the query 103my ($query, $seq_len) = @ARGV; 104$seq_len = 1 unless $seq_len; 105 106$query =~ s/^>//; 107 108my $ANN_F; 109 110my @annots = (); 111 112#if it's a file I can open, read and parse it 113if ($query !~ m/\|/ && open($ANN_F, $query)) { 114 115 while (my $a_line = <$ANN_F>) { 116 $a_line =~ s/^>//; 117 chomp $a_line; 118 push @annots, show_annots($lav,$a_line); 119 } 120} 121else { 122 push @annots, show_annots($lav, "$query $seq_len"); 123} 124 125for my $seq_annot (@annots) { 126 print ">",$seq_annot->{seq_info},"\n"; 127 for my $annot (@{$seq_annot->{list}}) { 128 if (!$lav && defined($domains{$annot->[-1]})) { 129 $annot->[-1] .= " :".$domains{$annot->[-1]}; 130 } 131 print join("\t",@$annot),"\n"; 132 } 133} 134 135exit(0); 136 137sub show_annots { 138 my ($lav, $annot_line) = @_; 139 140 my ($query, $seq_len) = split(/\s+/,$annot_line); 141 142 my %annot_data = (seq_info => $query); 143 144 my ($tmp, $gi, $pdb, $pdb_acc, $pdb_id, $pdb_chain, $sdb, $up_acc, $off_flag); 145 146 $off_flag = 0; 147 if ($query =~ m/^gi\|/) { 148 if ($query =~ m/\|sp\|/) { 149 ($tmp, $gi, $pdb, $pdb_acc, $sdb, $up_acc) = split(/\|/,$query); 150 $up_acc =~ s/\.\d+$//; 151 $off_flag = 1; 152 } 153 elsif ($query =~ m/\|pdb\|/) { 154 ($tmp, $gi, $pdb, $pdb_id, $pdb_chain) = split(/\|/,$query); 155 $pdb_acc = $pdb_id . $pdb_chain; 156 } 157 } 158 elsif ($query =~ m/^sp\|/) { 159 ($pdb, $pdb_acc) = split(/\|/,$query); 160 } 161 elsif ($query =~ m/^pdb\|(\w{4})\|(\w)/) { 162 $pdb_acc = $1 . $2; 163 } 164 else { 165 $pdb_acc = $query; 166 } 167 168# only get the first res_beg because it is used to calculate pdbaa_off @c:xxx 169 $get_offsets_pdb->execute($pdb_acc); 170 my ($res_beg, $pdb_beg, $pdb_end, $sp_beg, $sp_end) = $get_offsets_pdb->fetchrow_array(); 171 172 $res_beg = 1 unless defined($res_beg); 173 $pdb_beg = 1 unless defined($pdb_beg); 174 $sp_beg = 1 unless defined($sp_beg); 175 176 if (defined($sp_end) && $sp_end > $seq_len) {$seq_len = $sp_end;} 177 if (defined($pdb_end) && $pdb_end > $seq_len) {$seq_len = $pdb_end;} 178 179 # unless ($seq_len > 1) { 180 # if (defined($sp_end)) { 181 # $seq_len = $sp_end; 182 # } 183 # elsif (defined($pdb_end)) { 184 # $seq_len = $pdb_end; 185 # } 186 # } 187 188 $get_cathdoms_pdb->execute($pdb_acc); 189 $annot_data{list} = get_cath_annots($lav, $get_cathdoms_pdb, $pdb_beg, $seq_len, $off_flag); 190 191 return \%annot_data; 192} 193 194sub get_cath_annots { 195 my ($lav, $get_annots, $sp_offset, $seq_length, $is_offset) = @_; 196 197 my @cath_domains = (); 198 199 # get the list of domains, sorted by start 200 while ( my $row_href = $get_annots->fetchrow_hashref()) { 201 202 # put in logic to subtract sp_offset when necessary 203 if ($is_offset && $row_href->{p_start}) { 204 $row_href->{seq_start} = $row_href->{p_start} - $sp_offset; 205 $row_href->{seq_end} = $row_href->{p_stop} - $sp_offset; 206 } 207 else { 208 $row_href->{seq_start} = $row_href->{s_start} - $sp_offset; 209 $row_href->{seq_end} = $row_href->{s_stop} - $sp_offset; 210 } 211 212 if ($seq_length <= 1) { 213 $seq_length = $row_href->{seq_end}; 214 } 215 else { 216 $row_href->{seq_end} = $seq_length if ($row_href->{seq_end} > $seq_length); 217 } 218 push @cath_domains, $row_href 219 } 220 221 return unless (scalar(@cath_domains)); 222 223 # do a consistency check 224 for (my $i=1; $i < scalar(@cath_domains); $i++) { 225 if ($cath_domains[$i]->{seq_start} <= $cath_domains[$i-1]->{seq_end}) { 226 my $delta = $cath_domains[$i]->{seq_start} - $cath_domains[$i-1]->{seq_end}; 227 $cath_domains[$i-1]->{seq_end} -= $delta/2; 228 $cath_domains[$i]->{seq_start} = $cath_domains[$i-1]->{seq_end}+1; 229 } 230 } 231 232 if ($neg_doms) { 233 my @ncath_domains; 234 my $prev_dom={seq_end=>0}; 235 for my $cur_dom ( @cath_domains) { 236 if ($cur_dom{seq_start} - $prev_dom{seq_end} > $min_nodom) { 237 my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $cur_dom->{seq_start}-1, info=>'NODOM'); 238 push @ncath_domains, \%new_dom; 239 } 240 push @ncath_domains, $cur_dom; 241 $prev_dom = $cur_dom; 242 } 243 my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM'); 244 if ($new_dom{seq_end} > $new_dom{seq_start}) {push @ncath_domains, \%new_dom;} 245 246 @cath_domains = @ncath_domains; 247 } 248 249 for my $cath (@cath_domains) { 250 if ($class && $cath->{cath_class}) {$cath->{info} = $cath->{cath_class};} 251 $cath->{info} = domain_name($cath->{info}); 252 } 253 254 my @feats = (); 255 256 if ($lav) { 257 for my $d_ref (@cath_domains) { 258 push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info} ]; 259 } 260 } 261 else { 262 for my $d_ref (@cath_domains) { 263 push @feats, [$d_ref->{seq_start}, '[', '-', $d_ref->{info} ]; 264 push @feats, [$d_ref->{seq_end}, ']', '-', ""]; 265 } 266 } 267 268 return \@feats; 269 270} 271 272# domain name takes a uniprot domain label, removes comments ( ; 273# truncated) and numbers and returns a canonical form. Thus: 274# Cortactin 6. 275# Cortactin 7; truncated. 276# becomes "Cortactin" 277# 278 279sub domain_name { 280 281 my ($value) = @_; 282 283 if (!defined($domains{$value})) { 284 $domain_cnt++; 285 $domains{$value} = $domain_cnt; 286 } 287 return $value; 288} 289 290__END__ 291 292=pod 293 294=head1 NAME 295 296ann_feats.pl 297 298=head1 SYNOPSIS 299 300 ann_pdb_cath.pl --neg 'sp|P09488|GSTM1_NUMAN' | accession.file 301 302=head1 OPTIONS 303 304 -h short help 305 --help include description 306 307 --lav produce lav2plt.pl annotation format, only show domains/repeats 308 --neg-doms, -- report domains between annotated domains as NODOM 309 (also --neg, --neg_doms) 310 --min_nodom=10 -- minimum length between domains for NODOM 311 312 --host, --user, --password, --port --db -- info for mysql database 313 314=head1 DESCRIPTION 315 316C<ann_pfam26.pl> extracts domain information from the pfam26 msyql 317database. Currently, the program works with database sequence 318descriptions in one of two formats: 319 320 >pf26|649|O94823|AT10B_HUMAN -- RPD2_seqs 321 322(pf26 databases have auto_pfamseq in the second field) and 323 324 >gi|1705556|sp|P54670.1|CAF1_DICDI 325 326C<ann_pfam26.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>, 327and C<pfamA> tables of the C<pfam26> database to extract domain 328information on a protein. For proteins that have multiple domains 329associated with the same overlapping region (domains overlap by more 330than 1/3 of the domain length), C<auto_pfam26.pl> selects the domain 331annotation with the best C<domain_evalue_score>. When domains overlap 332by less than 1/3 of the domain length, they are shortened to remove 333the overlap. 334 335C<ann_pfam26.pl> is designed to be used by the B<FASTA> programs with 336the C<-V \!ann_pfam26.pl> or C<-V "\!ann_pfam26.pl --neg"> option. 337 338=head1 AUTHOR 339 340William R. Pearson, wrp@virginia.edu 341 342=cut 343