1#!/usr/bin/perl -w
2
3################################################################
4# copyright (c) 2014 by William R. Pearson and The Rector &
5# Visitors of the University of Virginia */
6################################################################
7# Licensed under the Apache License, Version 2.0 (the "License");
8# you may not use this file except in compliance with the License.
9# You may obtain a copy of the License at
10#
11# http://www.apache.org/licenses/LICENSE-2.0
12#
13# Unless required by applicable law or agreed to in writing,
14# software distributed under this License is distributed on an "AS
15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
16# express or implied.  See the License for the specific language
17# governing permissions and limitations under the License.
18################################################################
19
20# ann_feats.pl gets an annotation file from fasta36 -V with a line of the form:
21
22# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n  (at least from pir1.lseg)
23#
24# it must:
25# (1) read in the line
26# (2) parse it to get the up_acc
27# (3) return the tab delimited features
28#
29
30# this version is designed for various formats of the pdbaa/pdbaa_off NCBI files with the lines:
31# >gi|4388890|pdb|1GTUA|sp|P09488  or
32# >gi|4388890|pdb|1GTU|A
33# if I can find |sp|P09488, I will use that, otherwise I will use
34# |pdb|1GTU|A (concatenated) and a different part of the cath_dom
35# database
36#
37
38use strict;
39
40use DBI;
41use Getopt::Long;
42use Pod::Usage;
43
44use vars qw($host $db $port $user $pass);
45
46my $hostname = `/bin/hostname`;
47
48($host, $db, $port, $user, $pass)  = ("wrpxdb.its.virginia.edu", "uniprot", 0, "web_user", "fasta_www");
49
50my ($neg_doms, $lav, $shelp, $help, $class) = (0, 0, 0, 0, 0);
51my ($min_nodom) = (10);
52
53GetOptions(
54    "host=s" => \$host,
55    "db=s" => \$db,
56    "user=s" => \$user,
57    "password=s" => \$pass,
58    "port=i" => \$port,
59    "lav" => \$lav,
60    "neg" => \$neg_doms,
61    "neg_doms" => \$neg_doms,
62    "neg-doms" => \$neg_doms,
63    "min_nodom=i" => \$min_nodom,
64    "class" => \$class,
65    "h|?" => \$shelp,
66    "help" => \$help,
67    );
68
69pod2usage(1) if $shelp;
70pod2usage(exitstatus => 0, verbose => 2) if $help;
71pod2usage(1) unless @ARGV;
72
73my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
74$connect .= ";host=$host" if $host;
75$connect .= ";port=$port" if $port;
76
77my $dbh = DBI->connect($connect,
78		       $user,
79		       $pass
80		      ) or die $DBI::errstr;
81
82my %domains = (NODOM=>0);
83my $domain_cnt = 0;
84
85my $get_offsets_pdb =  $dbh->prepare(<<EOSQL);
86SELECT res_beg, pdb_beg, pdb_end, sp_beg, sp_end
87FROM pdb_chain_up
88WHERE pdb_acc=?
89ORDER BY res_beg
90EOSQL
91
92my $get_cathdoms_pdb = $dbh->prepare(<<EOSQL);
93SELECT s_start, s_stop, p_start, p_stop, cath_class, s_descr as info
94FROM cath_doms
95JOIN cath_names using(cath_class)
96WHERE pdb_acc=?
97ORDER BY s_start
98EOSQL
99
100my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
101
102# get the query
103my ($query, $seq_len) = @ARGV;
104$seq_len = 1 unless $seq_len;
105
106$query =~ s/^>//;
107
108my $ANN_F;
109
110my @annots = ();
111
112#if it's a file I can open, read and parse it
113if ($query !~ m/\|/ && open($ANN_F, $query)) {
114
115  while (my $a_line = <$ANN_F>) {
116    $a_line =~ s/^>//;
117    chomp $a_line;
118    push @annots, show_annots($lav,$a_line);
119  }
120}
121else {
122  push @annots, show_annots($lav, "$query $seq_len");
123}
124
125for my $seq_annot (@annots) {
126  print ">",$seq_annot->{seq_info},"\n";
127  for my $annot (@{$seq_annot->{list}}) {
128    if (!$lav && defined($domains{$annot->[-1]})) {
129      $annot->[-1] .= " :".$domains{$annot->[-1]};
130    }
131    print join("\t",@$annot),"\n";
132  }
133}
134
135exit(0);
136
137sub show_annots {
138  my ($lav, $annot_line) = @_;
139
140  my ($query, $seq_len) = split(/\s+/,$annot_line);
141
142  my %annot_data = (seq_info => $query);
143
144  my ($tmp, $gi, $pdb, $pdb_acc, $pdb_id, $pdb_chain, $sdb, $up_acc, $off_flag);
145
146  $off_flag = 0;
147  if ($query =~ m/^gi\|/) {
148    if ($query =~ m/\|sp\|/) {
149      ($tmp, $gi, $pdb, $pdb_acc, $sdb, $up_acc) = split(/\|/,$query);
150      $up_acc =~ s/\.\d+$//;
151      $off_flag = 1;
152    }
153    elsif ($query =~ m/\|pdb\|/) {
154      ($tmp, $gi, $pdb, $pdb_id, $pdb_chain) = split(/\|/,$query);
155      $pdb_acc = $pdb_id . $pdb_chain;
156    }
157  }
158  elsif ($query =~ m/^sp\|/) {
159    ($pdb, $pdb_acc) = split(/\|/,$query);
160  }
161  elsif  ($query =~ m/^pdb\|(\w{4})\|(\w)/) {
162    $pdb_acc = $1 . $2;
163  }
164  else {
165    $pdb_acc = $query;
166  }
167
168# only get the first res_beg because it is used to calculate pdbaa_off @c:xxx
169  $get_offsets_pdb->execute($pdb_acc);
170  my ($res_beg, $pdb_beg, $pdb_end, $sp_beg, $sp_end) = $get_offsets_pdb->fetchrow_array();
171
172  $res_beg = 1 unless defined($res_beg);
173  $pdb_beg = 1 unless defined($pdb_beg);
174  $sp_beg = 1 unless defined($sp_beg);
175
176  if (defined($sp_end) && $sp_end > $seq_len) {$seq_len = $sp_end;}
177  if (defined($pdb_end) && $pdb_end > $seq_len) {$seq_len = $pdb_end;}
178
179  # unless ($seq_len > 1) {
180  #   if (defined($sp_end)) {
181  #     $seq_len = $sp_end;
182  #   }
183  #   elsif (defined($pdb_end)) {
184  #     $seq_len = $pdb_end;
185  #   }
186  # }
187
188  $get_cathdoms_pdb->execute($pdb_acc);
189  $annot_data{list} = get_cath_annots($lav, $get_cathdoms_pdb, $pdb_beg, $seq_len, $off_flag);
190
191  return \%annot_data;
192}
193
194sub get_cath_annots {
195  my ($lav, $get_annots, $sp_offset, $seq_length, $is_offset) = @_;
196
197  my @cath_domains = ();
198
199  # get the list of domains, sorted by start
200  while ( my $row_href = $get_annots->fetchrow_hashref()) {
201
202    # put in logic to subtract sp_offset when necessary
203    if ($is_offset && $row_href->{p_start}) {
204      $row_href->{seq_start} = $row_href->{p_start} - $sp_offset;
205      $row_href->{seq_end} = $row_href->{p_stop} - $sp_offset;
206    }
207    else {
208      $row_href->{seq_start} = $row_href->{s_start} - $sp_offset;
209      $row_href->{seq_end} = $row_href->{s_stop} - $sp_offset;
210    }
211
212    if ($seq_length <= 1) {
213      $seq_length = $row_href->{seq_end};
214    }
215    else {
216      $row_href->{seq_end} = $seq_length if ($row_href->{seq_end} > $seq_length);
217    }
218    push @cath_domains, $row_href
219  }
220
221  return unless (scalar(@cath_domains));
222
223  # do a consistency check
224  for (my $i=1; $i < scalar(@cath_domains); $i++) {
225    if ($cath_domains[$i]->{seq_start} <= $cath_domains[$i-1]->{seq_end}) {
226      my $delta = $cath_domains[$i]->{seq_start} - $cath_domains[$i-1]->{seq_end};
227      $cath_domains[$i-1]->{seq_end} -= $delta/2;
228      $cath_domains[$i]->{seq_start} = $cath_domains[$i-1]->{seq_end}+1;
229    }
230  }
231
232  if ($neg_doms) {
233    my @ncath_domains;
234    my $prev_dom={seq_end=>0};
235    for my $cur_dom ( @cath_domains) {
236      if ($cur_dom{seq_start} - $prev_dom{seq_end} > $min_nodom) {
237	my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $cur_dom->{seq_start}-1, info=>'NODOM');
238	push @ncath_domains, \%new_dom;
239      }
240      push @ncath_domains, $cur_dom;
241      $prev_dom = $cur_dom;
242    }
243    my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM');
244    if ($new_dom{seq_end} > $new_dom{seq_start}) {push @ncath_domains, \%new_dom;}
245
246    @cath_domains = @ncath_domains;
247  }
248
249  for my $cath (@cath_domains) {
250    if ($class && $cath->{cath_class}) {$cath->{info} = $cath->{cath_class};}
251    $cath->{info} = domain_name($cath->{info});
252  }
253
254  my @feats = ();
255
256  if ($lav) {
257    for my $d_ref (@cath_domains) {
258      push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info} ];
259    }
260  }
261  else {
262    for my $d_ref (@cath_domains) {
263      push @feats, [$d_ref->{seq_start}, '[', '-',  $d_ref->{info} ];
264      push @feats, [$d_ref->{seq_end}, ']', '-', ""];
265    }
266  }
267
268  return \@feats;
269
270}
271
272# domain name takes a uniprot domain label, removes comments ( ;
273# truncated) and numbers and returns a canonical form. Thus:
274# Cortactin 6.
275# Cortactin 7; truncated.
276# becomes "Cortactin"
277#
278
279sub domain_name {
280
281  my ($value) = @_;
282
283  if (!defined($domains{$value})) {
284    $domain_cnt++;
285    $domains{$value} = $domain_cnt;
286  }
287  return $value;
288}
289
290__END__
291
292=pod
293
294=head1 NAME
295
296ann_feats.pl
297
298=head1 SYNOPSIS
299
300 ann_pdb_cath.pl --neg 'sp|P09488|GSTM1_NUMAN' | accession.file
301
302=head1 OPTIONS
303
304 -h	short help
305 --help include description
306
307 --lav  produce lav2plt.pl annotation format, only show domains/repeats
308 --neg-doms,  -- report domains between annotated domains as NODOM
309                 (also --neg, --neg_doms)
310 --min_nodom=10  -- minimum length between domains for NODOM
311
312 --host, --user, --password, --port --db -- info for mysql database
313
314=head1 DESCRIPTION
315
316C<ann_pfam26.pl> extracts domain information from the pfam26 msyql
317database.  Currently, the program works with database sequence
318descriptions in one of two formats:
319
320 >pf26|649|O94823|AT10B_HUMAN -- RPD2_seqs
321
322(pf26 databases have auto_pfamseq in the second field) and
323
324 >gi|1705556|sp|P54670.1|CAF1_DICDI
325
326C<ann_pfam26.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>,
327and C<pfamA> tables of the C<pfam26> database to extract domain
328information on a protein.  For proteins that have multiple domains
329associated with the same overlapping region (domains overlap by more
330than 1/3 of the domain length), C<auto_pfam26.pl> selects the domain
331annotation with the best C<domain_evalue_score>.  When domains overlap
332by less than 1/3 of the domain length, they are shortened to remove
333the overlap.
334
335C<ann_pfam26.pl> is designed to be used by the B<FASTA> programs with
336the C<-V \!ann_pfam26.pl> or C<-V "\!ann_pfam26.pl --neg"> option.
337
338=head1 AUTHOR
339
340William R. Pearson, wrp@virginia.edu
341
342=cut
343