1#!/usr/bin/perl -w
2
3################################################################
4# copyright (c) 2014 by William R. Pearson and The Rector &
5# Visitors of the University of Virginia */
6################################################################
7# Licensed under the Apache License, Version 2.0 (the "License");
8# you may not use this file except in compliance with the License.
9# You may obtain a copy of the License at
10#
11# http://www.apache.org/licenses/LICENSE-2.0
12#
13# Unless required by applicable law or agreed to in writing,
14# software distributed under this License is distributed on an "AS
15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
16# express or implied.  See the License for the specific language
17# governing permissions and limitations under the License.
18################################################################
19
20# ann_pfam_e.pl gets an annotation file from fasta36 -V with a line of the form:
21
22# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n  (at least from pir1.lseg)
23#
24# it must:
25# (1) read in the line
26# (2) parse it to get the up_acc
27# (3) return the tab delimited features
28#
29
30# this version only annotates sequences known to Pfam:pfamseq:
31# >pf26|164|O57809|1A1D_PYRHO
32# and only provides domain information
33
34use strict;
35
36use DBI;
37use Getopt::Long;
38use Pod::Usage;
39
40use vars qw($host $db $port $user $pass);
41
42my $hostname = `/bin/hostname`;
43
44($host, $db, $port, $user, $pass)  = ("wrpxdb.its.virginia.edu", "pfam27", 0, "web_user", "fasta_www");
45$host = 'xdb';
46
47my ($auto_reg,$rpd2_fams, $neg_doms, $lav, $no_doms, $no_clans, $pf_acc, $no_over, $acc_comment, $shelp, $help) =
48  (0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0);
49my ($min_nodom) = (10);
50
51GetOptions(
52    "host=s" => \$host,
53    "db=s" => \$db,
54    "user=s" => \$user,
55    "password=s" => \$pass,
56    "port=i" => \$port,
57    "lav" => \$lav,
58    "acc_comment" => \$acc_comment,
59    "no-over" => \$no_over,
60    "no_over" => \$no_over,
61    "no-clans" => \$no_clans,
62    "no_clans" => \$no_clans,
63    "neg" => \$neg_doms,
64    "neg_doms" => \$neg_doms,
65    "neg-doms" => \$neg_doms,
66    "min_nodom=i" => \$min_nodom,
67    "pfacc" => \$pf_acc,
68    "RPD2" => \$rpd2_fams,
69    "auto_reg" => \$auto_reg,
70    "h|?" => \$shelp,
71    "help" => \$help,
72    );
73
74pod2usage(1) if $shelp;
75pod2usage(exitstatus => 0, verbose => 2) if $help;
76pod2usage(1) unless @ARGV;
77
78my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
79$connect .= ";host=$host" if $host;
80$connect .= ";port=$port" if $port;
81
82my $dbh = DBI->connect($connect,
83		       $user,
84		       $pass
85		      ) or die $DBI::errstr;
86
87my %annot_types = ();
88my %domains = (NODOM=>0);
89my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0});
90my @domain_list = (0);
91my $domain_cnt = 0;
92
93my $get_annot_sub = \&get_pfam_annots;
94
95my $get_pfam_acc = $dbh->prepare(<<EOSQL);
96
97SELECT seq_start, seq_end, auto_pfamA, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
98FROM pfamseq
99JOIN pfamA_reg_full_significant using(auto_pfamseq)
100JOIN pfamA USING (auto_pfamA)
101WHERE in_full = 1
102AND  pfamseq_acc=?
103ORDER BY seq_start
104
105EOSQL
106
107my $get_pfam_refacc = $dbh->prepare(<<EOSQL);
108
109SELECT seq_start, seq_end, auto_pfamA, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
110FROM pfamseq
111JOIN pfamA_reg_full_significant using(auto_pfamseq)
112JOIN pfamA USING (auto_pfamA)
113JOIN seqdb_demo2.annot as sa1 on(sa1.acc=pfamseq_acc and sa1.db='sp')
114JOIN seqdb_demo2.annot as sa2 using(prot_id)
115WHERE in_full = 1
116AND  sa2.acc=?
117AND  sa2.db='ref'
118ORDER BY seq_start
119
120EOSQL
121
122my $get_annots_sql = $get_pfam_acc;
123
124my $get_pfam_id = $dbh->prepare(<<EOSQL);
125
126SELECT seq_start, seq_end, auto_pfamA, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
127FROM pfamseq
128JOIN pfamA_reg_full_significant using(auto_pfamseq)
129JOIN pfamA USING (auto_pfamA)
130WHERE in_full=1
131AND  pfamseq_id=?
132ORDER BY seq_start
133
134EOSQL
135
136my $get_pfam_clan = $dbh->prepare(<<EOSQL);
137
138SELECT clan_acc, clan_id
139FROM clans
140JOIN clan_membership using(auto_clan)
141WHERE auto_pfamA=?
142
143EOSQL
144
145my $get_rpd2_clans = $dbh->prepare(<<EOSQL);
146
147SELECT auto_pfamA, clan
148FROM ljm_db.RPD2_final_fams
149WHERE clan is not NULL
150
151EOSQL
152
153# -- LEFT JOIN clan_membership USING (auto_pfamA)
154# -- LEFT JOIN clans using(auto_clan)
155
156my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
157
158# get the query
159my ($query, $seq_len) = @ARGV;
160$seq_len = 0 unless defined($seq_len);
161
162$query =~ s/^>//;
163
164my $ANN_F;
165
166my @annots = ();
167
168my %rpd2_clan_fams = ();
169
170if ($rpd2_fams) {
171  $get_rpd2_clans->execute();
172  my ($auto_pfam, $auto_clan);
173  while (($auto_pfam, $auto_clan)=$get_rpd2_clans->fetchrow_array()) {
174    $rpd2_clan_fams{$auto_pfam} = $auto_clan;
175  }
176}
177
178#if it's a file I can open, read and parse it
179if ($query !~ m/\|/ && open($ANN_F, $query)) {
180
181  while (my $a_line = <$ANN_F>) {
182    $a_line =~ s/^>//;
183    chomp $a_line;
184    push @annots, show_annots($a_line, $get_annot_sub);
185  }
186}
187else {
188  push @annots, show_annots("$query $seq_len", $get_annot_sub);
189}
190
191for my $seq_annot (@annots) {
192  print ">",$seq_annot->{seq_info},"\n";
193  for my $annot (@{$seq_annot->{list}}) {
194    if (!$lav && defined($domains{$annot->[-1]})) {
195      my ($a_name, $a_num) = ($annot->[-1],$domains{$annot->[-1]});
196      if ($acc_comment) {
197	$annot->[-1] .= "{$domain_list[$a_num]}";
198      }
199      $annot->[-1] .= " :".$a_num;
200    }
201    print join("\t",@$annot),"\n";
202  }
203}
204
205exit(0);
206
207sub show_annots {
208  my ($query_len, $get_annot_sub) = @_;
209
210  my ($annot_line, $seq_len) = split(/\s+/,$query_len);
211
212  my $pfamA_acc;
213
214  my %annot_data = (seq_info=>$annot_line);
215
216  $use_acc = 1;
217  $get_annots_sql = $get_pfam_acc;
218
219  if ($annot_line =~ m/^pf26\|/) {
220    ($sdb, $gi, $acc, $id) = split(/\|/,$annot_line);
221    $dbh->do("use RPD2_pfam");
222  }
223  elsif ($annot_line =~ m/^gi\|/) {
224    ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
225    if ($sdb =~ m/ref/) {
226	$get_annots_sql = $get_pfam_refacc;
227    }
228  }
229  elsif ($annot_line =~ m/^sp\|/) {
230    ($sdb, $acc, $id) = split(/\|/,$annot_line);
231  }
232  elsif ($annot_line =~ m/^ref\|/) {
233    ($sdb, $acc) = split(/\|/,$annot_line);
234    $get_annots_sql = $get_pfam_refacc;
235  }
236  elsif ($annot_line =~ m/^tr\|/) {
237    ($sdb, $acc, $id) = split(/\|/,$annot_line);
238  }
239  elsif ($annot_line =~ m/^SP:/i) {
240    ($sdb, $id) = split(/:/,$annot_line);
241    $use_acc = 0;
242  }
243
244  # remove version number
245  unless ($use_acc) {
246    $get_annots_sql = $get_pfam_id;
247    $get_annots_sql->execute($id);
248  }
249  else {
250    $acc =~ s/\.\d+$//;
251    $get_annots_sql->execute($acc);
252  }
253
254  $annot_data{list} = $get_annot_sub->($get_annots_sql, $seq_len);
255
256  return \%annot_data;
257}
258
259sub get_pfam_annots {
260  my ($get_annots, $seq_length) = @_;
261
262  $seq_length = 0 unless $seq_length;
263
264  my @pf_domains = ();
265
266  # get the list of domains, sorted by start
267  while ( my $row_href = $get_annots->fetchrow_hashref()) {
268    if ($auto_reg) {
269      $row_href->{info} = $row_href->{auto_pfamA_reg_full};
270    }
271    elsif ($pf_acc) {
272      $row_href->{info} = $row_href->{pfamA_acc};
273    }
274    else {
275      $row_href->{info} = $row_href->{pfamA_id};
276    }
277
278    if ($row_href && $row_href->{length} > $seq_length && $seq_length == 0) { $seq_length = $row_href->{length};}
279
280    next if ($row_href->{seq_start} >= $seq_length);
281    if ($row_href->{seq_end} > $seq_length) {
282	$row_href->{seq_end} = $seq_length;
283    }
284
285    push @pf_domains, $row_href
286  }
287
288  # check for domain overlap, and resolve check for domain overlap
289  # (possibly more than 2 domains), choosing the domain with the best
290  # evalue
291
292  if($no_over && scalar(@pf_domains) > 1) {
293
294    my @tmp_domains = @pf_domains;
295    my @save_domains = ();
296
297    my $prev_dom = shift @tmp_domains;
298
299    while (my $curr_dom = shift @tmp_domains) {
300
301      my @overlap_domains = ($prev_dom);
302
303      my $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
304      # check for overlap > domain_length/3
305
306      my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
307      my $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) ||
308		       (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})));
309
310      my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
311
312      while ($inclusion || ($diff > 0 && $diff > $longer_len/3)) {
313	push @overlap_domains, $curr_dom;
314	$curr_dom = shift @tmp_domains;
315	last unless $curr_dom;
316	$diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
317	($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
318	$longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
319	$inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) ||
320		      (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})));
321      }
322
323      # check for overlapping domains; >1 because $prev_dom is always there
324      if (scalar(@overlap_domains) > 1 ) {
325	# if $rpd2_fams, check for a chosen one
326	if ($rpd2_fams) {
327	  for my $dom (@overlap_domains) {
328	    if ($rpd2_clan_fams{$dom->{auto_pfamA}}) {
329	      $prev_dom = $dom;
330	      last;
331	    }
332	  }
333	}
334	else {
335	  @overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains;
336	  $prev_dom = $overlap_domains[0];
337	}
338      }
339
340      # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3
341      push @save_domains, $prev_dom;
342      $prev_dom = $curr_dom;
343    }
344    if ($prev_dom) {push @save_domains, $prev_dom;}
345
346    @pf_domains = @save_domains;
347
348    # now check for smaller overlaps
349    for (my $i=1; $i < scalar(@pf_domains); $i++) {
350      if ($pf_domains[$i-1]->{seq_end} >= $pf_domains[$i]->{seq_start}) {
351	my $overlap = $pf_domains[$i-1]->{seq_end} - $pf_domains[$i]->{seq_start};
352	$pf_domains[$i-1]->{seq_end} -= int($overlap/2);
353	$pf_domains[$i]->{seq_start} = $pf_domains[$i-1]->{seq_end}+1;
354      }
355    }
356  }
357
358  if ($neg_doms) {
359    my @npf_domains;
360    my $prev_dom={seq_end=>0};
361    for my $curr_dom ( @pf_domains) {
362      if ($curr_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) {
363	my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $curr_dom->{seq_start}-1, info=>'NODOM');
364	push @npf_domains, \%new_dom;
365      }
366      push @npf_domains, $curr_dom;
367      $prev_dom = $curr_dom;
368    }
369    if ($seq_length - $prev_dom->{seq_end} > $min_nodom) {
370      my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM');
371      if ($new_dom{seq_end} > $new_dom{seq_start}) {push @npf_domains, \%new_dom;}
372    }
373
374    # @npf_domains has both old @pf_domains and new neg-domains
375    @pf_domains = @npf_domains;
376  }
377
378  # now make sure we have useful names: colors
379
380  for my $pf (@pf_domains) {
381    $pf->{info} = domain_name($pf->{info}, $pf->{auto_pfamA}, $pf->{pfamA_acc});
382  }
383
384  my @feats = ();
385  for my $d_ref (@pf_domains) {
386    if ($lav) {
387      push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info}];
388    }
389    else {
390      push @feats, [$d_ref->{seq_start}, '-', $d_ref->{seq_end},  $d_ref->{info} ];
391#      push @feats, [$d_ref->{seq_end}, ']', '-', ""];
392    }
393
394  }
395
396  return \@feats;
397}
398
399# domain name takes a uniprot domain label, removes comments ( ;
400# truncated) and numbers and returns a canonical form. Thus:
401# Cortactin 6.
402# Cortactin 7; truncated.
403# becomes "Cortactin"
404#
405
406sub domain_name {
407
408  my ($value, $auto_pfamA, $pfamA_acc) = @_;
409
410  # check for clan:
411  if ($no_clans) {
412    if (! defined($domains{$value})) {
413      $domain_clan{$value} = 0;
414      $domains{$value} = ++$domain_cnt;
415      push @domain_list, $pfamA_acc;
416    }
417  }
418  elsif (!defined($domain_clan{$value})) {
419    ## only do this for new domains, old domains have known mappings
420
421    ## ways to highlight the same domain:
422    # (1) for clans, substitute clan name for family name
423    # (2) for clans, use the same color for the same clan, but don't change the name
424    # (3) for clans, combine family name with clan name, but use colors based on clan
425
426    # check to see if it's a clan
427    $get_pfam_clan->execute($auto_pfamA);
428
429    my $pfam_clan_href=0;
430
431    if ($pfam_clan_href=$get_pfam_clan->fetchrow_hashref()) {  # is a clan
432      my ($clan_id, $clan_acc) = @{$pfam_clan_href}{qw(clan_id clan_acc)};
433
434      # now check to see if we have seen this clan before (if so, do not increment $domain_cnt)
435      my $c_value = "C." . $clan_id;
436      if ($pf_acc) {$c_value = "C." . $clan_acc;}
437
438      $domain_clan{$value} = {clan_id => $clan_id,
439			      clan_acc => $clan_acc};
440
441      if ($domains{$c_value}) {
442	$domain_clan{$value}->{domain_cnt} =  $domains{$c_value};
443	$value = $c_value;
444      }
445      else {
446	$domain_clan{$value}->{domain_cnt} = ++ $domain_cnt;
447	$value = $c_value;
448	$domains{$value} = $domain_cnt;
449	push @domain_list, $pfamA_acc;
450      }
451    }
452    else {			# not a clan
453      $domain_clan{$value} = 0;
454      $domains{$value} = ++$domain_cnt;
455      push @domain_list, $pfamA_acc;
456    }
457  }
458  elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) {
459    if ($pf_acc) {$value = "C." . $domain_clan{$value}->{clan_acc};}
460    else { $value = "C." . $domain_clan{$value}->{clan_id}; }
461  }
462
463  return $value;
464}
465
466__END__
467
468=pod
469
470=head1 NAME
471
472ann_feats.pl
473
474=head1 SYNOPSIS
475
476 ann_pfam_e.pl --neg-doms  'sp|P09488|GSTM1_NUMAN' | accession.file
477
478=head1 OPTIONS
479
480 -h	short help
481 --help include description
482 --no-over  : generate non-overlapping domains (equivalent to ann_pfam.pl)
483 --no-clans : do not use clans with multiple families from same clan
484 --neg-doms : report domains between annotated domains as NODOM
485                 (also --neg, --neg_doms)
486 --min_nodom=10  : minimum length between domains for NODOM
487
488 --host, --user, --password, --port --db : info for mysql database
489
490=head1 DESCRIPTION
491
492C<ann_pfam_e.pl> extracts domain information from the pfam msyql
493database.  Currently, the program works with database sequence
494descriptions in one of two formats:
495
496 Currently, the program works with database
497sequence descriptions in several formats:
498
499 >gi|1705556|sp|P54670.1|CAF1_DICDI
500 >sp|P09488|GSTM1_HUMAN
501 >sp:CALM_HUMAN
502
503C<ann_pfam_e.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>,
504and C<pfamA> tables of the C<pfam> database to extract domain
505information on a protein.
506
507If the "--no-over" option is set, overlapping domains are selected and
508edited to remove overlaps.  For proteins with multiple overlapping
509domains (domains overlap by more than 1/3 of the domain length),
510C<auto_pfam_e.pl> selects the domain annotation with the best
511C<domain_evalue_score>.  When domains overlap by less than 1/3 of the
512domain length, they are shortened to remove the overlap.
513
514C<ann_pfam_e.pl> is designed to be used by the B<FASTA> programs with
515the C<-V \!ann_pfam_e.pl> or C<-V "\!ann_pfam_e.pl --neg"> option.
516
517=head1 AUTHOR
518
519William R. Pearson, wrp@virginia.edu
520
521=cut
522