1#!/usr/bin/perl -w
2
3################################################################
4# copyright (c) 2014 by William R. Pearson and The Rector &
5# Visitors of the University of Virginia */
6################################################################
7# Licensed under the Apache License, Version 2.0 (the "License");
8# you may not use this file except in compliance with the License.
9# You may obtain a copy of the License at
10#
11# http://www.apache.org/licenses/LICENSE-2.0
12#
13# Unless required by applicable law or agreed to in writing,
14# software distributed under this License is distributed on an "AS
15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
16# express or implied.  See the License for the specific language
17# governing permissions and limitations under the License.
18################################################################
19
20# ann_pfam_e.pl gets an annotation file from fasta36 -V with a line of the form:
21
22# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n  (at least from pir1.lseg)
23#
24# it must:
25# (1) read in the line
26# (2) parse it to get the up_acc
27# (3) return the tab delimited features
28#
29
30# this version only annotates sequences known to Pfam:pfamseq:
31# >pf26|164|O57809|1A1D_PYRHO
32# and only provides domain information
33
34use strict;
35
36use DBI;
37use Getopt::Long;
38use Pod::Usage;
39
40use vars qw($host $db $port $user $pass);
41
42my $hostname = `/bin/hostname`;
43
44($host, $db, $port, $user, $pass)  = ("wrpxdb.its.virginia.edu", "pfam28", 0, "web_user", "fasta_www");
45$host = 'xdb';
46#$host = 'localhost';
47#$db = 'RPD2_pfam28u';
48
49my ($auto_reg,$rpd2_fams, $neg_doms, $vdoms, $lav, $no_doms, $no_clans, $pf_acc, $no_over, $acc_comment, $shelp, $help) =
50  (0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0);
51my ($min_nodom, $min_vdom) = (10,10);
52
53GetOptions(
54    "host=s" => \$host,
55    "db=s" => \$db,
56    "user=s" => \$user,
57    "password=s" => \$pass,
58    "port=i" => \$port,
59    "lav" => \$lav,
60    "acc_comment" => \$acc_comment,
61    "no-over" => \$no_over,
62    "no_over" => \$no_over,
63    "no-clans" => \$no_clans,
64    "no_clans" => \$no_clans,
65    "neg" => \$neg_doms,
66    "neg_doms" => \$neg_doms,
67    "neg-doms" => \$neg_doms,
68    "min_nodom=i" => \$min_nodom,
69    "vdoms" => \$vdoms,
70    "v_doms" => \$vdoms,
71    "pfacc" => \$pf_acc,
72    "RPD2" => \$rpd2_fams,
73    "auto_reg" => \$auto_reg,
74    "h|?" => \$shelp,
75    "help" => \$help,
76    );
77
78pod2usage(1) if $shelp;
79pod2usage(exitstatus => 0, verbose => 2) if $help;
80pod2usage(1) unless @ARGV;
81
82my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db";
83$connect .= ";host=$host" if $host;
84$connect .= ";port=$port" if $port;
85
86my $dbh = DBI->connect($connect,
87		       $user,
88		       $pass
89		      ) or die $DBI::errstr;
90
91my %annot_types = ();
92my %domains = (NODOM=>0);
93my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0});
94my @domain_list = (0);
95my $domain_cnt = 0;
96
97my $pfamA_reg_full = 'pfamA_reg_full_significant';
98
99my $get_annot_sub = \&get_pfam_annots;
100
101my @pfam_fields = qw(seq_start seq_end model_start model_end model_length pfamA_acc pfamA_id auto_pfamA_reg_full domain_evalue_score as evalue length);
102
103my $get_pfam_acc = $dbh->prepare(<<EOSQL);
104SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
105FROM pfamseq
106JOIN $pfamA_reg_full using(pfamseq_acc)
107JOIN pfamA USING (pfamA_acc)
108WHERE in_full = 1
109AND  pfamseq_acc=?
110ORDER BY seq_start
111
112EOSQL
113
114my $get_pfam_refacc = $dbh->prepare(<<EOSQL);
115
116SELECT seq_start, seq_end, model_start, model_end, model_length pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
117FROM pfamseq
118JOIN $pfamA_reg_full using(pfamseq_acc)
119JOIN pfamA USING (pfamA_acc)
120JOIN seqdb_demo2.annot as sa1 on(sa1.acc=pfamseq_acc and sa1.db='sp')
121JOIN seqdb_demo2.annot as sa2 using(prot_id)
122WHERE in_full = 1
123AND  sa2.acc=?
124AND  sa2.db='ref'
125ORDER BY seq_start
126
127EOSQL
128
129my $get_annots_sql = $get_pfam_acc;
130
131my $get_pfam_id = $dbh->prepare(<<EOSQL);
132
133SELECT seq_start, seq_end, model_start, model_end, model_length pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length
134FROM pfamseq
135JOIN $pfamA_reg_full using(pfamseq_acc)
136JOIN pfamA USING (pfamA_acc)
137WHERE in_full=1
138AND  pfamseq_id=?
139ORDER BY seq_start
140
141EOSQL
142
143my $get_pfam_clan = $dbh->prepare(<<EOSQL);
144
145SELECT clan_acc, clan_id
146FROM clan
147JOIN clan_membership using(clan_acc)
148WHERE pfamA_acc=?
149
150EOSQL
151
152my $get_rpd2_clans = $dbh->prepare(<<EOSQL);
153
154SELECT auto_pfamA, clan
155FROM ljm_db.RPD2_final_fams
156WHERE clan is not NULL
157
158EOSQL
159
160# -- LEFT JOIN clan_membership USING (auto_pfamA)
161# -- LEFT JOIN clans using(auto_clan)
162
163my ($tmp, $gi, $sdb, $acc, $id, $use_acc);
164
165# get the query
166my ($query, $seq_len) = @ARGV;
167$seq_len = 0 unless defined($seq_len);
168
169$query =~ s/^>//;
170
171my $ANN_F;
172
173my @annots = ();
174
175my %rpd2_clan_fams = ();
176
177if ($rpd2_fams) {
178  $get_rpd2_clans->execute();
179  my ($auto_pfam, $auto_clan);
180  while (($auto_pfam, $auto_clan)=$get_rpd2_clans->fetchrow_array()) {
181    $rpd2_clan_fams{$auto_pfam} = $auto_clan;
182  }
183}
184
185#if it's a file I can open, read and parse it
186if ($query !~ m/\|/ && open($ANN_F, $query)) {
187
188  while (my $a_line = <$ANN_F>) {
189    $a_line =~ s/^>//;
190    chomp $a_line;
191    push @annots, show_annots($a_line, $get_annot_sub);
192  }
193}
194else {
195  push @annots, show_annots("$query $seq_len", $get_annot_sub);
196}
197
198for my $seq_annot (@annots) {
199  print ">",$seq_annot->{seq_info},"\n";
200  for my $annot (@{$seq_annot->{list}}) {
201    if (!$lav && defined($domains{$annot->[-1]})) {
202      my ($a_name, $a_num) = domain_num($annot->[-1],$domains{$annot->[-1]});
203      if ($acc_comment) {
204	$annot->[-1] .= "{$domain_list[$a_num]}";
205      }
206      $annot->[-1] = "$a_name :$a_num";
207    }
208    print join("\t",@$annot),"\n";
209  }
210}
211
212exit(0);
213
214sub show_annots {
215  my ($query_len, $get_annot_sub) = @_;
216
217  my ($annot_line, $seq_len) = split(/\s+/,$query_len);
218
219  my $pfamA_acc;
220
221  my %annot_data = (seq_info=>$annot_line);
222
223  $use_acc = 1;
224  $get_annots_sql = $get_pfam_acc;
225
226  if ($annot_line =~ m/^pf26\|/) {
227    ($sdb, $gi, $acc, $id) = split(/\|/,$annot_line);
228    $dbh->do("use RPD2_pfam");
229  }
230  elsif ($annot_line =~ m/^gi\|/) {
231    ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line);
232    if ($sdb =~ m/ref/) {
233	$get_annots_sql = $get_pfam_refacc;
234    }
235  }
236  elsif ($annot_line =~ m/^sp\|/) {
237    ($sdb, $acc, $id) = split(/\|/,$annot_line);
238  }
239  elsif ($annot_line =~ m/^ref\|/) {
240    ($sdb, $acc) = split(/\|/,$annot_line);
241    $get_annots_sql = $get_pfam_refacc;
242  }
243  elsif ($annot_line =~ m/^tr\|/) {
244    ($sdb, $acc, $id) = split(/\|/,$annot_line);
245  }
246  elsif ($annot_line =~ m/^SP:/i) {
247    ($sdb, $id) = split(/:/,$annot_line);
248    $use_acc = 0;
249  }
250
251  # remove version number
252  unless ($use_acc) {
253    $get_annots_sql = $get_pfam_id;
254    $get_annots_sql->execute($id);
255  }
256  else {
257    $acc =~ s/\.\d+$//;
258    $get_annots_sql->execute($acc);
259  }
260
261  $annot_data{list} = $get_annot_sub->($get_annots_sql, $seq_len);
262
263  return \%annot_data;
264}
265
266sub get_pfam_annots {
267  my ($get_annots, $seq_length) = @_;
268
269  $seq_length = 0 unless $seq_length;
270
271  my @pf_domains = ();
272
273  # get the list of domains, sorted by start
274
275  # $row_href has: seq_start, seq_end, model_start, model_end, model_length,
276  #                pfamA_acc, pfamA_id, auto_pfamA_reg_full,
277  #                domain_evalue_score as evalue, length
278
279  while ( my $row_href = $get_annots->fetchrow_hashref()) {
280    if ($auto_reg) {
281      $row_href->{info} = $row_href->{auto_pfamA_reg_full};
282    } elsif ($pf_acc) {
283      $row_href->{info} = $row_href->{pfamA_acc};
284    } else {
285      $row_href->{info} = $row_href->{pfamA_id};
286    }
287
288    if ($row_href && $row_href->{length} > $seq_length && $seq_length == 0) {
289      $seq_length = $row_href->{length};
290    }
291
292    next if ($row_href->{seq_start} >= $seq_length);
293    if ($row_href->{seq_end} > $seq_length) {
294      $row_href->{seq_end} = $seq_length;
295    }
296
297    push @pf_domains, $row_href
298  }
299
300  # before checking for domain overlap, check for "split-domains"
301  # (self-unbound) by looking for runs of the same domain that are
302  # ordered by model_start
303
304  if (scalar(@pf_domains) > 1) {
305    my @j_domains;		#joined domains
306    my @tmp_domains = @pf_domains;
307
308    my $prev_dom = shift(@tmp_domains);
309
310    for my $curr_dom (@tmp_domains) {
311      # to join domains:
312      # (1) the domains must be in order by model_start/end coordinates
313      # (3) joining the domains cannot make the total combination too long
314
315      # check for model and sequence consistency
316      if (($prev_dom->{pfamA_acc} eq $curr_dom->{pfamA_acc})  # same family
317	  && $prev_dom->{model_start} < $curr_dom->{model_start}  # model check
318	  && $prev_dom->{model_end} < $curr_dom->{model_end}
319
320	  && ($curr_dom->{model_start} > $prev_dom->{model_end} * 0.80   # limit overlap
321	      || $curr_dom->{model_start} <  $prev_dom->{model_end} * 1.25)
322	  && ((($curr_dom->{model_end} - $curr_dom->{model_start}+1)/$curr_dom->{model_length} +
323	       ($prev_dom->{model_end} - $prev_dom->{model_start}+1)/$prev_dom->{model_length}) < 1.33)
324	 ) {			# join them by updating $prev_dom
325	$prev_dom->{seq_end} = $curr_dom->{seq_end};
326	$prev_dom->{model_end} = $curr_dom->{model_end};
327	$prev_dom->{auto_pfamA_reg_full} = $prev_dom->{auto_pfamA_reg_full} . ";". $curr_dom->{auto_pfamA_reg_full};
328	$prev_dom->{evalue} = ($prev_dom->{evalue} < $curr_dom->{evalue} ? $prev_dom->{evalue} : $curr_dom->{evalue});
329      } else {
330	push @j_domains, $prev_dom;
331	$prev_dom = $curr_dom;
332      }
333    }
334    push @j_domains, $prev_dom;
335    @pf_domains = @j_domains;
336  }
337
338  if($no_over && scalar(@pf_domains) > 1) {
339
340    my @tmp_domains = @pf_domains;
341    my @save_domains = ();
342
343    my $prev_dom = shift @tmp_domains;
344
345    while (my $curr_dom = shift @tmp_domains) {
346
347      my @overlap_domains = ($prev_dom);
348
349      my $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
350      # check for overlap > domain_length/3
351
352      my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
353      my $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) ||
354		       (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})));
355
356      my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
357
358      while ($inclusion || ($diff > 0 && $diff > $longer_len/3)) {
359	push @overlap_domains, $curr_dom;
360	$curr_dom = shift @tmp_domains;
361	last unless $curr_dom;
362	$diff = $prev_dom->{seq_end} - $curr_dom->{seq_start};
363	($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1);
364	$longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len;
365	$inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) ||
366		      (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end})));
367      }
368
369      # check for overlapping domains; >1 because $prev_dom is always there
370      if (scalar(@overlap_domains) > 1 ) {
371	# if $rpd2_fams, check for a chosen one
372
373	for my $dom ( @overlap_domains) {
374	  $dom->{evalue} = 1.0 unless defined($dom->{evalue});
375	}
376
377	@overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains;
378	$prev_dom = $overlap_domains[0];
379      }
380
381      # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3
382      push @save_domains, $prev_dom;
383      $prev_dom = $curr_dom;
384    }
385    if ($prev_dom) {push @save_domains, $prev_dom;}
386
387    @pf_domains = @save_domains;
388
389    # now check for smaller overlaps
390    for (my $i=1; $i < scalar(@pf_domains); $i++) {
391      if ($pf_domains[$i-1]->{seq_end} >= $pf_domains[$i]->{seq_start}) {
392	my $overlap = $pf_domains[$i-1]->{seq_end} - $pf_domains[$i]->{seq_start};
393	$pf_domains[$i-1]->{seq_end} -= int($overlap/2);
394	$pf_domains[$i]->{seq_start} = $pf_domains[$i-1]->{seq_end}+1;
395      }
396    }
397  }
398
399  # $vdoms -- virtual Pfam domains -- the equivalent of $neg_doms,
400  # but covering parts of a Pfam model that are not annotated.  split
401  # domains have been joined, so simply check beginning and end of
402  # each domain (but must also check for bounded-ness)
403  # only add when 10% or more is missing and missing length > $min_nodom
404
405  if ($vdoms) {
406    my @vpf_domains;
407
408    my $curr_dom = $pf_domains[0];
409    my $length = $curr_dom->{length};
410
411    my $prev_dom={seq_end=>0, pfamA_acc=>''};
412    my $prev_dom_end = 0;
413    my $next_dom_start = $length+1;
414
415    for (my $dom_ix=0; $dom_ix < scalar(@pf_domains); $dom_ix++ ) {
416      $curr_dom = $pf_domains[$dom_ix];
417
418      my $pfamA =  $curr_dom->{pfamA_acc};
419
420      # first, look left, is there a domain there (if there is,
421      # it should be updated right
422
423      # my $min_vdom = $curr_dom->{model_length} / 10;
424
425      if ($prev_dom->{pfamA_acc}) { # look for previous domain
426	$prev_dom_end = $prev_dom->{seq_end};
427      }
428
429      # there is a domain to the left, how much room is available?
430      my $left_dom_len = min($curr_dom->{seq_start}-$prev_dom_end-1, $curr_dom->{model_start}-1);
431      if ( $left_dom_len > $min_vdom) {
432	# there is room for a virtual domain
433	my %new_dom = (seq_start=> $curr_dom->{seq_start}-$left_dom_len,
434	               seq_end => $curr_dom->{seq_start}-1,
435		       info=>'@'.$curr_dom->{info},
436		       model_length=>$curr_dom->{model_length},
437		       model_end => $curr_dom->{model_start}-1,
438		       model_start => $left_dom_len,
439		       pfamA_acc=>$pfamA,
440		      );
441	push @vpf_domains, \%new_dom;
442      }
443
444      # save the current domain
445      push @vpf_domains, $curr_dom;
446      $prev_dom = $curr_dom;
447
448      if ($dom_ix < $#pf_domains) { # there is a domain to the right
449	# first, give all the extra space to the first domain (no splitting)
450	$next_dom_start = $pf_domains[$dom_ix+1]->{seq_start};
451      }
452      else {
453	$next_dom_start = $length;
454      }
455
456      # is there room for a virtual domain right
457
458      my $right_dom_len = min($next_dom_start-$curr_dom->{seq_end}-1, # space available
459			      $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
460			     );
461      if ( $right_dom_len > $min_vdom) {
462	my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
463		       seq_end=> $curr_dom->{seq_end}+$right_dom_len,
464		       info=>'@'.$pfamA,
465		       model_length => $curr_dom->{model_length},
466		       pfamA_acc=> $pfamA,
467		      );
468	push @vpf_domains, \%new_dom;
469	$prev_dom = \%new_dom;
470      }
471    }				# all done, check for last one
472
473    # $curr_dom=$pf_domains[-1];
474    # # my $min_vdom = $curr_dom->{model_length}/10;
475
476    # my $right_dom_len = min($length - $curr_dom->{seq_end}+1,  # space available
477    # 			    $curr_dom->{model_length}-$curr_dom->{model_end} # space needed
478    # 			   );
479    # if ($right_dom_len > $min_vdom) {
480    #   my %new_dom = (seq_start=> $curr_dom->{seq_end}+1,
481    # 		     seq_end => $curr_dom->{seq_end}+$right_dom_len,
482    # 		     info=>'@'.$curr_dom->{pfamA_acc},
483    # 		     model_len=> $curr_dom->{model_len},
484    # 		     pfamA_acc => $curr_dom->{pfamA_acc},
485    # 		     model_start => $curr_dom->{model_end}+1,
486    # 		     model_end => $curr_dom->{model_len},
487    # 		     );
488
489    #   push @vpf_domains, \%new_dom;
490    # }
491
492    # @vpf_domains has both old @pf_domains and new neg-domains
493    @pf_domains = @vpf_domains;
494  }
495
496  if ($neg_doms) {
497    my @npf_domains;
498    my $prev_dom={seq_end=>0};
499    for my $curr_dom ( @pf_domains) {
500      if ($curr_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) {
501	my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $curr_dom->{seq_start}-1, info=>'NODOM');
502	push @npf_domains, \%new_dom;
503      }
504      push @npf_domains, $curr_dom;
505      $prev_dom = $curr_dom;
506    }
507    if ($seq_length - $prev_dom->{seq_end} > $min_nodom) {
508      my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM');
509      if ($new_dom{seq_end} > $new_dom{seq_start}) {
510	push @npf_domains, \%new_dom;
511      }
512    }
513
514    # @npf_domains has both old @pf_domains and new neg-domains
515    @pf_domains = @npf_domains;
516  }
517
518  # now make sure we have useful names: colors
519
520  for my $pf (@pf_domains) {
521    $pf->{info} = domain_name($pf->{info}, $pf->{pfamA_acc});
522  }
523
524  my @feats = ();
525  for my $d_ref (@pf_domains) {
526    if ($lav) {
527      push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info}];
528    } else {
529      push @feats, [$d_ref->{seq_start}, '-', $d_ref->{seq_end},  $d_ref->{info} ];
530      #      push @feats, [$d_ref->{seq_end}, ']', '-', ""];
531    }
532
533  }
534
535  return \@feats;
536}
537
538sub min {
539  my ($arg1, $arg2) = @_;
540
541  return ($arg1 <= $arg2 ? $arg1 : $arg2);
542}
543
544sub max {
545  my ($arg1, $arg2) = @_;
546
547  return ($arg1 >= $arg2 ? $arg1 : $arg2);
548}
549
550# domain name takes a uniprot domain label, removes comments ( ;
551# truncated) and numbers and returns a canonical form. Thus:
552# Cortactin 6.
553# Cortactin 7; truncated.
554# becomes "Cortactin"
555#
556
557sub domain_name {
558
559  my ($value, $pfamA_acc) = @_;
560  my $is_virtual = 0;
561
562  if ($value =~ m/^@/) {
563    $is_virtual = 1;
564    $value =~ s/^@//;
565  }
566
567  # check for clan:
568  if ($no_clans) {
569    if (! defined($domains{$value})) {
570      $domain_clan{$value} = 0;
571      $domains{$value} = ++$domain_cnt;
572      push @domain_list, $pfamA_acc;
573    }
574  }
575  elsif (!defined($domain_clan{$value})) {
576    ## only do this for new domains, old domains have known mappings
577
578    ## ways to highlight the same domain:
579    # (1) for clans, substitute clan name for family name
580    # (2) for clans, use the same color for the same clan, but don't change the name
581    # (3) for clans, combine family name with clan name, but use colors based on clan
582
583    # check to see if it's a clan
584    $get_pfam_clan->execute($pfamA_acc);
585
586    my $pfam_clan_href=0;
587
588    if ($pfam_clan_href=$get_pfam_clan->fetchrow_hashref()) {  # is a clan
589      my ($clan_id, $clan_acc) = @{$pfam_clan_href}{qw(clan_id clan_acc)};
590
591      # now check to see if we have seen this clan before (if so, do not increment $domain_cnt)
592      my $c_value = "C." . $clan_id;
593      if ($pf_acc) {$c_value = $clan_acc;}
594
595      $domain_clan{$value} = {clan_id => $clan_id,
596			      clan_acc => $clan_acc};
597
598      if ($domains{$c_value}) {
599	$domain_clan{$value}->{domain_cnt} =  $domains{$c_value};
600	$value = $c_value;
601      }
602      else {
603	$domain_clan{$value}->{domain_cnt} = ++ $domain_cnt;
604	$value = $c_value;
605	$domains{$value} = $domain_cnt;
606	push @domain_list, $pfamA_acc;
607      }
608    }
609    else {			# not a clan
610      $domain_clan{$value} = 0;
611      $domains{$value} = ++$domain_cnt;
612      push @domain_list, $pfamA_acc;
613    }
614  }
615  elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) {
616    if ($pf_acc) {$value = $domain_clan{$value}->{clan_acc};}
617    else { $value = "C." . $domain_clan{$value}->{clan_id}; }
618  }
619
620  if ($is_virtual) {
621    $domains{'@'.$value} = $domains{$value};
622    $value = '@'.$value;
623  }
624  return $value;
625}
626
627sub domain_num {
628  my ($value, $number) = @_;
629  if ($value =~ m/^@/) {
630    $value =~ s/^@/v/;
631    $number = $number."v";
632  }
633  return ($value, $number);
634}
635
636
637__END__
638
639=pod
640
641=head1 NAME
642
643ann_feats.pl
644
645=head1 SYNOPSIS
646
647 ann_pfam_e.pl --neg-doms  --vdoms 'sp|P09488|GSTM1_NUMAN' | accession.file
648
649=head1 OPTIONS
650
651 -h	short help
652 --help include description
653 --no-over  : generate non-overlapping domains (equivalent to ann_pfam.pl)
654 --no-clans : do not use clans with multiple families from same clan
655 --neg-doms : report domains between annotated domains as NODOM
656                 (also --neg, --neg_doms)
657 --vdoms : produce "virtual domains" using model_start,
658           model_end for partial pfam domains
659 --min_nodom=10  : minimum length between domains for NODOM
660
661 --host, --user, --password, --port --db : info for mysql database
662
663=head1 DESCRIPTION
664
665C<ann_pfam_e.pl> extracts domain information from the pfam msyql
666database.  Currently, the program works with database sequence
667descriptions in one of two formats:
668
669 Currently, the program works with database
670sequence descriptions in several formats:
671
672 >gi|1705556|sp|P54670.1|CAF1_DICDI
673 >sp|P09488|GSTM1_HUMAN
674 >sp:CALM_HUMAN
675
676C<ann_pfam_e.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>,
677and C<pfamA> tables of the C<pfam> database to extract domain
678information on a protein.
679
680If the "--no-over" option is set, overlapping domains are selected and
681edited to remove overlaps.  For proteins with multiple overlapping
682domains (domains overlap by more than 1/3 of the domain length),
683C<auto_pfam_e.pl> selects the domain annotation with the best
684C<domain_evalue_score>.  When domains overlap by less than 1/3 of the
685domain length, they are shortened to remove the overlap.
686
687C<ann_pfam_e.pl> is designed to be used by the B<FASTA> programs with
688the C<-V \!ann_pfam_e.pl> or C<-V "\!ann_pfam_e.pl --neg"> option.
689
690=head1 AUTHOR
691
692William R. Pearson, wrp@virginia.edu
693
694=cut
695