1#!/usr/bin/perl -w 2 3################################################################ 4# copyright (c) 2014 by William R. Pearson and The Rector & 5# Visitors of the University of Virginia */ 6################################################################ 7# Licensed under the Apache License, Version 2.0 (the "License"); 8# you may not use this file except in compliance with the License. 9# You may obtain a copy of the License at 10# 11# http://www.apache.org/licenses/LICENSE-2.0 12# 13# Unless required by applicable law or agreed to in writing, 14# software distributed under this License is distributed on an "AS 15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either 16# express or implied. See the License for the specific language 17# governing permissions and limitations under the License. 18################################################################ 19 20# ann_pfam_e.pl gets an annotation file from fasta36 -V with a line of the form: 21 22# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg) 23# 24# it must: 25# (1) read in the line 26# (2) parse it to get the up_acc 27# (3) return the tab delimited features 28# 29 30# this version only annotates sequences known to Pfam:pfamseq: 31# >pf26|164|O57809|1A1D_PYRHO 32# and only provides domain information 33 34use strict; 35 36use DBI; 37use Getopt::Long; 38use Pod::Usage; 39 40use vars qw($host $db $port $user $pass); 41 42my $hostname = `/bin/hostname`; 43 44($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "pfam28", 0, "web_user", "fasta_www"); 45$host = 'xdb'; 46#$host = 'localhost'; 47#$db = 'RPD2_pfam28u'; 48 49my ($auto_reg,$rpd2_fams, $neg_doms, $vdoms, $lav, $no_doms, $no_clans, $pf_acc, $no_over, $acc_comment, $shelp, $help) = 50 (0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0); 51my ($min_nodom, $min_vdom) = (10,10); 52 53GetOptions( 54 "host=s" => \$host, 55 "db=s" => \$db, 56 "user=s" => \$user, 57 "password=s" => \$pass, 58 "port=i" => \$port, 59 "lav" => \$lav, 60 "acc_comment" => \$acc_comment, 61 "no-over" => \$no_over, 62 "no_over" => \$no_over, 63 "no-clans" => \$no_clans, 64 "no_clans" => \$no_clans, 65 "neg" => \$neg_doms, 66 "neg_doms" => \$neg_doms, 67 "neg-doms" => \$neg_doms, 68 "min_nodom=i" => \$min_nodom, 69 "vdoms" => \$vdoms, 70 "v_doms" => \$vdoms, 71 "pfacc" => \$pf_acc, 72 "RPD2" => \$rpd2_fams, 73 "auto_reg" => \$auto_reg, 74 "h|?" => \$shelp, 75 "help" => \$help, 76 ); 77 78pod2usage(1) if $shelp; 79pod2usage(exitstatus => 0, verbose => 2) if $help; 80pod2usage(1) unless @ARGV; 81 82my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db"; 83$connect .= ";host=$host" if $host; 84$connect .= ";port=$port" if $port; 85 86my $dbh = DBI->connect($connect, 87 $user, 88 $pass 89 ) or die $DBI::errstr; 90 91my %annot_types = (); 92my %domains = (NODOM=>0); 93my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0}); 94my @domain_list = (0); 95my $domain_cnt = 0; 96 97my $pfamA_reg_full = 'pfamA_reg_full_significant'; 98 99my $get_annot_sub = \&get_pfam_annots; 100 101my @pfam_fields = qw(seq_start seq_end model_start model_end model_length pfamA_acc pfamA_id auto_pfamA_reg_full domain_evalue_score as evalue length); 102 103my $get_pfam_acc = $dbh->prepare(<<EOSQL); 104SELECT seq_start, seq_end, model_start, model_end, model_length, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length 105FROM pfamseq 106JOIN $pfamA_reg_full using(pfamseq_acc) 107JOIN pfamA USING (pfamA_acc) 108WHERE in_full = 1 109AND pfamseq_acc=? 110ORDER BY seq_start 111 112EOSQL 113 114my $get_pfam_refacc = $dbh->prepare(<<EOSQL); 115 116SELECT seq_start, seq_end, model_start, model_end, model_length pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length 117FROM pfamseq 118JOIN $pfamA_reg_full using(pfamseq_acc) 119JOIN pfamA USING (pfamA_acc) 120JOIN seqdb_demo2.annot as sa1 on(sa1.acc=pfamseq_acc and sa1.db='sp') 121JOIN seqdb_demo2.annot as sa2 using(prot_id) 122WHERE in_full = 1 123AND sa2.acc=? 124AND sa2.db='ref' 125ORDER BY seq_start 126 127EOSQL 128 129my $get_annots_sql = $get_pfam_acc; 130 131my $get_pfam_id = $dbh->prepare(<<EOSQL); 132 133SELECT seq_start, seq_end, model_start, model_end, model_length pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length 134FROM pfamseq 135JOIN $pfamA_reg_full using(pfamseq_acc) 136JOIN pfamA USING (pfamA_acc) 137WHERE in_full=1 138AND pfamseq_id=? 139ORDER BY seq_start 140 141EOSQL 142 143my $get_pfam_clan = $dbh->prepare(<<EOSQL); 144 145SELECT clan_acc, clan_id 146FROM clan 147JOIN clan_membership using(clan_acc) 148WHERE pfamA_acc=? 149 150EOSQL 151 152my $get_rpd2_clans = $dbh->prepare(<<EOSQL); 153 154SELECT auto_pfamA, clan 155FROM ljm_db.RPD2_final_fams 156WHERE clan is not NULL 157 158EOSQL 159 160# -- LEFT JOIN clan_membership USING (auto_pfamA) 161# -- LEFT JOIN clans using(auto_clan) 162 163my ($tmp, $gi, $sdb, $acc, $id, $use_acc); 164 165# get the query 166my ($query, $seq_len) = @ARGV; 167$seq_len = 0 unless defined($seq_len); 168 169$query =~ s/^>//; 170 171my $ANN_F; 172 173my @annots = (); 174 175my %rpd2_clan_fams = (); 176 177if ($rpd2_fams) { 178 $get_rpd2_clans->execute(); 179 my ($auto_pfam, $auto_clan); 180 while (($auto_pfam, $auto_clan)=$get_rpd2_clans->fetchrow_array()) { 181 $rpd2_clan_fams{$auto_pfam} = $auto_clan; 182 } 183} 184 185#if it's a file I can open, read and parse it 186if ($query !~ m/\|/ && open($ANN_F, $query)) { 187 188 while (my $a_line = <$ANN_F>) { 189 $a_line =~ s/^>//; 190 chomp $a_line; 191 push @annots, show_annots($a_line, $get_annot_sub); 192 } 193} 194else { 195 push @annots, show_annots("$query $seq_len", $get_annot_sub); 196} 197 198for my $seq_annot (@annots) { 199 print ">",$seq_annot->{seq_info},"\n"; 200 for my $annot (@{$seq_annot->{list}}) { 201 if (!$lav && defined($domains{$annot->[-1]})) { 202 my ($a_name, $a_num) = domain_num($annot->[-1],$domains{$annot->[-1]}); 203 if ($acc_comment) { 204 $annot->[-1] .= "{$domain_list[$a_num]}"; 205 } 206 $annot->[-1] = "$a_name :$a_num"; 207 } 208 print join("\t",@$annot),"\n"; 209 } 210} 211 212exit(0); 213 214sub show_annots { 215 my ($query_len, $get_annot_sub) = @_; 216 217 my ($annot_line, $seq_len) = split(/\s+/,$query_len); 218 219 my $pfamA_acc; 220 221 my %annot_data = (seq_info=>$annot_line); 222 223 $use_acc = 1; 224 $get_annots_sql = $get_pfam_acc; 225 226 if ($annot_line =~ m/^pf26\|/) { 227 ($sdb, $gi, $acc, $id) = split(/\|/,$annot_line); 228 $dbh->do("use RPD2_pfam"); 229 } 230 elsif ($annot_line =~ m/^gi\|/) { 231 ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line); 232 if ($sdb =~ m/ref/) { 233 $get_annots_sql = $get_pfam_refacc; 234 } 235 } 236 elsif ($annot_line =~ m/^sp\|/) { 237 ($sdb, $acc, $id) = split(/\|/,$annot_line); 238 } 239 elsif ($annot_line =~ m/^ref\|/) { 240 ($sdb, $acc) = split(/\|/,$annot_line); 241 $get_annots_sql = $get_pfam_refacc; 242 } 243 elsif ($annot_line =~ m/^tr\|/) { 244 ($sdb, $acc, $id) = split(/\|/,$annot_line); 245 } 246 elsif ($annot_line =~ m/^SP:/i) { 247 ($sdb, $id) = split(/:/,$annot_line); 248 $use_acc = 0; 249 } 250 251 # remove version number 252 unless ($use_acc) { 253 $get_annots_sql = $get_pfam_id; 254 $get_annots_sql->execute($id); 255 } 256 else { 257 $acc =~ s/\.\d+$//; 258 $get_annots_sql->execute($acc); 259 } 260 261 $annot_data{list} = $get_annot_sub->($get_annots_sql, $seq_len); 262 263 return \%annot_data; 264} 265 266sub get_pfam_annots { 267 my ($get_annots, $seq_length) = @_; 268 269 $seq_length = 0 unless $seq_length; 270 271 my @pf_domains = (); 272 273 # get the list of domains, sorted by start 274 275 # $row_href has: seq_start, seq_end, model_start, model_end, model_length, 276 # pfamA_acc, pfamA_id, auto_pfamA_reg_full, 277 # domain_evalue_score as evalue, length 278 279 while ( my $row_href = $get_annots->fetchrow_hashref()) { 280 if ($auto_reg) { 281 $row_href->{info} = $row_href->{auto_pfamA_reg_full}; 282 } elsif ($pf_acc) { 283 $row_href->{info} = $row_href->{pfamA_acc}; 284 } else { 285 $row_href->{info} = $row_href->{pfamA_id}; 286 } 287 288 if ($row_href && $row_href->{length} > $seq_length && $seq_length == 0) { 289 $seq_length = $row_href->{length}; 290 } 291 292 next if ($row_href->{seq_start} >= $seq_length); 293 if ($row_href->{seq_end} > $seq_length) { 294 $row_href->{seq_end} = $seq_length; 295 } 296 297 push @pf_domains, $row_href 298 } 299 300 # before checking for domain overlap, check for "split-domains" 301 # (self-unbound) by looking for runs of the same domain that are 302 # ordered by model_start 303 304 if (scalar(@pf_domains) > 1) { 305 my @j_domains; #joined domains 306 my @tmp_domains = @pf_domains; 307 308 my $prev_dom = shift(@tmp_domains); 309 310 for my $curr_dom (@tmp_domains) { 311 # to join domains: 312 # (1) the domains must be in order by model_start/end coordinates 313 # (3) joining the domains cannot make the total combination too long 314 315 # check for model and sequence consistency 316 if (($prev_dom->{pfamA_acc} eq $curr_dom->{pfamA_acc}) # same family 317 && $prev_dom->{model_start} < $curr_dom->{model_start} # model check 318 && $prev_dom->{model_end} < $curr_dom->{model_end} 319 320 && ($curr_dom->{model_start} > $prev_dom->{model_end} * 0.80 # limit overlap 321 || $curr_dom->{model_start} < $prev_dom->{model_end} * 1.25) 322 && ((($curr_dom->{model_end} - $curr_dom->{model_start}+1)/$curr_dom->{model_length} + 323 ($prev_dom->{model_end} - $prev_dom->{model_start}+1)/$prev_dom->{model_length}) < 1.33) 324 ) { # join them by updating $prev_dom 325 $prev_dom->{seq_end} = $curr_dom->{seq_end}; 326 $prev_dom->{model_end} = $curr_dom->{model_end}; 327 $prev_dom->{auto_pfamA_reg_full} = $prev_dom->{auto_pfamA_reg_full} . ";". $curr_dom->{auto_pfamA_reg_full}; 328 $prev_dom->{evalue} = ($prev_dom->{evalue} < $curr_dom->{evalue} ? $prev_dom->{evalue} : $curr_dom->{evalue}); 329 } else { 330 push @j_domains, $prev_dom; 331 $prev_dom = $curr_dom; 332 } 333 } 334 push @j_domains, $prev_dom; 335 @pf_domains = @j_domains; 336 } 337 338 if($no_over && scalar(@pf_domains) > 1) { 339 340 my @tmp_domains = @pf_domains; 341 my @save_domains = (); 342 343 my $prev_dom = shift @tmp_domains; 344 345 while (my $curr_dom = shift @tmp_domains) { 346 347 my @overlap_domains = ($prev_dom); 348 349 my $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start}; 350 # check for overlap > domain_length/3 351 352 my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1); 353 my $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) || 354 (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end}))); 355 356 my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len; 357 358 while ($inclusion || ($diff > 0 && $diff > $longer_len/3)) { 359 push @overlap_domains, $curr_dom; 360 $curr_dom = shift @tmp_domains; 361 last unless $curr_dom; 362 $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start}; 363 ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1); 364 $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len; 365 $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) || 366 (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end}))); 367 } 368 369 # check for overlapping domains; >1 because $prev_dom is always there 370 if (scalar(@overlap_domains) > 1 ) { 371 # if $rpd2_fams, check for a chosen one 372 373 for my $dom ( @overlap_domains) { 374 $dom->{evalue} = 1.0 unless defined($dom->{evalue}); 375 } 376 377 @overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains; 378 $prev_dom = $overlap_domains[0]; 379 } 380 381 # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3 382 push @save_domains, $prev_dom; 383 $prev_dom = $curr_dom; 384 } 385 if ($prev_dom) {push @save_domains, $prev_dom;} 386 387 @pf_domains = @save_domains; 388 389 # now check for smaller overlaps 390 for (my $i=1; $i < scalar(@pf_domains); $i++) { 391 if ($pf_domains[$i-1]->{seq_end} >= $pf_domains[$i]->{seq_start}) { 392 my $overlap = $pf_domains[$i-1]->{seq_end} - $pf_domains[$i]->{seq_start}; 393 $pf_domains[$i-1]->{seq_end} -= int($overlap/2); 394 $pf_domains[$i]->{seq_start} = $pf_domains[$i-1]->{seq_end}+1; 395 } 396 } 397 } 398 399 # $vdoms -- virtual Pfam domains -- the equivalent of $neg_doms, 400 # but covering parts of a Pfam model that are not annotated. split 401 # domains have been joined, so simply check beginning and end of 402 # each domain (but must also check for bounded-ness) 403 # only add when 10% or more is missing and missing length > $min_nodom 404 405 if ($vdoms) { 406 my @vpf_domains; 407 408 my $curr_dom = $pf_domains[0]; 409 my $length = $curr_dom->{length}; 410 411 my $prev_dom={seq_end=>0, pfamA_acc=>''}; 412 my $prev_dom_end = 0; 413 my $next_dom_start = $length+1; 414 415 for (my $dom_ix=0; $dom_ix < scalar(@pf_domains); $dom_ix++ ) { 416 $curr_dom = $pf_domains[$dom_ix]; 417 418 my $pfamA = $curr_dom->{pfamA_acc}; 419 420 # first, look left, is there a domain there (if there is, 421 # it should be updated right 422 423 # my $min_vdom = $curr_dom->{model_length} / 10; 424 425 if ($prev_dom->{pfamA_acc}) { # look for previous domain 426 $prev_dom_end = $prev_dom->{seq_end}; 427 } 428 429 # there is a domain to the left, how much room is available? 430 my $left_dom_len = min($curr_dom->{seq_start}-$prev_dom_end-1, $curr_dom->{model_start}-1); 431 if ( $left_dom_len > $min_vdom) { 432 # there is room for a virtual domain 433 my %new_dom = (seq_start=> $curr_dom->{seq_start}-$left_dom_len, 434 seq_end => $curr_dom->{seq_start}-1, 435 info=>'@'.$curr_dom->{info}, 436 model_length=>$curr_dom->{model_length}, 437 model_end => $curr_dom->{model_start}-1, 438 model_start => $left_dom_len, 439 pfamA_acc=>$pfamA, 440 ); 441 push @vpf_domains, \%new_dom; 442 } 443 444 # save the current domain 445 push @vpf_domains, $curr_dom; 446 $prev_dom = $curr_dom; 447 448 if ($dom_ix < $#pf_domains) { # there is a domain to the right 449 # first, give all the extra space to the first domain (no splitting) 450 $next_dom_start = $pf_domains[$dom_ix+1]->{seq_start}; 451 } 452 else { 453 $next_dom_start = $length; 454 } 455 456 # is there room for a virtual domain right 457 458 my $right_dom_len = min($next_dom_start-$curr_dom->{seq_end}-1, # space available 459 $curr_dom->{model_length}-$curr_dom->{model_end} # space needed 460 ); 461 if ( $right_dom_len > $min_vdom) { 462 my %new_dom = (seq_start=> $curr_dom->{seq_end}+1, 463 seq_end=> $curr_dom->{seq_end}+$right_dom_len, 464 info=>'@'.$pfamA, 465 model_length => $curr_dom->{model_length}, 466 pfamA_acc=> $pfamA, 467 ); 468 push @vpf_domains, \%new_dom; 469 $prev_dom = \%new_dom; 470 } 471 } # all done, check for last one 472 473 # $curr_dom=$pf_domains[-1]; 474 # # my $min_vdom = $curr_dom->{model_length}/10; 475 476 # my $right_dom_len = min($length - $curr_dom->{seq_end}+1, # space available 477 # $curr_dom->{model_length}-$curr_dom->{model_end} # space needed 478 # ); 479 # if ($right_dom_len > $min_vdom) { 480 # my %new_dom = (seq_start=> $curr_dom->{seq_end}+1, 481 # seq_end => $curr_dom->{seq_end}+$right_dom_len, 482 # info=>'@'.$curr_dom->{pfamA_acc}, 483 # model_len=> $curr_dom->{model_len}, 484 # pfamA_acc => $curr_dom->{pfamA_acc}, 485 # model_start => $curr_dom->{model_end}+1, 486 # model_end => $curr_dom->{model_len}, 487 # ); 488 489 # push @vpf_domains, \%new_dom; 490 # } 491 492 # @vpf_domains has both old @pf_domains and new neg-domains 493 @pf_domains = @vpf_domains; 494 } 495 496 if ($neg_doms) { 497 my @npf_domains; 498 my $prev_dom={seq_end=>0}; 499 for my $curr_dom ( @pf_domains) { 500 if ($curr_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) { 501 my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $curr_dom->{seq_start}-1, info=>'NODOM'); 502 push @npf_domains, \%new_dom; 503 } 504 push @npf_domains, $curr_dom; 505 $prev_dom = $curr_dom; 506 } 507 if ($seq_length - $prev_dom->{seq_end} > $min_nodom) { 508 my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM'); 509 if ($new_dom{seq_end} > $new_dom{seq_start}) { 510 push @npf_domains, \%new_dom; 511 } 512 } 513 514 # @npf_domains has both old @pf_domains and new neg-domains 515 @pf_domains = @npf_domains; 516 } 517 518 # now make sure we have useful names: colors 519 520 for my $pf (@pf_domains) { 521 $pf->{info} = domain_name($pf->{info}, $pf->{pfamA_acc}); 522 } 523 524 my @feats = (); 525 for my $d_ref (@pf_domains) { 526 if ($lav) { 527 push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info}]; 528 } else { 529 push @feats, [$d_ref->{seq_start}, '-', $d_ref->{seq_end}, $d_ref->{info} ]; 530 # push @feats, [$d_ref->{seq_end}, ']', '-', ""]; 531 } 532 533 } 534 535 return \@feats; 536} 537 538sub min { 539 my ($arg1, $arg2) = @_; 540 541 return ($arg1 <= $arg2 ? $arg1 : $arg2); 542} 543 544sub max { 545 my ($arg1, $arg2) = @_; 546 547 return ($arg1 >= $arg2 ? $arg1 : $arg2); 548} 549 550# domain name takes a uniprot domain label, removes comments ( ; 551# truncated) and numbers and returns a canonical form. Thus: 552# Cortactin 6. 553# Cortactin 7; truncated. 554# becomes "Cortactin" 555# 556 557sub domain_name { 558 559 my ($value, $pfamA_acc) = @_; 560 my $is_virtual = 0; 561 562 if ($value =~ m/^@/) { 563 $is_virtual = 1; 564 $value =~ s/^@//; 565 } 566 567 # check for clan: 568 if ($no_clans) { 569 if (! defined($domains{$value})) { 570 $domain_clan{$value} = 0; 571 $domains{$value} = ++$domain_cnt; 572 push @domain_list, $pfamA_acc; 573 } 574 } 575 elsif (!defined($domain_clan{$value})) { 576 ## only do this for new domains, old domains have known mappings 577 578 ## ways to highlight the same domain: 579 # (1) for clans, substitute clan name for family name 580 # (2) for clans, use the same color for the same clan, but don't change the name 581 # (3) for clans, combine family name with clan name, but use colors based on clan 582 583 # check to see if it's a clan 584 $get_pfam_clan->execute($pfamA_acc); 585 586 my $pfam_clan_href=0; 587 588 if ($pfam_clan_href=$get_pfam_clan->fetchrow_hashref()) { # is a clan 589 my ($clan_id, $clan_acc) = @{$pfam_clan_href}{qw(clan_id clan_acc)}; 590 591 # now check to see if we have seen this clan before (if so, do not increment $domain_cnt) 592 my $c_value = "C." . $clan_id; 593 if ($pf_acc) {$c_value = $clan_acc;} 594 595 $domain_clan{$value} = {clan_id => $clan_id, 596 clan_acc => $clan_acc}; 597 598 if ($domains{$c_value}) { 599 $domain_clan{$value}->{domain_cnt} = $domains{$c_value}; 600 $value = $c_value; 601 } 602 else { 603 $domain_clan{$value}->{domain_cnt} = ++ $domain_cnt; 604 $value = $c_value; 605 $domains{$value} = $domain_cnt; 606 push @domain_list, $pfamA_acc; 607 } 608 } 609 else { # not a clan 610 $domain_clan{$value} = 0; 611 $domains{$value} = ++$domain_cnt; 612 push @domain_list, $pfamA_acc; 613 } 614 } 615 elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) { 616 if ($pf_acc) {$value = $domain_clan{$value}->{clan_acc};} 617 else { $value = "C." . $domain_clan{$value}->{clan_id}; } 618 } 619 620 if ($is_virtual) { 621 $domains{'@'.$value} = $domains{$value}; 622 $value = '@'.$value; 623 } 624 return $value; 625} 626 627sub domain_num { 628 my ($value, $number) = @_; 629 if ($value =~ m/^@/) { 630 $value =~ s/^@/v/; 631 $number = $number."v"; 632 } 633 return ($value, $number); 634} 635 636 637__END__ 638 639=pod 640 641=head1 NAME 642 643ann_feats.pl 644 645=head1 SYNOPSIS 646 647 ann_pfam_e.pl --neg-doms --vdoms 'sp|P09488|GSTM1_NUMAN' | accession.file 648 649=head1 OPTIONS 650 651 -h short help 652 --help include description 653 --no-over : generate non-overlapping domains (equivalent to ann_pfam.pl) 654 --no-clans : do not use clans with multiple families from same clan 655 --neg-doms : report domains between annotated domains as NODOM 656 (also --neg, --neg_doms) 657 --vdoms : produce "virtual domains" using model_start, 658 model_end for partial pfam domains 659 --min_nodom=10 : minimum length between domains for NODOM 660 661 --host, --user, --password, --port --db : info for mysql database 662 663=head1 DESCRIPTION 664 665C<ann_pfam_e.pl> extracts domain information from the pfam msyql 666database. Currently, the program works with database sequence 667descriptions in one of two formats: 668 669 Currently, the program works with database 670sequence descriptions in several formats: 671 672 >gi|1705556|sp|P54670.1|CAF1_DICDI 673 >sp|P09488|GSTM1_HUMAN 674 >sp:CALM_HUMAN 675 676C<ann_pfam_e.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>, 677and C<pfamA> tables of the C<pfam> database to extract domain 678information on a protein. 679 680If the "--no-over" option is set, overlapping domains are selected and 681edited to remove overlaps. For proteins with multiple overlapping 682domains (domains overlap by more than 1/3 of the domain length), 683C<auto_pfam_e.pl> selects the domain annotation with the best 684C<domain_evalue_score>. When domains overlap by less than 1/3 of the 685domain length, they are shortened to remove the overlap. 686 687C<ann_pfam_e.pl> is designed to be used by the B<FASTA> programs with 688the C<-V \!ann_pfam_e.pl> or C<-V "\!ann_pfam_e.pl --neg"> option. 689 690=head1 AUTHOR 691 692William R. Pearson, wrp@virginia.edu 693 694=cut 695