1#!/usr/bin/perl -w 2 3################################################################ 4# copyright (c) 2014 by William R. Pearson and The Rector & 5# Visitors of the University of Virginia */ 6################################################################ 7# Licensed under the Apache License, Version 2.0 (the "License"); 8# you may not use this file except in compliance with the License. 9# You may obtain a copy of the License at 10# 11# http://www.apache.org/licenses/LICENSE-2.0 12# 13# Unless required by applicable law or agreed to in writing, 14# software distributed under this License is distributed on an "AS 15# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either 16# express or implied. See the License for the specific language 17# governing permissions and limitations under the License. 18################################################################ 19 20# ann_pfam_e.pl gets an annotation file from fasta36 -V with a line of the form: 21 22# gi|62822551|sp|P00502|GSTA1_RAT Glutathione S-transfer\n (at least from pir1.lseg) 23# 24# it must: 25# (1) read in the line 26# (2) parse it to get the up_acc 27# (3) return the tab delimited features 28# 29 30# this version only annotates sequences known to Pfam:pfamseq: 31# >pf26|164|O57809|1A1D_PYRHO 32# and only provides domain information 33 34use strict; 35 36use DBI; 37use Getopt::Long; 38use Pod::Usage; 39 40use vars qw($host $db $port $user $pass); 41 42my $hostname = `/bin/hostname`; 43 44($host, $db, $port, $user, $pass) = ("wrpxdb.its.virginia.edu", "pfam27", 0, "web_user", "fasta_www"); 45$host = 'xdb'; 46 47my ($auto_reg,$rpd2_fams, $neg_doms, $lav, $no_doms, $no_clans, $pf_acc, $no_over, $acc_comment, $shelp, $help) = 48 (0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0); 49my ($min_nodom) = (10); 50 51GetOptions( 52 "host=s" => \$host, 53 "db=s" => \$db, 54 "user=s" => \$user, 55 "password=s" => \$pass, 56 "port=i" => \$port, 57 "lav" => \$lav, 58 "acc_comment" => \$acc_comment, 59 "no-over" => \$no_over, 60 "no_over" => \$no_over, 61 "no-clans" => \$no_clans, 62 "no_clans" => \$no_clans, 63 "neg" => \$neg_doms, 64 "neg_doms" => \$neg_doms, 65 "neg-doms" => \$neg_doms, 66 "min_nodom=i" => \$min_nodom, 67 "pfacc" => \$pf_acc, 68 "RPD2" => \$rpd2_fams, 69 "auto_reg" => \$auto_reg, 70 "h|?" => \$shelp, 71 "help" => \$help, 72 ); 73 74pod2usage(1) if $shelp; 75pod2usage(exitstatus => 0, verbose => 2) if $help; 76pod2usage(1) unless @ARGV; 77 78my $connect = "dbi:mysql(AutoCommit=>1,RaiseError=>1):database=$db"; 79$connect .= ";host=$host" if $host; 80$connect .= ";port=$port" if $port; 81 82my $dbh = DBI->connect($connect, 83 $user, 84 $pass 85 ) or die $DBI::errstr; 86 87my %annot_types = (); 88my %domains = (NODOM=>0); 89my %domain_clan = (NODOM => {clan_id => 'NODOM', clan_acc=>0, domain_cnt=>0}); 90my @domain_list = (0); 91my $domain_cnt = 0; 92 93my $get_annot_sub = \&get_pfam_annots; 94 95my $get_pfam_acc = $dbh->prepare(<<EOSQL); 96 97SELECT seq_start, seq_end, auto_pfamA, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length 98FROM pfamseq 99JOIN pfamA_reg_full_significant using(auto_pfamseq) 100JOIN pfamA USING (auto_pfamA) 101WHERE in_full = 1 102AND pfamseq_acc=? 103ORDER BY seq_start 104 105EOSQL 106 107my $get_pfam_refacc = $dbh->prepare(<<EOSQL); 108 109SELECT seq_start, seq_end, auto_pfamA, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length 110FROM pfamseq 111JOIN pfamA_reg_full_significant using(auto_pfamseq) 112JOIN pfamA USING (auto_pfamA) 113JOIN seqdb_demo2.annot as sa1 on(sa1.acc=pfamseq_acc and sa1.db='sp') 114JOIN seqdb_demo2.annot as sa2 using(prot_id) 115WHERE in_full = 1 116AND sa2.acc=? 117AND sa2.db='ref' 118ORDER BY seq_start 119 120EOSQL 121 122my $get_annots_sql = $get_pfam_acc; 123 124my $get_pfam_id = $dbh->prepare(<<EOSQL); 125 126SELECT seq_start, seq_end, auto_pfamA, pfamA_acc, pfamA_id, auto_pfamA_reg_full, domain_evalue_score as evalue, length 127FROM pfamseq 128JOIN pfamA_reg_full_significant using(auto_pfamseq) 129JOIN pfamA USING (auto_pfamA) 130WHERE in_full=1 131AND pfamseq_id=? 132ORDER BY seq_start 133 134EOSQL 135 136my $get_pfam_clan = $dbh->prepare(<<EOSQL); 137 138SELECT clan_acc, clan_id 139FROM clans 140JOIN clan_membership using(auto_clan) 141WHERE auto_pfamA=? 142 143EOSQL 144 145my $get_rpd2_clans = $dbh->prepare(<<EOSQL); 146 147SELECT auto_pfamA, clan 148FROM ljm_db.RPD2_final_fams 149WHERE clan is not NULL 150 151EOSQL 152 153# -- LEFT JOIN clan_membership USING (auto_pfamA) 154# -- LEFT JOIN clans using(auto_clan) 155 156my ($tmp, $gi, $sdb, $acc, $id, $use_acc); 157 158# get the query 159my ($query, $seq_len) = @ARGV; 160$seq_len = 0 unless defined($seq_len); 161 162$query =~ s/^>//; 163 164my $ANN_F; 165 166my @annots = (); 167 168my %rpd2_clan_fams = (); 169 170if ($rpd2_fams) { 171 $get_rpd2_clans->execute(); 172 my ($auto_pfam, $auto_clan); 173 while (($auto_pfam, $auto_clan)=$get_rpd2_clans->fetchrow_array()) { 174 $rpd2_clan_fams{$auto_pfam} = $auto_clan; 175 } 176} 177 178#if it's a file I can open, read and parse it 179if ($query !~ m/\|/ && open($ANN_F, $query)) { 180 181 while (my $a_line = <$ANN_F>) { 182 $a_line =~ s/^>//; 183 chomp $a_line; 184 push @annots, show_annots($a_line, $get_annot_sub); 185 } 186} 187else { 188 push @annots, show_annots("$query $seq_len", $get_annot_sub); 189} 190 191for my $seq_annot (@annots) { 192 print ">",$seq_annot->{seq_info},"\n"; 193 for my $annot (@{$seq_annot->{list}}) { 194 if (!$lav && defined($domains{$annot->[-1]})) { 195 my ($a_name, $a_num) = ($annot->[-1],$domains{$annot->[-1]}); 196 if ($acc_comment) { 197 $annot->[-1] .= "{$domain_list[$a_num]}"; 198 } 199 $annot->[-1] .= " :".$a_num; 200 } 201 print join("\t",@$annot),"\n"; 202 } 203} 204 205exit(0); 206 207sub show_annots { 208 my ($query_len, $get_annot_sub) = @_; 209 210 my ($annot_line, $seq_len) = split(/\s+/,$query_len); 211 212 my $pfamA_acc; 213 214 my %annot_data = (seq_info=>$annot_line); 215 216 $use_acc = 1; 217 $get_annots_sql = $get_pfam_acc; 218 219 if ($annot_line =~ m/^pf26\|/) { 220 ($sdb, $gi, $acc, $id) = split(/\|/,$annot_line); 221 $dbh->do("use RPD2_pfam"); 222 } 223 elsif ($annot_line =~ m/^gi\|/) { 224 ($tmp, $gi, $sdb, $acc, $id) = split(/\|/,$annot_line); 225 if ($sdb =~ m/ref/) { 226 $get_annots_sql = $get_pfam_refacc; 227 } 228 } 229 elsif ($annot_line =~ m/^sp\|/) { 230 ($sdb, $acc, $id) = split(/\|/,$annot_line); 231 } 232 elsif ($annot_line =~ m/^ref\|/) { 233 ($sdb, $acc) = split(/\|/,$annot_line); 234 $get_annots_sql = $get_pfam_refacc; 235 } 236 elsif ($annot_line =~ m/^tr\|/) { 237 ($sdb, $acc, $id) = split(/\|/,$annot_line); 238 } 239 elsif ($annot_line =~ m/^SP:/i) { 240 ($sdb, $id) = split(/:/,$annot_line); 241 $use_acc = 0; 242 } 243 244 # remove version number 245 unless ($use_acc) { 246 $get_annots_sql = $get_pfam_id; 247 $get_annots_sql->execute($id); 248 } 249 else { 250 $acc =~ s/\.\d+$//; 251 $get_annots_sql->execute($acc); 252 } 253 254 $annot_data{list} = $get_annot_sub->($get_annots_sql, $seq_len); 255 256 return \%annot_data; 257} 258 259sub get_pfam_annots { 260 my ($get_annots, $seq_length) = @_; 261 262 $seq_length = 0 unless $seq_length; 263 264 my @pf_domains = (); 265 266 # get the list of domains, sorted by start 267 while ( my $row_href = $get_annots->fetchrow_hashref()) { 268 if ($auto_reg) { 269 $row_href->{info} = $row_href->{auto_pfamA_reg_full}; 270 } 271 elsif ($pf_acc) { 272 $row_href->{info} = $row_href->{pfamA_acc}; 273 } 274 else { 275 $row_href->{info} = $row_href->{pfamA_id}; 276 } 277 278 if ($row_href && $row_href->{length} > $seq_length && $seq_length == 0) { $seq_length = $row_href->{length};} 279 280 next if ($row_href->{seq_start} >= $seq_length); 281 if ($row_href->{seq_end} > $seq_length) { 282 $row_href->{seq_end} = $seq_length; 283 } 284 285 push @pf_domains, $row_href 286 } 287 288 # check for domain overlap, and resolve check for domain overlap 289 # (possibly more than 2 domains), choosing the domain with the best 290 # evalue 291 292 if($no_over && scalar(@pf_domains) > 1) { 293 294 my @tmp_domains = @pf_domains; 295 my @save_domains = (); 296 297 my $prev_dom = shift @tmp_domains; 298 299 while (my $curr_dom = shift @tmp_domains) { 300 301 my @overlap_domains = ($prev_dom); 302 303 my $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start}; 304 # check for overlap > domain_length/3 305 306 my ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1); 307 my $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) || 308 (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end}))); 309 310 my $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len; 311 312 while ($inclusion || ($diff > 0 && $diff > $longer_len/3)) { 313 push @overlap_domains, $curr_dom; 314 $curr_dom = shift @tmp_domains; 315 last unless $curr_dom; 316 $diff = $prev_dom->{seq_end} - $curr_dom->{seq_start}; 317 ($prev_len, $cur_len) = ($prev_dom->{seq_end}-$prev_dom->{seq_start}+1, $curr_dom->{seq_end}-$curr_dom->{seq_start}+1); 318 $longer_len = ($prev_len > $cur_len) ? $prev_len : $cur_len; 319 $inclusion = ((($curr_dom->{seq_start} >= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} <= $prev_dom->{seq_end})) || 320 (($curr_dom->{seq_start} <= $prev_dom->{seq_start}) && ($curr_dom->{seq_end} >= $prev_dom->{seq_end}))); 321 } 322 323 # check for overlapping domains; >1 because $prev_dom is always there 324 if (scalar(@overlap_domains) > 1 ) { 325 # if $rpd2_fams, check for a chosen one 326 if ($rpd2_fams) { 327 for my $dom (@overlap_domains) { 328 if ($rpd2_clan_fams{$dom->{auto_pfamA}}) { 329 $prev_dom = $dom; 330 last; 331 } 332 } 333 } 334 else { 335 @overlap_domains = sort { $a->{evalue} <=> $b->{evalue} } @overlap_domains; 336 $prev_dom = $overlap_domains[0]; 337 } 338 } 339 340 # $prev_dom should be the best of the overlaps, and we are no longer overlapping > dom_length/3 341 push @save_domains, $prev_dom; 342 $prev_dom = $curr_dom; 343 } 344 if ($prev_dom) {push @save_domains, $prev_dom;} 345 346 @pf_domains = @save_domains; 347 348 # now check for smaller overlaps 349 for (my $i=1; $i < scalar(@pf_domains); $i++) { 350 if ($pf_domains[$i-1]->{seq_end} >= $pf_domains[$i]->{seq_start}) { 351 my $overlap = $pf_domains[$i-1]->{seq_end} - $pf_domains[$i]->{seq_start}; 352 $pf_domains[$i-1]->{seq_end} -= int($overlap/2); 353 $pf_domains[$i]->{seq_start} = $pf_domains[$i-1]->{seq_end}+1; 354 } 355 } 356 } 357 358 if ($neg_doms) { 359 my @npf_domains; 360 my $prev_dom={seq_end=>0}; 361 for my $curr_dom ( @pf_domains) { 362 if ($curr_dom->{seq_start} - $prev_dom->{seq_end} > $min_nodom) { 363 my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end => $curr_dom->{seq_start}-1, info=>'NODOM'); 364 push @npf_domains, \%new_dom; 365 } 366 push @npf_domains, $curr_dom; 367 $prev_dom = $curr_dom; 368 } 369 if ($seq_length - $prev_dom->{seq_end} > $min_nodom) { 370 my %new_dom = (seq_start=>$prev_dom->{seq_end}+1, seq_end=>$seq_length, info=>'NODOM'); 371 if ($new_dom{seq_end} > $new_dom{seq_start}) {push @npf_domains, \%new_dom;} 372 } 373 374 # @npf_domains has both old @pf_domains and new neg-domains 375 @pf_domains = @npf_domains; 376 } 377 378 # now make sure we have useful names: colors 379 380 for my $pf (@pf_domains) { 381 $pf->{info} = domain_name($pf->{info}, $pf->{auto_pfamA}, $pf->{pfamA_acc}); 382 } 383 384 my @feats = (); 385 for my $d_ref (@pf_domains) { 386 if ($lav) { 387 push @feats, [$d_ref->{seq_start}, $d_ref->{seq_end}, $d_ref->{info}]; 388 } 389 else { 390 push @feats, [$d_ref->{seq_start}, '-', $d_ref->{seq_end}, $d_ref->{info} ]; 391# push @feats, [$d_ref->{seq_end}, ']', '-', ""]; 392 } 393 394 } 395 396 return \@feats; 397} 398 399# domain name takes a uniprot domain label, removes comments ( ; 400# truncated) and numbers and returns a canonical form. Thus: 401# Cortactin 6. 402# Cortactin 7; truncated. 403# becomes "Cortactin" 404# 405 406sub domain_name { 407 408 my ($value, $auto_pfamA, $pfamA_acc) = @_; 409 410 # check for clan: 411 if ($no_clans) { 412 if (! defined($domains{$value})) { 413 $domain_clan{$value} = 0; 414 $domains{$value} = ++$domain_cnt; 415 push @domain_list, $pfamA_acc; 416 } 417 } 418 elsif (!defined($domain_clan{$value})) { 419 ## only do this for new domains, old domains have known mappings 420 421 ## ways to highlight the same domain: 422 # (1) for clans, substitute clan name for family name 423 # (2) for clans, use the same color for the same clan, but don't change the name 424 # (3) for clans, combine family name with clan name, but use colors based on clan 425 426 # check to see if it's a clan 427 $get_pfam_clan->execute($auto_pfamA); 428 429 my $pfam_clan_href=0; 430 431 if ($pfam_clan_href=$get_pfam_clan->fetchrow_hashref()) { # is a clan 432 my ($clan_id, $clan_acc) = @{$pfam_clan_href}{qw(clan_id clan_acc)}; 433 434 # now check to see if we have seen this clan before (if so, do not increment $domain_cnt) 435 my $c_value = "C." . $clan_id; 436 if ($pf_acc) {$c_value = "C." . $clan_acc;} 437 438 $domain_clan{$value} = {clan_id => $clan_id, 439 clan_acc => $clan_acc}; 440 441 if ($domains{$c_value}) { 442 $domain_clan{$value}->{domain_cnt} = $domains{$c_value}; 443 $value = $c_value; 444 } 445 else { 446 $domain_clan{$value}->{domain_cnt} = ++ $domain_cnt; 447 $value = $c_value; 448 $domains{$value} = $domain_cnt; 449 push @domain_list, $pfamA_acc; 450 } 451 } 452 else { # not a clan 453 $domain_clan{$value} = 0; 454 $domains{$value} = ++$domain_cnt; 455 push @domain_list, $pfamA_acc; 456 } 457 } 458 elsif ($domain_clan{$value} && $domain_clan{$value}->{clan_acc}) { 459 if ($pf_acc) {$value = "C." . $domain_clan{$value}->{clan_acc};} 460 else { $value = "C." . $domain_clan{$value}->{clan_id}; } 461 } 462 463 return $value; 464} 465 466__END__ 467 468=pod 469 470=head1 NAME 471 472ann_feats.pl 473 474=head1 SYNOPSIS 475 476 ann_pfam_e.pl --neg-doms 'sp|P09488|GSTM1_NUMAN' | accession.file 477 478=head1 OPTIONS 479 480 -h short help 481 --help include description 482 --no-over : generate non-overlapping domains (equivalent to ann_pfam.pl) 483 --no-clans : do not use clans with multiple families from same clan 484 --neg-doms : report domains between annotated domains as NODOM 485 (also --neg, --neg_doms) 486 --min_nodom=10 : minimum length between domains for NODOM 487 488 --host, --user, --password, --port --db : info for mysql database 489 490=head1 DESCRIPTION 491 492C<ann_pfam_e.pl> extracts domain information from the pfam msyql 493database. Currently, the program works with database sequence 494descriptions in one of two formats: 495 496 Currently, the program works with database 497sequence descriptions in several formats: 498 499 >gi|1705556|sp|P54670.1|CAF1_DICDI 500 >sp|P09488|GSTM1_HUMAN 501 >sp:CALM_HUMAN 502 503C<ann_pfam_e.pl> uses the C<pfamA_reg_full_significant>, C<pfamseq>, 504and C<pfamA> tables of the C<pfam> database to extract domain 505information on a protein. 506 507If the "--no-over" option is set, overlapping domains are selected and 508edited to remove overlaps. For proteins with multiple overlapping 509domains (domains overlap by more than 1/3 of the domain length), 510C<auto_pfam_e.pl> selects the domain annotation with the best 511C<domain_evalue_score>. When domains overlap by less than 1/3 of the 512domain length, they are shortened to remove the overlap. 513 514C<ann_pfam_e.pl> is designed to be used by the B<FASTA> programs with 515the C<-V \!ann_pfam_e.pl> or C<-V "\!ann_pfam_e.pl --neg"> option. 516 517=head1 AUTHOR 518 519William R. Pearson, wrp@virginia.edu 520 521=cut 522