1#!/usr/bin/env perl 2 3package main; 4our $DEBUG; 5 6package Gene_obj; 7use strict; 8use Nuc_translator; 9#use Gene_ontology; 10use Longest_orf; 11use Storable qw (store retrieve freeze thaw dclone); 12use warnings; 13use Data::Dumper; 14use Carp qw (croak cluck confess); 15use URI::Escape; 16 17=head1 NAME 18 19package Gene_obj 20 21=cut 22 23 24 25=head1 DESCRIPTION 26 27 Gene_obj(s) encapsulate the elements of both gene structure and gene function. The gene structure is stored in a hierarchical fashion as follows: 28 29 Gene ========================================================= 30 31 Exon ========= ========= ========= ======== 32 33 CDS ====== ========= ====== 34 35 36 where a Gene is a container for Exon(s), and each Exon is a container for a CDS, and an Exon can contain a single CDS component. An Exon lacking a CDS exon is an untranslated exon or UTR exon. The region of an Exon which extends beyond the CDS is also considered a UTR. 37 38 39 There are several ways to instantiate gene objects. A simple example is described: 40 41 Exon and CDS component coordinates can be assigned as hashes. 42 43 ie. 44 45 my %mrna = ( 100 => 200, 46 300 => 500 ); 47 48 my %CDS = ( 150=>200, 49 300=>450); 50 51 my $sequence = "GACTACATTTAATAGGGCCC"; #string representing the genomic sequence 52 my $gene = new Gene_obj(); 53 54 $gene->{com_name} = "hypothetical protein"; 55 56 $gene->populate_gene_obj(\%CDS, \%mRNA, \$sequence); 57 print $gene->toString(); 58 59 60 61 Alternatively, the individual components of genes (Exons and CDSs) can be instantiated separately and used to build the Gene from the ground up (See packages mRNA_exon_obj and CDS_exon_obj following this Gene_obj documentation). 62 63 my $cds_exon = new CDS_exon_obj (150, 200); 64 65 my $mRNA_exon = new mRNA_exon_obj (100, 200); 66 67 $mRNA_exon->set_CDS_exon_obj($cds_exon); 68 69 my $gene_obj = new Gene_obj (); 70 71 $gene_obj->{gene_name} = "hypothetical gene"; 72 $gene_obj->{com_name} = "hypothetical protein"; 73 74 $gene_obj->add_mRNA_exon_obj($mRNA_exon); 75 76 $gene_obj->refine_gene_object(); 77 78 $gene_obj->create_all_sequence_types (\$sequence); #ref to genomic sequence string. 79 80 print $gene_obj->toString(); 81 82 83 The API below describes useful functions for navigating and manipulating the Gene object along with all of its attributes. 84 85 86 87=cut 88 89 90 91 92 93 94=over 4 95 96=item new() 97 98B<Description:> Constructor for Gene_obj 99 100B<Parameters:> none 101 102B<Returns:> $gene_obj 103 104 105The Gene_obj contains several attributes which can be manipulated directly (or by get/set methods if they exist). These attributes include: 106 107 asmbl_id # identifier for the genomic contig for which this gene is anchored. 108 TU_feat_name #feat_names are TIGR temporary identifiers. 109 Model_feat_name # temp TIGR identifier for gene models 110 locus #identifier for a gene (TU) ie. T2P3.5 111 pub_locus #another identifier for a gene (TU) ie. At2g00010 112 model_pub_locus #identifier for a gene model (model) ie. At2g00010.1 113 model_locus #analagous to locus, but for model rather than gene (TU) 114 alt_locus #alternative locus 115 gene_name # name for gene 116 com_name # name for gene product 117 comment #internal comment 118 pub_comment #comment related to gene 119 ec_num # enzyme commission number 120 gene_sym # gene symbol 121 is_5prime_partial # 0|1 missing start codon. 122 is_3prime_partial # 0|1 missing stop codon. 123 is_pseudogene # 0|1 124 curated_com_name # 0|1 125 curated_gene_structure # 0|1 126 127 ## Other attributes set internally Access-only, do not set directly. 128 129 gene_length # length of gene span (int). 130 mid_pt # holds midpoint of gene-span 131 strand # [+-] 132 protein_seq # holds protein sequence 133 protein_seq_length 134 CDS_sequence #holds CDS sequence (translated to protein); based on CDS_exon coordinates 135 CDS_seq_length 136 cDNA_sequence #holds cDNA sequence; based on mRNA exon coordinates. 137 cDNA_seq_length 138 gene_sequence #holds unspliced transcript 139 gene_sequence_length #length of unspliced transcript 140 gene_type # "protein-coding", #default type for gene object. Could be changed to "rRNA|snoRNA|snRNA|tRNA" to accommodate other gene or feature types. 141 num_additional_isoforms # int 142 143 144=back 145 146=cut 147 148 149 150sub new { 151 shift; 152 my $self = { asmbl_id => 0, #genomic contig ID 153 locus => undef, #text 154 pub_locus => undef, #text ie. At2g00010 155 model_pub_locus =>undef, #text ie. At2g00010.1 156 model_locus => undef, #text ie. F12G15.1 157 alt_locus => undef, #text 158 gene_name => undef, #text 159 com_name => undef, #text 160 comment => undef, 161 curated_com_name => 0, 162 curated_gene_structure => 0, 163 pub_comment => undef, #text 164 ec_num => undef, #text (enzyme commission number) 165 gene_type => "protein-coding", #default type for gene object. Could be changed to "rRNA|snoRNA|snRNA|tRNA" to accomodate other gene or feature types. 166 gene_sym => undef, #text (gene symbol) 167 mRNA_coords => 0, #assigned to anonymous hash of end5->end3 relative to the parent sequence 168 CDS_coords => 0, #assigned to anonymous hash of end5->end3 relative to the parent sequence 169 mRNA_exon_objs => 0, # holds arrayref to mRNA_obj, retrieve only thru method: get_exons() 170 num_exons => 0, # number of exons in this gene_obj 171 model_span => [], # holds array ref to (end5,end3) for CDS range of gene. 172 gene_span => [], # holds array ref to (end5,end3) for mRNA range of gene. 173 gene_length => 0, # length of gene span (int). 174 mid_pt => 0, # holds midpoint of gene-span 175 strand => 0, # [+-] 176 gi => undef, #text 177 prot_acc => undef, #text 178 is_pseudogene => 0, # toggle indicating pseudogene if 1. 179 is_5prime_partial => 0, #boolean indicating missing 5' part of gene. 180 is_3prime_partial => 0, #boolean 181 protein_seq => undef, # holds protein sequence 182 protein_seq_length => 0, 183 CDS_sequence => undef, #holds CDS sequence (translated to protein); based on CDS_exon coordinates 184 CDS_seq_length => 0, 185 cDNA_sequence => undef, #holds cDNA sequence; based on mRNA exon coordinates. 186 cDNA_seq_length => 0, 187 gene_sequence => undef, #holds unspliced transcript 188 gene_sequence_length => 0, #length of unspliced transcript 189 TU_feat_name => undef, #feat_names are TIGR temporary identifiers. 190 Model_feat_name =>undef, 191 classification => 'annotated_genes', #type of seq_element. 192 gene_synonyms => [], #list of synonymous model feat_names 193 GeneOntology=>[], #list of Gene_ontology assignment objects. ...see GeneOntology.pm 194 195 ## Additional functional attributes: 196 secondary_gene_names => [], 197 secondary_product_names => [], 198 secondary_gene_symbols => [], 199 secondary_ec_numbers =>[], 200 201 202 ## Alternative splicing support. 203 num_additional_isoforms => 0, # number of additional isoforms stored in additonal_isoform list below 204 additional_isoforms => [] # stores list of Gene_objs corresponding to the additional isoforms. 205 206 }; 207 bless($self); 208 return ($self); 209} 210 211 212 213 214=over 4 215 216=item erase_gene_structure() 217 218B<Description:> Removes the structural components of a gene (ie. exons, CDSs, coordinate spans, any corresponding sequences) 219 220B<Parameters:> none 221 222B<Returns:> none 223 224=back 225 226=cut 227 228 229## erase gene structure 230sub erase_gene_structure { 231 my $self = shift; 232 $self->{mRNA_exon_objs} = 0; 233 $self->{num_exons} = 0; 234 $self->{model_span} = []; 235 $self->{gene_span} = []; 236 $self->{gene_length} = 0; 237 $self->{strand} = 0; 238 $self->{protein_seq} = 0; 239 $self->{CDS_sequence} = 0; 240 $self->{CDS_seq_length} = 0; 241 $self->{cDNA_sequence} = 0; 242 $self->{cDNA_seq_length} = 0; 243} 244 245 246=over 4 247 248=item clone_gene() 249 250B<Description:> Clones this Gene_obj by copying attributes from this Gene to a new gene. Does NOT do a deep clone for all attributes. See dclone() for a more rigorous cloning method. This method is safer because all references are not cloned, only the critical ones. 251 252B<Parameters:> none 253 254B<Returns:> new Gene_obj 255 256=back 257 258=cut 259 260 261 262## all objects are cloned. References to data only are not. 263sub clone_gene { 264 my $self = shift; 265 my $clone = new Gene_obj(); 266 267 268 ## Copy over the non-ref attribute values. 269 foreach my $key (keys %$self) { 270 my $value = $self->{$key}; 271 if (defined $value) { 272 ## Not copying over refs. 273 if (ref $value) { 274 next; 275 } 276 277 ## Not copying over attributes of length > 200, such as protein/nucleotide sequences 278 my $length = length($value); 279 if ($length > 200) { next;} 280 } 281 282 # passed tests above, copying attribute. 283 $clone->{$key} = $value; 284 285 } 286 287 ## copy over the gene synonyms. 288 my @gene_syns = @{$self->{gene_synonyms}}; 289 $clone->{gene_synonyms} = \@gene_syns; 290 291 292 ## copy the GO assignments: 293 my @GO_assignments = $self->get_gene_ontology_objs(); 294 if (@GO_assignments) { 295 foreach my $go_assignment (@GO_assignments) { 296 my $go_clone = dclone($go_assignment); 297 $clone->add_gene_ontology_objs($go_clone); 298 } 299 } 300 301 302 ## copy gene structure. 303 my @exons = $self->get_exons(); 304 foreach my $exon (@exons) { 305 $clone->add_mRNA_exon_obj($exon->clone_exon()); 306 } 307 308 foreach my $isoform ($self->get_additional_isoforms()) { 309 my $isoform_clone = $isoform->clone_gene(); 310 $clone->add_isoform($isoform_clone); 311 } 312 313 $clone->refine_gene_object(); 314 315 return ($clone); 316} 317 318 319 320 321=over 4 322 323=item deep_clone() 324 325B<Description:> Provides a deep clone of a gene_obj. Only references supported in Gene_obj documentation are supported. Those added in a rogue way are undef()d 326 327B<Parameters:> none 328 329B<Returns:> $gene_obj 330 331uses the Storable dclone() function to deep clone the Gene_obj 332 333=back 334 335=cut 336 337 338 ; 339## all objects are cloned. References to data only are not. 340sub deep_clone { 341 my $self = shift; 342 my $clone = dclone($self); 343 344 my %supported_refs = (model_span => 1, 345 gene_span => 1, 346 gene_synonyms => 1, 347 Gene_ontology => 1, 348 additional_isoforms=>1, 349 mRNA_exon_objs => 1); 350 351 foreach my $gene_obj ($clone, $clone->get_additional_isoforms()) { 352 353 my @keys = keys %$gene_obj; 354 foreach my $key (@keys) { 355 my $value = $gene_obj->{$key}; 356 if (ref $value && !$supported_refs{$key}) { 357 $gene_obj->{$key} = undef; 358 } 359 } 360 } 361 362 return ($clone); 363} 364 365 366=over 4 367 368=item populate_gene_obj() 369 370B<Description:> Given CDS and mRNA coordinates stored in hash form, a gene object is populated with mRNA and CDS exons. This is one available way to populate a newly instantiated Gene_obj. 371 372B<Parameters:> $cds_hash_ref, $mRNA_hash_ref, <$seq_ref> 373 374$mRNA_hash_ref is a reference to a hash holding the end5 => end3 coordinates of the Exons 375 376$cds_hash_ref same as mRNA_has_ref except holds the CDS end5 => end3 coordinates. 377 378$seq_ref is a reference to a string containing the genomic sequence. This is an optional parameter. 379 380 381B<Returns:> none 382 383=back 384 385=cut 386 387 ; 388 389## Do several things at once: assign CDS and mRNA coordinates, and build gene sequences. 390## The \$seq_ref is optional in case you want to create the sequence types. 391sub populate_gene_obj { 392 my ($self, $cds_ref, $mRNA_ref, $seq_ref) = @_; 393 $self->set_CDS_coords ($cds_ref); 394 $self->set_mRNA_coords ($mRNA_ref); 395 $self->refine_gene_object(); 396 if (ref $seq_ref) { 397 $self->create_all_sequence_types($seq_ref); 398 } 399 ## reinitialize the hashrefs: 400 $self->{mRNA_coords} = 0; 401 $self->{CDS_coords} = 0; 402 403 404} 405 406 407# alias above 408sub populate_gene_object { 409 my $self = shift; 410 $self->populate_gene_obj(@_); 411} 412 413 414#### 415sub populate_gene_object_via_CDS_coords { 416 my $self = shift; 417 my @coordsets = @_; 418 419 foreach my $coordset (@coordsets) { 420 my ($end5, $end3) = @$coordset; 421 my $mrna_exon_obj = mRNA_exon_obj->new($end5, $end3); 422 my $cds_obj = CDS_exon_obj->new($end5, $end3); 423 $mrna_exon_obj->{CDS_exon_obj} = $cds_obj; 424 $self->add_mRNA_exon_obj($mrna_exon_obj); 425 } 426 427 $self->refine_gene_object(); 428 return; 429} 430 431 432sub build_gene_obj_exons_n_cds_range { 433 my $self = shift; 434 my ($exons_aref, $cds_lend, $cds_rend, $orient) = @_; 435 436 my @exon_coords; 437 foreach my $exon_aref (@$exons_aref) { 438 my ($exon_lend, $exon_rend) = sort {$a<=>$b} @$exon_aref; 439 push (@exon_coords, [$exon_lend, $exon_rend] ); 440 } 441 @exon_coords = sort {$a->[0]<=>$b->[0]} @exon_coords; 442 443 444 unless ($orient =~ /^[\+\-]$/) { 445 confess "Error, orient not [+-] "; 446 } 447 448 ## build the CDS coordinates. 449 450 my @cds_range; 451 452 if ($cds_lend > 0 && $cds_rend > 0) { 453 454 ($cds_lend, $cds_rend) = sort {$a<=>$b} ($cds_lend, $cds_rend); 455 456 foreach my $exon_coords_aref (@exon_coords) { 457 my ($exon_lend, $exon_rend) = @$exon_coords_aref; 458 459 if ($exon_rend >= $cds_lend && $exon_lend <= $cds_rend) { 460 461 ## got overlap 462 my $cds_exon_lend = ($cds_lend < $exon_lend) ? $exon_lend : $cds_lend; 463 464 my $cds_exon_rend = ($cds_rend > $exon_rend) ? $exon_rend : $cds_rend; 465 466 push (@cds_range, [$cds_exon_lend, $cds_exon_rend]); 467 } 468 } 469 470 unless (@cds_range) { 471 confess "Error, no CDS exon coords built based on exon overlap"; 472 } 473 } 474 ## all coordinate sets are ordered left to right. 475 # build the coordinates href 476 477 my %exon_coords; 478 my %cds_coords; 479 foreach my $exon_coords_aref (@exon_coords) { 480 my ($exon_lend, $exon_rend) = @$exon_coords_aref; 481 my ($exon_end5, $exon_end3) = ($orient eq '+') ? ($exon_lend, $exon_rend) : ($exon_rend, $exon_lend); 482 $exon_coords{$exon_end5} = $exon_end3; 483 } 484 foreach my $cds_coords_aref (@cds_range) { 485 my ($cds_lend, $cds_rend) = @$cds_coords_aref; 486 my ($cds_end5, $cds_end3) = ($orient eq '+') ? ($cds_lend, $cds_rend) : ($cds_rend, $cds_lend); 487 $cds_coords{$cds_end5} = $cds_end3; 488 } 489 490 # print Dumper (\%cds_coords) . Dumper (\%exon_coords); 491 492 $self->populate_gene_obj(\%cds_coords, \%exon_coords); 493 494 return ($self); 495} 496 497 498#### 499sub join_adjacent_exons { 500 my $self = shift; 501 502 my @exons = $self->get_exons(); 503 504 my $strand = $self->get_orientation(); 505 506 my $first_exon = shift @exons; 507 my @new_exons = ($first_exon); 508 509 while (@exons) { 510 my $prev_exon = $new_exons[$#new_exons]; 511 my ($prev_end5, $prev_end3) = $prev_exon->get_coords(); 512 513 my $next_exon = shift @exons; 514 my ($next_end5, $next_end3) = $next_exon->get_coords(); 515 516 if ( ($strand eq '+' && $prev_end3 == $next_end5 - 1) # adjacent 517 || 518 ($strand eq '-' && $prev_end3 == $next_end5 + 1) ) { 519 520 $prev_exon->merge_exon($next_exon); 521 } 522 else { 523 push (@new_exons, $next_exon); 524 } 525 } 526 527 $self->{mRNA_exon_objs} = [@new_exons]; 528 529 $self->refine_gene_object(); 530 531 return; 532} 533 534 535 536 537=over 4 538 539=item AAToNucleotideCoords() 540 541B<Description:> Converts an amino acid -based coordinate to a genomic sequence -based coordinate. 542 543B<Parameters:> $aa_coord 544 545B<Returns:> $genomic_coord 546 547undef is returned if the aa_coord could not be converted. 548 549 550=back 551 552=cut 553 554 ; 555 556sub AAToNucleotideCoords{ 557 my($self) = shift; 558 my($aacoord) = shift; 559 my($debug) = shift; 560 my($PCDS_coords) = {}; 561 my($A2NMapping) = {}; 562 my($currAA) = 1; 563 my $strand = $self->{strand}; 564 my @exons = $self->get_exons(); 565 my($cds_count)=0; 566 my($translated_bp)=-1; 567 my($lastcarryover)=0; 568 my($end_bp); 569 foreach my $exon (sort { 570 if($strand eq "+"){ 571 $a->{end5}<=>$b->{end5}; 572 } 573 else{ 574 $b->{end5}<=>$a->{end5}; 575 } 576 } @exons) { 577 my $cds = $exon->get_CDS_obj(); 578 if ($cds) { 579 my @cds_coords = $cds->get_CDS_end5_end3(); 580 my($bpspread) = abs($cds_coords[0]-$cds_coords[1]); 581 $bpspread+=$lastcarryover; 582 my($nextAA) = int($bpspread/3); # last complete AA in CDS 583 $lastcarryover = $bpspread%3; 584 $PCDS_coords->{$currAA} = $currAA+$nextAA-1; 585 if($strand eq "+"){ 586 $A2NMapping->{$currAA} = $cds_coords[0]<$cds_coords[1]?$cds_coords[0]:$cds_coords[1]; 587 } 588 else{ 589 $A2NMapping->{$currAA} = $cds_coords[0]<$cds_coords[1]?$cds_coords[1]:$cds_coords[0]; 590 } 591 print "DEBUG: $strand $cds_count AA range ($currAA - $PCDS_coords->{$currAA}) nucleotide start($A2NMapping->{$currAA})\n" if($debug); 592 $currAA = $currAA+$nextAA; 593 $cds_count++; 594 if($strand eq "+"){ 595 $end_bp = $cds_coords[0]<$cds_coords[1]?$cds_coords[1]:$cds_coords[0]; 596 } 597 else{ 598 $end_bp = $cds_coords[0]<$cds_coords[1]?$cds_coords[0]:$cds_coords[1]; 599 } 600 } 601 } 602 # PCDS_coords key/value are start/stop aa counts for each cds; 603 # A2NMapping stores cds AA start key to cds nucleotide start 604 $cds_count=0; 605 foreach my $PCDS_end5 (sort { 606 $a<=>$b; 607 }(keys %$PCDS_coords)) { 608 my($PCDS_end3) = $PCDS_coords->{$PCDS_end5}; 609 if($aacoord>=$PCDS_end5 && $aacoord<=$PCDS_end3){ 610 my($nucleotide_start) = $A2NMapping->{$PCDS_end5}; 611 my($aa_offset) = $aacoord - $PCDS_end5; 612 my($nucleotide_offset) = $aa_offset*3; 613 print "DEBUG: CDS offset $aa_offset AA $nucleotide_offset bp\n" if($debug); 614 if($strand eq "+"){ 615 $translated_bp = $nucleotide_start+$nucleotide_offset; 616 } 617 else{ 618 $translated_bp = $nucleotide_start-$nucleotide_offset; 619 } 620 print "DEBUG: Mapping $aacoord to $translated_bp in cds $cds_count\n" if($debug); 621 print "DEBUG: CDS $PCDS_end5 - $PCDS_end3 nucleotide start $A2NMapping->{$PCDS_end5}, nuc offset $nucleotide_offset\n" if($debug); 622 } 623 624 $cds_count++; 625 } 626 #} 627 if($translated_bp == -1){ 628 $translated_bp = undef; 629 print STDERR "Unable to translate AA coordinate: $aacoord. Off end. Using undef\n" if($debug); 630 } 631 return $translated_bp; 632} 633 634 635 636## private method, used by populate_gene_obj() 637# sets CDS_coords instance member to a hash reference of CDS coordinates. $hash{end5} = end3 638sub set_CDS_coords { 639 my $self = shift; 640 my $hash_ref = shift; 641 if (ref ($hash_ref) eq 'HASH') { 642 $self->{CDS_coords} = $hash_ref; 643 } else { 644 print STDERR "Cannot set CDS_coords, must have hash reference\n"; 645 } 646} 647 648 649 650 651=over 4 652 653=item get_gene_span() 654 655B<Description:> Retrieves the coordinates which span the length of the gene along the genomic sequence. 656 657B<Parameters:> none 658 659B<Returns:> (end5, end3) 660 661These coordinates represent the minimal and maximal exonic coordinates of the gene. Orientation can be inferred by the relative values of end5 and end3. 662 663 664=back 665 666=cut 667 668 ; 669 670## All return gene end5, end3 ### 671sub get_gene_span { 672 my $self = shift; 673 return (@{$self->{gene_span}}); 674} 675 676 677 678 679## private 680sub get_seq_span { 681 my $self = shift; 682 return ($self->get_gene_span()); 683} 684 685 686 687=over 4 688 689=item get_coords() 690 691B<Description:> See get_gene_span() 692 693B<Parameters:> none 694 695B<Returns:> (end5, end3) 696 697=back 698 699=cut 700 701 702sub get_coords { 703 my $self = shift; 704 return ($self->get_gene_span()); 705} 706 707 708 709=over 4 710 711=item get_model_span() 712 713B<Description:> Retrieves the coordinates spanned by the protein-coding region of the gene along the genomic sequence. 714 715B<Parameters:> none 716 717B<Returns:> (end5, end3) 718 719These coordinates are determined by the min and max of the CDS components of the gene. 720 721=back 722 723=cut 724 725 726 727 728sub get_model_span { 729 my $self = shift; 730 return (@{$self->{model_span}}); 731} 732 733 734sub get_CDS_span { # preferred 735 my $self = shift; 736 return($self->get_model_span()); 737} 738 739 740=over 4 741 742=item get_transcript_span() 743 744B<Description:> Retrieves the coordinates spanned by the exonic regions of the gene along the genomic sequence. 745 746B<Parameters:> none 747 748B<Returns:> (lend, rend) 749 750These coordinates are determined by the min and max of the CDS components of the gene. 751 752=back 753 754=cut 755 756 757sub get_transcript_span { 758 my $self = shift; 759 760 my @coords; 761 my @exons = $self->get_exons(); 762 foreach my $exon (@exons) { 763 push (@coords, $exon->get_coords()); 764 } 765 @coords = sort {$a<=>$b} @coords; 766 767 my $lend = shift @coords; 768 my $rend = pop @coords; 769 770 return($lend, $rend); 771} 772 773 774sub is_pseudogene { 775 my $self = shift; 776 return ($self->{is_pseudogene}); 777} 778 779sub set_pseudogene { 780 my $self = shift; 781 my $pseudogene_val = shift; 782 unless ($pseudogene_val =~ /[01]/) { 783 confess "Error, can set pseudogene to zero or one only.\n"; 784 } 785 786 foreach my $gene ($self, $self->get_additional_isoforms()) { 787 $gene->{is_pseudogene} = $pseudogene_val; 788 } 789 790 return; 791} 792 793 794 795#private 796# sets mRNA_coords instance member to a hash reference of CDS coordinates. $hash{end5} = end3 797sub set_mRNA_coords { 798 my $self = shift; 799 my $hash_ref = shift; 800 if (ref ($hash_ref) eq 'HASH') { 801 $self->{mRNA_coords} = $hash_ref; 802 } else { 803 print STDERR "Cannot set CDS_coords, must have hash reference\n"; 804 } 805} 806 807 808=over 4 809 810=item refine_gene_object() 811 812B<Description:> This method performs some data management operations and should be called at any time modifications have been made to the gene structure (ie. exons added or modified, model isoforms added, etc). It performs the following orientations: 813 814 -Sets (or resets) gene span and model span coordinates, strand orientation, gene length, mid-point. 815 816B<Parameters:> none 817 818B<Returns:> none 819 820=back 821 822=cut 823 824## Once mRNA_coords and CDS_coords have been assigned, this will populate the remaining elements in the gene object. 825 826sub refine_gene_object { 827 my ($self) = shift; 828 #check to see if mRNA_coords field is populated. If not, initialize. 829 if ($self->{mRNA_coords} == 0) { 830 $self->{mRNA_coords} = {}; 831 } 832 my ($CDS_coords, $mRNA_coords) = ($self->{CDS_coords}, $self->{mRNA_coords}); 833 834 unless ($CDS_coords && $mRNA_coords) { 835 #maybe created exon objects already 836 if ($self->{mRNA_exon_objs}) { 837 $self->trivial_refinement(); 838 } 839 return; 840 } 841 # intialize mRNA_exon_objs to array ref. 842 $self->{mRNA_exon_objs} = []; 843 #retrieve coordinate data. 844 my %mRNA = %$mRNA_coords; 845 my %CDS = %$CDS_coords; 846 my @mRNAcoords = keys %mRNA; 847 my @CDScoords = keys %CDS; 848 my (%new_mRNA, %new_CDS); 849 ## if correlation between mRNA exons and CDS exons, then map CDS's to mRNA's, otherwise, replicate CDSs as mRNAs 850 if ($#mRNAcoords >= $#CDScoords) { 851 852 foreach my $mRNA_end5 (keys %mRNA) { 853 my $mRNA_end3 = $mRNA{$mRNA_end5}; 854 #find overlapping cds exon to mRNA exon 855 #easy to compare if in same orientation for all comparisons 856 my ($m1, $m2) = ($mRNA_end5 < $mRNA_end3) ? ($mRNA_end5, $mRNA_end3) : ($mRNA_end3, $mRNA_end5); 857 #create mRNA_exon_obj 858 my $mRNA_exon_obj = mRNA_exon_obj->new ($mRNA_end5, $mRNA_end3); 859 $new_mRNA{$mRNA_end5} = $mRNA_end3; 860 foreach my $CDS_end5 (keys %CDS) { 861 my $CDS_end3 = $CDS{$CDS_end5}; 862 my ($c1, $c2) = ($CDS_end5 < $CDS_end3) ? ($CDS_end5, $CDS_end3) : ($CDS_end3, $CDS_end5); 863 ## do overlap comparison; CDS must be contained within mRNA exon 864 if ( ($c1 >= $m1) && ($c2 <= $m2)) { 865 # found the contained CDS 866 $mRNA_exon_obj->{CDS_exon_obj} = CDS_exon_obj->new ($CDS_end5, $CDS_end3); 867 $new_CDS{$CDS_end5} = $CDS_end3; 868 last; 869 } 870 } 871 $self->add_mRNA_exon_obj($mRNA_exon_obj); 872 } 873 } else { # remap CDSs to mRNAS 874 print STDERR "ERROR: mRNA exons < CDS exons. Copying all CDS exons into mRNA exons. \n\n"; 875 foreach my $CDS_end5 (keys %CDS) { 876 my $CDS_end3 = $CDS{$CDS_end5}; 877 my $mRNA_exon_obj = mRNA_exon_obj->new ($CDS_end5, $CDS_end3); 878 $mRNA_exon_obj->{CDS_exon_obj} = CDS_exon_obj->new ($CDS_end5, $CDS_end3); 879 $self->add_mRNA_exon_obj($mRNA_exon_obj); 880 $new_mRNA{$CDS_end5} = $CDS_end3; 881 $new_CDS{$CDS_end5} = $CDS_end3; 882 } 883 } 884 885 $self->trivial_refinement(); 886 887 ## assign orientation to all children exon and CDS components. 888 my $strand = $self->get_orientation(); 889 foreach my $exon ($self->get_exons()) { 890 $exon->{strand} = $strand; 891 if (my $cds = $exon->get_CDS_exon_obj()) { 892 $cds->{strand} = $strand; 893 } 894 } 895 return; 896 897} 898 899 900## alias 901sub refine_gene_obj { 902 my $self = shift; 903 $self->refine_gene_object(); 904} 905 906 907=over 4 908 909=item get_exons() 910 911B<Description:>Retrieves a list of exons belonging to this Gene_obj 912 913B<Parameters:> none 914 915B<Returns:> @exons 916 917@exons is an ordered list of mRNA_exon_obj; the first exon of the list corresponds to the first exon of the spliced gene. 918 919=back 920 921=cut 922 923 ; 924 925sub get_exons { 926 my ($self) = shift; 927 if ($self->{mRNA_exon_objs} != 0) { 928 my @exons = (@{$self->{mRNA_exon_objs}}); 929 @exons = sort {$a->{end5}<=>$b->{end5}} @exons; 930 if ($self->{strand} eq '-') { 931 @exons = reverse (@exons); 932 } 933 return (@exons); 934 } else { 935 my @x = (); 936 return (@x); #empty array 937 } 938} 939 940 941## private 942sub get_segments { 943 my $self = shift; 944 return ($self->get_exons()); 945} 946 947 948 949=over 4 950 951=item number_of_exons() 952 953B<Description:> Provides the number of exons contained by the Gene 954 955B<Parameters:> none 956 957B<Returns:> int 958 959=back 960 961=cut 962 963 964 965sub number_of_exons { 966 my $self = shift; 967 my $exon_number = $#{$self->{mRNA_exon_objs}} + 1; 968 return ($exon_number); 969} 970 971 972 973 974 975 976 977=over 4 978 979=item get_intron_coordinates() 980 981B<Description:> Provides an ordered list of intron coordinates 982 983B<Parameters:> none 984 985B<Returns:> ( [end5,end3], ....) 986 987A list of arrayRefs are returned providing the coordinates of introns, ordered from first intron to last intron within the gene. 988 989=back 990 991=cut 992 993 ; 994 995sub get_intron_coordinates { 996 my $gene_obj = shift; 997 my $strand = $gene_obj->get_orientation(); 998 my @exons = $gene_obj->get_exons(); 999 ## exon list should already be sorted. 1000 my @introns = (); 1001 1002 my $num_exons = $#exons + 1; 1003 if ($num_exons > 1) { #only genes with multiple exons will have introns. 1004 if ($strand eq '+') { 1005 my $first_exon = shift @exons; 1006 while (@exons) { 1007 my $next_exon = shift @exons; 1008 my ($first_end5, $first_end3) = $first_exon->get_coords(); 1009 my ($next_end5, $next_end3) = $next_exon->get_coords(); 1010 my $intron_end5 = $first_end3 + 1; 1011 my $intron_end3 = $next_end5 -1; 1012 if ($intron_end5 < $intron_end3) { 1013 push (@introns, [$intron_end5, $intron_end3]); 1014 } 1015 $first_exon = $next_exon; 1016 } 1017 } elsif ($strand eq '-') { 1018 my $first_exon = shift @exons; 1019 while (@exons) { 1020 my $next_exon = shift @exons; 1021 my ($first_end5, $first_end3) = $first_exon->get_coords(); 1022 my ($next_end5, $next_end3) = $next_exon->get_coords(); 1023 my $intron_end5 = $first_end3 - 1; 1024 my $intron_end3 = $next_end5 +1; 1025 if ($intron_end5 > $intron_end3) { 1026 push (@introns, [$intron_end5, $intron_end3]); 1027 } 1028 $first_exon = $next_exon; 1029 } 1030 1031 } else { 1032 die "Strand for gene_obj is not specified." . $gene_obj->toString(); 1033 } 1034 } 1035 return (@introns); 1036} 1037 1038 1039 1040 1041 1042#private 1043sub trivial_refinement { 1044 my $self = shift; 1045 my @exons = $self->get_exons(); 1046 $self->{num_exons} = scalar(@exons); 1047 my (%mRNAexons, %CDSexons); 1048 foreach my $exon (@exons) { 1049 my ($exon_end5, $exon_end3) = $exon->get_mRNA_exon_end5_end3(); 1050 $mRNAexons{$exon_end5} = $exon_end3; 1051 my $cds; 1052 if ($cds = $exon->get_CDS_obj()) { 1053 my ($cds_end5, $cds_end3) = $cds->get_CDS_end5_end3(); 1054 $CDSexons{$cds_end5} = $cds_end3; 1055 } 1056 } 1057 my @mRNAexonsEnd5s = sort {$a<=>$b} keys %mRNAexons; 1058 my @CDSexonsEnd5s = sort {$a<=>$b} keys %CDSexons; 1059 my $strand = 0; #initialize. 1060 foreach my $mRNAend5 (@mRNAexonsEnd5s) { 1061 my $mRNAend3 = $mRNAexons{$mRNAend5}; 1062 if ($mRNAend5 == $mRNAend3) {next;} 1063 $strand = ($mRNAend5 < $mRNAend3) ? '+':'-'; 1064 last; 1065 } 1066 $self->{strand} = $strand; 1067 1068 ## determine gene and model boundaries: 1069 my ($gene_end5, $gene_end3, $model_end5, $model_end3); 1070 my @gene_coords = sort {$a<=>$b} %mRNAexons; 1071 my @model_coords = sort {$a<=>$b} %CDSexons; 1072 my $gene_lend = shift @gene_coords; 1073 my $gene_rend = pop @gene_coords; 1074 ## bound gene by transcript span 1075 ($gene_end5, $gene_end3) = ($strand eq "+") ? ($gene_lend, $gene_rend) : ($gene_rend, $gene_lend); 1076 if (@model_coords) { 1077 ## bound model by protein coding span 1078 my $model_lend = shift @model_coords; 1079 my $model_rend = pop @model_coords; 1080 ($model_end5, $model_end3) = ($strand eq "+") ? ($model_lend, $model_rend) : ($model_rend, $model_lend); 1081 } else { 1082 ## give it gene boundaries instead: 1083 ($model_end5, $model_end3) = ($gene_end5, $gene_end3); 1084 } 1085 1086 $self->{gene_span} = [$gene_end5, $gene_end3]; 1087 $self->{gene_length} = abs ($gene_end3 - $gene_end5) + 1; 1088 $self->{mid_pt} = int (($gene_end5 + $gene_end3)/2); 1089 $self->{model_span} = [$model_end5, $model_end3]; 1090 1091 ## Refine isoforms if they exist. 1092 if (my @isoforms = $self->get_additional_isoforms()) { 1093 my @gene_span_coords = $self->get_gene_span(); 1094 foreach my $isoform (@isoforms) { 1095 $isoform->refine_gene_object(); 1096 push (@gene_span_coords, $isoform->get_gene_span()); 1097 } 1098 @gene_span_coords = sort {$a<=>$b} @gene_span_coords; 1099 my $lend = shift @gene_span_coords; 1100 my $rend = pop @gene_span_coords; 1101 my $strand = $self->{strand}; 1102 if ($strand eq '-') { 1103 ($lend, $rend) = ($rend, $lend); 1104 } 1105 my $gene_length = abs ($lend -$rend) + 1; 1106 foreach my $gene ($self, @isoforms) { 1107 $gene->{gene_span} = [$lend, $rend]; 1108 $gene->{gene_length} = $gene_length; 1109 } 1110 } 1111 1112} 1113 1114 1115 1116 1117=over 4 1118 1119=item add_mRNA_exon_obj() 1120 1121B<Description:> Used to add a single mRNA_exon_obj to the Gene_obj 1122 1123B<Parameters:> mRNA_exon_obj 1124 1125B<Returns:> none 1126 1127=back 1128 1129=cut 1130 1131 ; 1132 1133sub add_mRNA_exon_obj { 1134 my ($self) = shift; 1135 my ($mRNA_exon_obj) = shift; 1136 if (!ref($self->{mRNA_exon_objs})) { 1137 $self->{mRNA_exon_objs} = []; 1138 } 1139 my $index = $#{$self->{mRNA_exon_objs}}; 1140 $index++; 1141 $self->{mRNA_exon_objs}->[$index] = $mRNA_exon_obj; 1142} 1143 1144#private 1145## forcibly set protein sequence value 1146 1147 1148sub set_protein_sequence { 1149 my $self = shift; 1150 my $protein = shift; 1151 if ($protein) { 1152 $self->{protein_seq} = $protein; 1153 $self->{protein_seq_length} = length ($protein); 1154 } else { 1155 print STDERR "No incoming protein sequence to set to.\n" . $self->toString(); 1156 } 1157} 1158 1159#private 1160## forcibly set CDS sequence value 1161sub set_CDS_sequence { 1162 my $self = shift; 1163 my $cds_seq = shift; 1164 if ($cds_seq) { 1165 $self->{CDS_sequence} = $cds_seq; 1166 $self->{CDS_sequence_length} = length ($cds_seq); 1167 } else { 1168 print STDERR "No incoming CDS sequence to set to\n" . $self->toString(); 1169 } 1170} 1171 1172#private 1173sub set_cDNA_sequence { 1174 my $self = shift; 1175 my $cDNA_seq = shift; 1176 if ($cDNA_seq) { 1177 $self->{cDNA_sequence} = $cDNA_seq; 1178 $self->{cDNA_sequence_length} = length($cDNA_seq); 1179 } else { 1180 print STDERR "No incoming cDNA sequence to set to.\n" . $self->toString(); 1181 } 1182} 1183 1184#private 1185sub set_gene_sequence { 1186 my $self = shift; 1187 my $seq = shift; 1188 if ($seq) { 1189 $self->{gene_sequence} = $seq; 1190 $self->{gene_sequence_length} = length ($seq); 1191 } else { 1192 print STDERR "No incoming gene sequence to set to\n" . $self->toString(); 1193 } 1194} 1195 1196 1197=over 4 1198 1199=item create_all_sequence_types() 1200 1201B<Description:> Given a scalar reference to the genomic sequence, the CDS, cDNA, unspliced transcript and protein sequences are constructed and populated within the Gene_obj 1202 1203B<Parameters:> $genomic_seq_ref, [%params] 1204 1205B<Returns:> 0|1 1206 1207returns 1 upon success, 0 upon failure 1208 1209By default, the protein and CDS sequence are populated. If you want the unspliced genomic sequence, you need to specify this in the attributes: 1210 1211 %params = ( potein => 1, 1212 CDS => 1, 1213 cDNA => 1, 1214 unspliced_transcript => 0) 1215 1216 1217=back 1218 1219=cut 1220 1221 1222## Create all gene sequences (protein, cds, cdna, genomic) 1223sub create_all_sequence_types { 1224 my $self = shift; 1225 my $big_seq_ref = shift; 1226 my %atts = @_; 1227 1228 unless (ref($big_seq_ref) eq 'SCALAR') { 1229 print STDERR "I require a sequence reference to create sequence types\n"; 1230 return (undef()); 1231 } 1232 $self->create_cDNA_sequence($big_seq_ref) unless (exists($atts{cDNA}) && $atts{cDNA}); 1233 $self->create_gene_sequence($big_seq_ref, 1) if ($atts{unspliced_transcript}); #highlight exons by default. 1234 1235 if ($self->is_coding_gene()) { 1236 $self->create_CDS_sequence ($big_seq_ref) unless (exists ($atts{CDS}) && $atts{CDS}); 1237 $self->create_protein_sequence($big_seq_ref) unless (exists ($atts{protein}) && $atts{protein}); 1238 } 1239 1240 if (my @isoforms = $self->get_additional_isoforms()) { 1241 foreach my $isoform (@isoforms) { 1242 $isoform->create_all_sequence_types($big_seq_ref, %atts); 1243 } 1244 } 1245 return(1); 1246} 1247 1248#private 1249## Create cDNA sequence 1250sub create_cDNA_sequence { 1251 my $self = shift; 1252 my $seq_ref = shift; 1253 my $sequence_ref; 1254 unless ($seq_ref) { 1255 print STDERR "The parent sequence must be specified for the cDNA creation method\n"; 1256 return; 1257 } 1258 ## hopefully the sequence came in as a reference. If not, make one to it. 1259 ## Don't want to pass chromosome sequences in by value! 1260 if (ref($seq_ref)) { 1261 $sequence_ref = $seq_ref; 1262 } else { 1263 $sequence_ref = \$seq_ref; 1264 } 1265 my @exons = $self->get_exons(); 1266 my $strand = $self->{strand}; 1267 my $cDNA_seq = ""; 1268 foreach my $exon_obj (sort {$a->{end5}<=>$b->{end5}} @exons) { 1269 my $c1 = $exon_obj->{end5}; 1270 my $c2 = $exon_obj->{end3}; 1271 ## sequence retrieval coordinates must be in forward orientation 1272 my ($coord1, $coord2) = ($strand eq '+') ? ($c1, $c2) : ($c2, $c1); 1273 $cDNA_seq .= substr ($$sequence_ref, ($coord1 - 1), ($coord2 - $coord1 + 1)); 1274 } 1275 if ($strand eq '-') { 1276 $cDNA_seq = &reverse_complement($cDNA_seq); 1277 } 1278 $self->set_cDNA_sequence($cDNA_seq); 1279 return ($cDNA_seq); 1280} 1281 1282#private 1283## create a CDS sequence, and populate the protein field. 1284sub create_CDS_sequence { 1285 my $self = shift; 1286 my $seq_ref = shift; 1287 my $sequence_ref; 1288 unless ($seq_ref) { 1289 print STDERR "The parent sequence must be specified for the CDS creation method\n"; 1290 return; 1291 } 1292 1293 unless ($self->is_coding_gene()) { 1294 print STDERR "Warning: No coding region specified for gene: " . $self->toString(); 1295 return(""); 1296 } 1297 1298 1299 ## hopefully the sequence came in as a reference. If not, make one to it. 1300 ## Don't want to pass chromosome sequences in by value! 1301 if (ref($seq_ref)) { 1302 $sequence_ref = $seq_ref; 1303 } else { 1304 $sequence_ref = \$seq_ref; 1305 } 1306 my @exons = $self->get_exons(); 1307 my $strand = $self->{strand}; 1308 my $cds_seq = ""; 1309 foreach my $exon_obj (sort {$a->{end5}<=>$b->{end5}} @exons) { 1310 my $CDS_obj = $exon_obj->get_CDS_obj(); 1311 if (ref $CDS_obj) { 1312 my ($c1, $c2) = $CDS_obj->get_CDS_end5_end3(); 1313 ## sequence retrieval coordinates must be in forward orientation 1314 my ($coord1, $coord2) = ($strand eq '+') ? ($c1, $c2) : ($c2, $c1); 1315 $cds_seq .= substr ($$sequence_ref, ($coord1 - 1), ($coord2 - $coord1 + 1)); 1316 } 1317 } 1318 if ($strand eq '-') { 1319 $cds_seq = &reverse_complement($cds_seq); 1320 } 1321 $self->set_CDS_sequence($cds_seq); 1322 1323 return ($cds_seq); 1324} 1325 1326 1327 1328sub is_coding_gene { 1329 my $self = shift; 1330 1331 if ($self->get_CDS_length()) { 1332 return(1); 1333 } 1334 else { 1335 return(0); 1336 } 1337} 1338 1339 1340 1341#private 1342## Translation requires parent nucleotide sequence (bac, chromosome, whatever). 1343sub create_protein_sequence { 1344 my $self = shift; 1345 my $seq_ref = shift; # optional 1346 1347 unless ($self->is_coding_gene()) { 1348 print STDERR "Warning: No coding sequence for gene: " . $self->toString(); 1349 return(""); 1350 } 1351 1352 my $cds_sequence = $self->get_CDS_sequence(); 1353 unless ($cds_sequence) { 1354 1355 ## if has a CDS, then try to translate it if the genome sequence is available. 1356 1357 unless (ref($seq_ref) eq 'SCALAR') { 1358 print STDERR "I require an assembly sequence ref if the CDS is unavailable\n"; 1359 return; 1360 } 1361 $cds_sequence = $self->create_CDS_sequence($seq_ref); 1362 } 1363 my $protein = &Nuc_translator::get_protein ($cds_sequence); 1364 $self->set_protein_sequence($protein); 1365 return ($protein); 1366} 1367 1368#private 1369## Create the unspliced nucleotide transcript 1370sub create_gene_sequence { 1371 my $self = shift; 1372 my $big_seq_ref = shift; 1373 my $highlight_exons_flag = shift; #upcases exons, lowcases introns. 1374 unless (ref ($big_seq_ref) eq 'SCALAR') { 1375 print STDERR "I require a reference to the assembly sequence!!\n"; 1376 return (undef()); 1377 } 1378 my $strand = $self->{strand}; 1379 my ($gene_seq); 1380 if ($highlight_exons_flag) { 1381 my @exons = sort {$a->{end5}<=>$b->{end5}} $self->get_exons(); 1382 my $exon = shift @exons; 1383 my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords(); 1384 $gene_seq = uc (substr ($$big_seq_ref, $lend - 1, $rend - $lend + 1)); 1385 my $prev_rend = $rend; 1386 while (@exons) { 1387 $exon = shift @exons; 1388 ## Add intron, then exon 1389 my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords(); 1390 $gene_seq .= lc (substr ($$big_seq_ref, $prev_rend, $lend - $prev_rend-1)); 1391 $gene_seq .= uc (substr ($$big_seq_ref, $lend - 1, $rend - $lend + 1)); 1392 $prev_rend = $rend; 1393 } 1394 1395 } else { #just get the sequence spanned by min and max coords 1396 my ($coord1, $coord2) = sort {$a<=>$b} $self->get_gene_span(); 1397 $gene_seq = substr ($$big_seq_ref, ($coord1 - 1), ($coord2 - $coord1 + 1)); 1398 } 1399 1400 $gene_seq = &reverse_complement($gene_seq) if ($strand eq '-'); 1401 $self->set_gene_sequence($gene_seq); 1402 return ($gene_seq); 1403} 1404 1405## retrieving the sequences 1406 1407=over 4 1408 1409=item get_protein_sequence() 1410 1411B<Description:> Retrieves the protein sequence 1412 1413B<Parameters:> none 1414 1415B<Returns:> $protein 1416 1417Note: You must have called create_all_sequence_types($genomic_ref) before protein sequence is available for retrieval. 1418 1419 1420=back 1421 1422=cut 1423 1424 ; 1425 1426sub get_protein_sequence { 1427 my $self = shift; 1428 return ($self->{protein_seq}); 1429} 1430 1431## alias 1432sub get_protein_seq { 1433 my $self = shift; 1434 return ($self->get_protein_sequence()); 1435} 1436 1437 1438 1439=over 4 1440 1441=item get_CDS_sequence() 1442 1443B<Description:> Retrieves the CDS sequence. The CDS sequence is the protein-coding nucleotide sequence. 1444 1445B<Parameters:> none 1446 1447B<Returns:> $cds 1448 1449Note: You must have called create_all_sequence_types($genomic_ref) before protein sequence is available for retrieval. 1450 1451=back 1452 1453=cut 1454 1455 1456sub get_CDS_sequence { 1457 my $self = shift; 1458 return ($self->{CDS_sequence}); 1459} 1460 1461=over 4 1462 1463=item get_cDNA_sequence() 1464 1465B<Description:> Retrieves the tentative cDNA sequence for the Gene. The cDNA includes the CDS with potential UTR extensions. 1466 1467B<Parameters:> none 1468 1469B<Returns:> $cdna 1470 1471Note: You must have called create_all_sequence_types($genomic_ref) before protein sequence is available for retrieval. 1472 1473 1474=back 1475 1476=cut 1477 1478 1479 1480sub get_cDNA_sequence { 1481 my $self = shift; 1482 return ($self->{cDNA_sequence}); 1483} 1484 1485 1486 1487sub get_CDS_length { 1488 my $self = shift; 1489 1490 my $cds_length = 0; 1491 1492 my @exons = $self->get_exons(); 1493 foreach my $exon (@exons) { 1494 if (my $cds = $exon->get_CDS_obj()) { 1495 $cds_length += $cds->length(); 1496 } 1497 } 1498 1499 1500 return ($cds_length); 1501} 1502 1503sub get_cDNA_length { 1504 my $self = shift; 1505 1506 my $cdna_length = 0; 1507 1508 my @exons = $self->get_exons(); 1509 foreach my $exon (@exons) { 1510 $cdna_length += $exon->length(); 1511 } 1512 1513 return($cdna_length); 1514 1515} 1516 1517 1518 1519 1520 1521 1522 1523=over 4 1524 1525=item get_gene_sequence() 1526 1527B<Description:> Retrieves the unspliced transcript of the gene. 1528 1529B<Parameters:> none 1530 1531B<Returns:> $unspliced_transcript 1532 1533=back 1534 1535=cut 1536 1537 1538sub get_gene_sequence { 1539 my $self = shift; 1540 return ($self->{gene_sequence}); 1541} 1542 1543 1544 1545 1546=over 4 1547 1548=item get_gene_synonyms() 1549 1550B<Description:> Retrieves the Model_feat_name(s) for the synonomous gene models found on other BACs or contigs. 1551 1552B<Parameters:> none 1553 1554B<Returns:> @model_feat_names 1555 1556 1557For Arabidopsis, gene models are found within overlapping regions of BAC sequences, in which the gene models are annotated on both corresponding BACs. Given a Gene_obj for a model on one BAC, the synomous gene on the overlapping BAC can be identified via this method. 1558 1559 1560=back 1561 1562=cut 1563 1564 1565sub get_gene_synonyms { 1566 my $self = shift; 1567 return (@{$self->{gene_synonyms}}); 1568} 1569 1570 1571 1572=over 4 1573 1574=item clear_sequence_info() 1575 1576B<Description:> Clears the sequence fields stored within a Gene_obj, including the CDS, cDNA, gene_sequence, and protein sequence. Often, these sequence fields, when populated, can consume large amounts of memory in comparison to the coordinate and functional annotation data. This method is useful to clear this memory when the sequences are not needed. The create_all_sequence_types($genomic_seq_ref) can be called again later to repopulate these sequences when they are needed. 1577 1578B<Parameters:> none 1579 1580B<Returns:> none 1581 1582=back 1583 1584=cut 1585 1586 1587## sequences consume huge amounts of memory in comparison to other gene features. 1588## want to clear them from time to time to save memory. 1589 1590 ; 1591 1592sub clear_sequence_info { 1593 my $self = shift; 1594 $self->{protein_seq} = undef; 1595 $self->{CDS_sequence} = undef; 1596 $self->{cDNA_sequence} = undef; 1597 $self->{gene_sequence} = undef; 1598} 1599 1600 1601=over 4 1602 1603=item set_gene_type() 1604 1605B<Description:> Sets the type of gene. Expected types include: 1606 1607 protein-coding #default setting 1608 rRNA 1609 snoRNA 1610 snRNA 1611 tRNA 1612 1613 ...or others as needed. Nothing is restricted. 1614 1615B<Parameters:> $type 1616 1617B<Returns:> none 1618 1619=back 1620 1621=cut 1622 1623 1624#### 1625sub set_gene_type { 1626 my ($self) = shift; 1627 my ($gene_type) = shift; 1628 $self->{gene_type} = $gene_type; 1629} 1630 1631 1632=over 4 1633 1634=item adjust_gene_coordinates() 1635 1636B<Description:> Used to add or subtract a specified number of bases from each gene component coordinate. 1637 1638B<Parameters:> $adj_amount 1639 1640$adj_amoount is a positive or negative integer. 1641 1642B<Returns:> none 1643 1644=back 1645 1646=cut 1647 1648 1649 ; 1650 1651#### 1652# add value to all gene component coordinates 1653sub adjust_gene_coordinates { 1654 my $self = shift; 1655 my $adj_amount = shift; 1656 my @exons = $self->get_exons(); 1657 foreach my $exon (@exons) { 1658 my ($end5, $end3) = $exon->get_coords(); 1659 $exon->set_coords($end5 + $adj_amount, $end3 + $adj_amount); 1660 my $cds = $exon->get_CDS_obj(); 1661 if (ref $cds) { 1662 my ($end5, $end3) = $cds->get_coords(); 1663 $cds->set_coords($end5 + $adj_amount, $end3 + $adj_amount); 1664 } 1665 } 1666 1667 ## don't forget about alt splicing isoforms! 1668 my @isoforms = $self->get_additional_isoforms(); 1669 foreach my $isoform (@isoforms) { 1670 $isoform->adjust_gene_coordinates($adj_amount); 1671 } 1672 $self->refine_gene_object(); 1673} 1674 1675 1676 1677 1678=over 4 1679 1680=item toString() 1681 1682B<Description:> Textually describes the Gene_obj including coordinates and attributes. 1683 1684B<Parameters:> <%attributes_list> 1685 1686%attributes_list is optional and can control whether certain attributes are included in the textual output 1687 1688Default settings are: 1689 1690 %attributes_list = ( 1691 -showIsoforms => 1, #set to 0 to avoid isoform info to the text output. 1692 -showSeqs => 0 #set to 1 for avoiding protein, cds, genomic, cdna seqs as output. 1693 ) 1694 1695B<Returns:> $text 1696 1697=back 1698 1699=cut 1700 1701 ; 1702 1703 1704## retrieve text output describing the gene. 1705sub toString { 1706 my $self = shift; 1707 my %atts = @_; 1708 # atts defaults: 1709 # -showIsoforms=>1 1710 # -showSeqs => 0 1711 1712 my $output = ""; 1713 foreach my $key (keys %$self) { 1714 my $value = $self->{$key}; 1715 unless (defined $value) { next;} 1716 if (ref $value) { 1717 if ($key =~ /secondary/ && ref $value eq "ARRAY") { 1718 foreach my $val (@$value) { 1719 $output .= "\t\t$key\t$val\n"; 1720 } 1721 } 1722 1723 1724 } else { 1725 if ($self->{is_pseudogene} && $key =~ /cds|cdna|protein/i && $key =~ /seq/) { 1726 next; 1727 } 1728 if ((!$atts{-showSeqs}) && $key =~/seq/) { next; } 1729 if ( ($value eq '0' || !defined($value)) && $key !~/^is_/) { next;} #dont print unpopulated info. 1730 $output .= "\t$key:\t$value\n"; 1731 } 1732 } 1733 $output .= "\tgene_synonyms: @{$self->{gene_synonyms}}\n"; 1734 1735 $output .= "\tmRNA_coords\t"; 1736 1737 if (ref ($self->{mRNA_coords}) eq "HASH") { 1738 foreach my $end5 (sort {$a<=>$b} keys %{$self->{mRNA_coords}}) { 1739 $output .= "$end5-$self->{mRNA_coords}->{$end5} "; 1740 } 1741 } 1742 $output .= "\n" 1743 . "\tCDS_coords\t"; 1744 if (ref ($self->{CDS_coords}) eq "HASH") { 1745 foreach my $end5 (sort {$a<=>$b} keys %{$self->{CDS_coords}}) { 1746 $output .= "$end5-$self->{CDS_coords}->{$end5} "; 1747 } 1748 } 1749 1750 my @exons = $self->get_exons(); 1751 foreach my $exon (@exons) { 1752 $output .= "\n\t\tRNA-exon: $exon->{end5}, $exon->{end3}\t"; 1753 my $cds = $exon->{CDS_exon_obj}; 1754 if ($cds) { 1755 $output .= "CDS-exon: $cds->{end5}, $cds->{end3}"; 1756 } 1757 } 1758 1759 if (ref $self->{gene_span}) { 1760 my ($gene_end5, $gene_end3) = @{$self->{gene_span}}; 1761 $output .= "\n\tgene_span: $gene_end5-$gene_end3"; 1762 } 1763 if (ref $self->{model_span}) { 1764 my ($model_end5, $model_end3) = @{$self->{model_span}}; 1765 $output .= "\n\tmodel_span: $model_end5-$model_end3"; 1766 } 1767 my @gene_ontology_objs = $self->get_gene_ontology_objs(); 1768 if (@gene_ontology_objs) { 1769 $output .= "\n\tGene Ontology Assignments:\n"; 1770 foreach my $go_assignment (@gene_ontology_objs) { 1771 $output .= "\t" . $go_assignment->toString(); 1772 } 1773 } 1774 1775 unless (defined ($atts{-showIsoforms}) && $atts{-showIsoforms} == 0) { 1776 foreach my $isoform ($self->get_additional_isoforms()) { 1777 $output .= "\n\n\tISOFORM:\n" . $isoform->toString(); 1778 } 1779 } 1780 $output .= "\n\n"; #spacer at terminus 1781 return ($output); 1782} 1783 1784 1785#### 1786## Splice site validation section 1787#### 1788 1789=over 4 1790 1791=item validate_splice_sites() 1792 1793B<Description:> Validates the presence of consensus splice sites 1794 1795B<Parameters:> $genomic_seq_ref 1796 1797$genomic_seq_ref is a scalar reference to the string containing the genomic sequence. 1798 1799B<Returns:> $errors 1800 1801If the empty string ("") is returned, then no inconsistencies were identified. 1802 1803=back 1804 1805=cut 1806 1807 ; 1808 1809#### 1810sub validate_splice_sites { 1811 my $self = shift; 1812 my $asmbl_seq_ref = shift; 1813 unless (ref ($asmbl_seq_ref)) { 1814 print STDERR "I require a sequence reference\n"; 1815 return (undef()); 1816 } 1817 my $error_string = ""; 1818 my $strand = $self->{strand}; 1819 my @exons = $self->get_exons(); 1820 my $num_exons = $#exons + 1; 1821 if ($num_exons == 1) { 1822 #no splice sites to confirm. 1823 return (""); 1824 } 1825 for (my $i = 1; $i <= $num_exons; $i++) { 1826 my $exon_type; 1827 if ($i == 1) { 1828 $exon_type = "initial"; 1829 } elsif ($i == $num_exons) { 1830 $exon_type = "terminal"; 1831 } else { 1832 $exon_type = "internal"; 1833 } 1834 my $exon = $exons[$i - 1]; 1835 my ($exon_end5, $exon_end3) = $exon->get_mRNA_exon_end5_end3(); 1836 my ($coord1, $coord2) = sort {$a<=>$b} ($exon_end5, $exon_end3); 1837 ## get two coordinate sets corresponding to potential splice sites 1838 my $splice_1_start = $coord1-2-1; 1839 my $splice_2_start = $coord2-1+1; 1840 #print "confirming splice sites at " . ($splice_1_start +1) . " and " . ($splice_2_start + 1) . "\n"if $SEE; 1841 my $splice_1 = substr ($$asmbl_seq_ref, $splice_1_start, 2); 1842 my $splice_2 = substr ($$asmbl_seq_ref, $splice_2_start, 2); 1843 my ($acceptor, $donor) = ($strand eq '+') ? ($splice_1, $splice_2) : (&reverse_complement($splice_2), &reverse_complement($splice_1)); 1844 my $check_acceptor = ($acceptor =~ /ag/i); 1845 my $check_donor = ($donor =~ /gt|gc/i); 1846 ## associate results of checks with exon type. 1847 if ($exon_type eq "initial" || $exon_type eq "internal") { 1848 unless ($check_donor) { 1849 $error_string .= "non-consensus $donor donor splice site at $coord1\n"; 1850 } 1851 } 1852 1853 if ($exon_type eq "internal" || $exon_type eq "terminal") { 1854 unless ($check_acceptor) { 1855 $error_string .= "\tnon-consensus $acceptor acceptor splice site at $coord2\n"; 1856 } 1857 } 1858 } 1859 return ($error_string); 1860} 1861 1862 1863 1864=over 4 1865 1866=item get_annot_text() 1867 1868B<Description:> Provides basic functional annotation for a Gene_obj 1869 1870B<Parameters:> none 1871 1872B<Returns:> $string 1873 1874$string includes locus, pub_locus, com_name, and pub_comment 1875 1876=back 1877 1878=cut 1879 1880 1881 ; 1882 1883#### 1884sub get_annot_text { 1885 my $self = shift; 1886 my $locus = $self->{locus}; 1887 my $pub_locus = $self->{pub_locus}; 1888 my $com_name = $self->{com_name}; 1889 my $pub_comment = $self->{pub_comment}; 1890 my $text = ""; 1891 foreach my $token ($locus, $pub_locus, $com_name, $pub_comment) { 1892 if ($token) { 1893 $text .= "$token "; 1894 } 1895 } 1896 return ($text); 1897} 1898 1899 1900 1901=over 4 1902 1903=item add_isoform() 1904 1905B<Description:> Adds a Gene_obj to an existing Gene_obj as an alternative splicing variant. 1906 1907B<Parameters:> Gene_obj 1908 1909B<Returns:> none 1910 1911=back 1912 1913=cut 1914 1915 ; 1916sub add_isoform { 1917 my $self = shift; 1918 my @gene_objs = @_; 1919 foreach my $gene_obj (@gene_objs) { 1920 $self->{num_additional_isoforms}++; 1921 push (@{$self->{additional_isoforms}}, $gene_obj); 1922 } 1923} 1924 1925 1926 1927 1928 1929=over 4 1930 1931=item has_additional_isoforms() 1932 1933B<Description:> Provides number of additional isoforms. Typically used as a boolean. 1934 1935B<Parameters:> none 1936 1937B<Returns:> number of additional isoforms (int) 1938 1939If no additional isoforms exist, returns 0 1940 1941 1942boolean usage: 1943 19440 = false (has no more) 1945nonzero = true (has additional isoforms) 1946 1947=back 1948 1949=cut 1950 1951sub has_additional_isoforms { 1952 my $self = shift; 1953 return ($self->{num_additional_isoforms}); 1954} 1955 1956 1957 1958=over 4 1959 1960=item delete_isoforms() 1961 1962B<Description:> removes isoforms stored in this Gene_obj (assigning to a new anonymous arrayref) 1963 1964B<Parameters:> Gene_obj 1965 1966B<Returns:> none 1967 1968=back 1969 1970=cut 1971 1972sub delete_isoforms { 1973 my $self = shift; 1974 $self->{num_additional_isoforms} = 0; 1975 $self->{additional_isoforms} = []; 1976} 1977 1978 1979 1980 1981 1982=over 4 1983 1984=item get_additional_isoforms() 1985 1986B<Description:> Retrieves the additional isoforms for a given Gene_obj 1987 1988B<Parameters:> none 1989 1990B<Returns:> @Gene_objs 1991 1992If no additional isoforms exist, an empty array is returned. 1993 1994=back 1995 1996=cut 1997 1998 1999sub get_additional_isoforms { 2000 my $self = shift; 2001 return (@{$self->{additional_isoforms}}); 2002} 2003 2004 2005 2006=over 4 2007 2008=item get_orientation() 2009 2010B<Description:> Retrieves the strand orientation of the Gene_obj 2011 2012B<Parameters:> none 2013 2014B<Returns:> +|- 2015 2016=back 2017 2018=cut 2019 2020 2021sub get_orientation { 2022 my $self = shift; 2023 return ($self->{strand}); 2024} 2025 2026 2027 2028sub get_strand { ## preferred 2029 my $self = shift; 2030 return($self->get_orientation()); 2031} 2032 2033 2034 2035=over 4 2036 2037=item add_gene_ontology_objs() 2038 2039B<Description:> Adds a list of Gene_ontology objects to a Gene_obj 2040 2041B<Parameters:> @Gene_ontology_objs 2042 2043@Gene_ontology_objs is a list of objects instantiated from Gene_ontology.pm 2044 2045B<Returns:> none 2046 2047=back 2048 2049=cut 2050 2051 2052sub add_gene_ontology_objs { 2053 my ($self, @ontology_objs) = @_; 2054 push (@{$self->{GeneOntology}}, @ontology_objs); 2055} 2056 2057 2058 2059=over 4 2060 2061=item get_gene_ontology_objs() 2062 2063B<Description:> Retrieves Gene_ontology objs assigned to the Gene_obj 2064 2065B<Parameters:> none 2066 2067B<Returns:> @Gene_ontology_objs 2068 2069@Gene_ontology_objs are objects instantiated from package Gene_ontology (See Gene_ontology.pm) 2070 2071=back 2072 2073=cut 2074 2075 ; 2076 2077sub get_gene_ontology_objs { 2078 my $self = shift; 2079 if (ref ($self->{GeneOntology})) { 2080 return (@{$self->{GeneOntology}}); 2081 } else { 2082 return (()); 2083 } 2084} 2085 2086 2087=over 4 2088 2089=item set_5prime_partial() 2090 2091B<Description:> Sets the status of the is_5prime_partial attribute 2092 2093B<Parameters:> 1|0 2094 2095B<Returns:> none 2096 2097 20985prime partials are partial on their 5prime end and lack start codons. 2099 2100 2101=back 2102 2103=cut 2104 2105sub set_5prime_partial() { 2106 my $self = shift; 2107 my $value = shift; 2108 $self->{is_5prime_partial} = $value; 2109} 2110 2111 2112 2113=over 4 2114 2115=item set_3prime_partial() 2116 2117B<Description:> Sets the is_3prime_partial status 2118 2119B<Parameters:> 1|0 2120 2121B<Returns:> none 2122 21233prime partials are partial on their 3prime end and lack stop codons. 2124 2125=back 2126 2127=cut 2128 2129 2130sub set_3prime_partial() { 2131 my $self = shift; 2132 my $value = shift; 2133 $self->{is_3prime_partial} = $value; 2134} 2135 2136 2137 2138=over 4 2139 2140=item is_5prime_partial() 2141 2142B<Description:> Retrieves the 5-prime partial status of the gene. 2143 2144B<Parameters:> none 2145 2146B<Returns:> 1|0 2147 2148=back 2149 2150=cut 2151 2152 2153sub is_5prime_partial() { 2154 my $self = shift; 2155 return ($self->{is_5prime_partial}); 2156} 2157 2158 2159=over 4 2160 2161=item is_3prime_partial() 2162 2163B<Description:> Retrieves the 3-prime partial status of the gene. 2164 2165B<Parameters:> none 2166 2167B<Returns:> 1|0 2168 2169=back 2170 2171=cut 2172 2173 2174sub is_3prime_partial() { 2175 my $self = shift; 2176 return ($self->{is_3prime_partial}); 2177} 2178 2179=over 4 2180 2181=item get_5prime_UTR_coords 2182 2183 2184B<Description:> returns a list of coordinate pairs corresponding to the 5\' UTR coordinates 2185 2186B<Parameters:> none 2187 2188B<Returns:> ([end5,end3], ...) or empty list if none exist 2189 2190=back 2191 2192=cut 2193 2194 2195 ; 2196 2197sub get_5prime_UTR_coords { 2198 my $self = shift; 2199 2200 my $strand = $self->get_orientation(); 2201 2202 my @exons = $self->get_exons(); 2203 2204 my $seen_CDS_flag = 0; 2205 2206 my @utr_coords; 2207 foreach my $exon (@exons) { #relying on a sorted list 2208 my ($exon_end5, $exon_end3) = $exon->get_coords(); 2209 if (my $cds = $exon->get_CDS_obj()) { 2210 my ($cds_end5, $cds_end3) = $cds->get_coords(); 2211 if ($exon_end5 != $cds_end5) { 2212 my $adj_utr_end3_coord = ($strand eq '+') ? ($cds_end5 -1) : ($cds_end5 +1); 2213 push (@utr_coords, [$exon_end5, $adj_utr_end3_coord]); 2214 } 2215 2216 $seen_CDS_flag = 1; 2217 2218 } else { 2219 push (@utr_coords, [$exon_end5, $exon_end3]); 2220 } 2221 2222 if ($seen_CDS_flag) { 2223 last; 2224 } 2225 2226 } 2227 2228 return (@utr_coords); 2229} 2230 2231 2232 2233=over 4 2234 2235=item get_3prime_UTR_coords 2236 2237 2238B<Description:> returns a list of coordinate pairs corresponding to the 3\' UTR coordinates 2239 2240B<Parameters:> none 2241 2242B<Returns:> ([end5,end3], ...) or empty list if none exist 2243 2244=back 2245 2246=cut 2247 2248 ; 2249 2250sub get_3prime_UTR_coords { 2251 my $self = shift; 2252 2253 my $strand = $self->get_orientation(); 2254 2255 my @exons = reverse $self->get_exons(); 2256 2257 my @utr_coords; 2258 my $seen_CDS_flag = 0; 2259 foreach my $exon (@exons) { #relying on a reverse sorted list (3' exons should come first) 2260 my ($exon_end5, $exon_end3) = $exon->get_coords(); 2261 if (my $cds = $exon->get_CDS_obj()) { 2262 $seen_CDS_flag = 1; 2263 my ($cds_end5, $cds_end3) = $cds->get_coords(); 2264 if ($exon_end3 != $cds_end3) { 2265 my $adj_utr_end5_coord = ($strand eq '+') ? ($cds_end3 +1) : ($cds_end3 -1); 2266 push (@utr_coords, [$adj_utr_end5_coord, $exon_end3]); 2267 } 2268 2269 } else { 2270 push (@utr_coords, [$exon_end5, $exon_end3]); 2271 } 2272 if ($seen_CDS_flag) { 2273 last; 2274 } 2275 } 2276 2277 if (@utr_coords) { 2278 @utr_coords = reverse @utr_coords; 2279 } 2280 2281 return (@utr_coords); 2282} 2283 2284 2285 2286 2287 2288=over 4 2289 2290=item has_UTRs() 2291 2292B<Description:> indicates presence of UTR annotated in Gene 2293 2294B<Parameters:> none 2295 2296B<Returns:> ( has_5prime_UTR() || has_3prime_UTR() ) 2297 2298=back 2299 2300=cut 2301 2302sub has_UTRs { 2303 my $self = shift; 2304 return ( ($self->has_5prime_UTR() || $self->has_3prime_UTR() ) ); 2305} 2306 2307 2308 2309#### 2310sub has_5prime_UTR { 2311 my $self = shift; 2312 return ($self->get_5prime_UTR_length() > 2); 2313} 2314 2315#### 2316sub has_3prime_UTR { 2317 my $self = shift; 2318 return($self->get_3prime_UTR_length() > 2); 2319} 2320 2321 2322#### 2323sub get_5prime_UTR_length { 2324 my $self = shift; 2325 my @prime5_UTR_coords = $self->get_5prime_UTR_coords(); 2326 2327 my $len = 0; 2328 for my $coordset (@prime5_UTR_coords) { 2329 my ($lend, $rend) = sort {$a<=>$b} @$coordset; 2330 $len += ($rend - $lend) + 1; 2331 } 2332 return($len); 2333} 2334 2335 2336#### 2337sub get_3prime_UTR_length { 2338 my $self = shift; 2339 my @prime3_UTR_coords = $self->get_3prime_UTR_coords(); 2340 2341 my $len = 0; 2342 for my $coordset (@prime3_UTR_coords) { 2343 my ($lend, $rend) = sort {$a<=>$b} @$coordset; 2344 $len += ($rend - $lend) + 1; 2345 } 2346 return($len); 2347} 2348 2349 2350 2351=over 4 2352 2353=item get_5prime_UTR_sequence() 2354 2355B<Description:> retrieves 5prime UTR sequence 2356 2357B<Parameters:> genome sequence reference 2358 2359B<Returns:> string 2360 2361=back 2362 2363=cut 2364 2365#### 2366sub get_5prime_UTR_sequence { 2367 my $self = shift; 2368 my ($genome_seq_ref) = @_; 2369 unless (ref $genome_seq_ref eq "SCALAR") { 2370 confess "error, require genome sequence string reference"; 2371 } 2372 2373 unless ($self->has_5prime_UTR()) { 2374 return ""; 2375 } 2376 2377 my $orientation = $self->get_orientation(); 2378 my @coords = $self->get_5prime_UTR_coords(); 2379 2380 @coords = sort {$a->[0]<=>$b->[0]} @coords; 2381 2382 my $UTR_seq = ""; 2383 foreach my $coordset (@coords) { 2384 my ($lend, $rend) = sort {$a<=>$b} @$coordset; 2385 2386 my $length = $rend - $lend + 1; 2387 $UTR_seq .= substr($$genome_seq_ref, $lend - 1, $length); 2388 } 2389 2390 if ($orientation eq '-') { 2391 $UTR_seq = &reverse_complement($UTR_seq); 2392 } 2393 2394 ## verify: 2395 $self->create_all_sequence_types($genome_seq_ref); 2396 my $cDNA = $self->get_cDNA_sequence(); 2397 2398 2399 unless (index($cDNA, $UTR_seq) == 0) { 2400 confess "Error, couldn't find UTR in cDNA"; 2401 } 2402 2403 2404 return ($UTR_seq); 2405} 2406 2407 2408=over 4 2409 2410=item get_3prime_UTR_sequence() 2411 2412B<Description:> retrieves 5prime UTR sequence 2413 2414B<Parameters:> genome sequence reference 2415 2416B<Returns:> string 2417 2418=back 2419 2420=cut 2421 2422#### 2423sub get_3prime_UTR_sequence { 2424 my $self = shift; 2425 my ($genome_seq_ref) = @_; 2426 unless (ref $genome_seq_ref eq "SCALAR") { 2427 confess "error, require genome sequence string reference"; 2428 } 2429 2430 unless ($self->has_3prime_UTR()) { 2431 return ""; 2432 } 2433 2434 my $orientation = $self->get_orientation(); 2435 my @coords = $self->get_3prime_UTR_coords(); 2436 2437 @coords = sort {$a->[0]<=>$b->[0]} @coords; 2438 2439 my $UTR_seq = ""; 2440 foreach my $coordset (@coords) { 2441 my ($lend, $rend) = sort {$a<=>$b} @$coordset; 2442 2443 my $length = $rend - $lend + 1; 2444 $UTR_seq .= substr($$genome_seq_ref, $lend - 1, $length); 2445 } 2446 2447 if ($orientation eq '-') { 2448 $UTR_seq = &reverse_complement($UTR_seq); 2449 } 2450 2451 ## verify: 2452 $self->create_all_sequence_types($genome_seq_ref); 2453 my $cDNA = $self->get_cDNA_sequence(); 2454 my $cDNA_length = length($cDNA); 2455 my $utr_length = length($UTR_seq); 2456 2457 my $utr_start_pos = $cDNA_length - $utr_length + 1; 2458 2459 unless ((my $cDNA_utr = lc substr($cDNA, $utr_start_pos - 1, $utr_length)) eq lc $UTR_seq) { 2460 confess "Error, 3' UTR extracted from cDNA is different from UTR sequence extracted from genome.\n" 2461 . "cDNA_utr:\n$cDNA_utr\nUTR_from_genome:\n$UTR_seq\n\n"; 2462 } 2463 2464 2465 return ($UTR_seq); 2466} 2467 2468 2469 2470 2471=over 4 2472 2473=item trim_UTRs() 2474 2475B<Description:> Trims the UTR of the Gene_obj so that the Exon coordinates are identical to the CDS coordinates. Exons which lack CDS components and are completely UTR are removed. 2476 2477B<Parameters:> none 2478 2479B<Returns:> none 2480 2481=back 2482 2483=cut 2484 2485 ; 2486 2487sub trim_UTRs { 2488 my $self = shift; 2489 2490 ## adjust exon coordinates to CDS coordinates. 2491 ## if cds doesn't exist, rid exon: 2492 2493 my @new_exons; 2494 2495 my @exons = $self->get_exons(); 2496 foreach my $exon (@exons) { 2497 if (my $cds = $exon->get_CDS_obj()) { 2498 my ($exon_end5, $exon_end3) = $exon->get_coords(); 2499 my ($cds_end5, $cds_end3) = $cds->get_coords(); 2500 2501 if ($exon_end5 != $cds_end5 || $exon_end3 != $cds_end3) { 2502 $exon->set_coords($cds_end5, $cds_end3); 2503 } 2504 push (@new_exons, $exon); 2505 } 2506 } 2507 $self->{mRNA_exon_objs} = 0; #clear current gene structure 2508 $self->{mRNA_exon_objs} = \@new_exons; #replace gene structure 2509 $self->refine_gene_object(); #update 2510 return ($self); 2511} 2512 2513 2514 2515 2516=over 4 2517 2518=item remove_CDS_exon() 2519 2520B<Description:> Removes any existing CDS_exon_obj from this mRNA_exon_obj 2521 2522B<Parameters:> none 2523 2524B<Returns:> none 2525 2526=back 2527 2528=cut 2529 2530sub remove_CDS_exon { 2531 my $self = shift; 2532 $self->{CDS_exon_obj} = 0; 2533} 2534 2535 2536 2537 2538 2539=over 4 2540 2541=item get_gene_names() 2542 2543B<Description:> Retrieves gene names (primary gene name followed by secondary gene names, "$;" delimited. 2544 2545B<Parameters:> none 2546 2547B<Returns:> string 2548 2549 see $gene_obj->{gene_name} 2550 see $gene_obj->get_secondary_names() 2551 2552secondary gene names sorted lexicographically 2553 2554 2555=back 2556 2557=cut 2558 2559 2560 2561 2562#### 2563sub get_gene_names { 2564 my $gene_obj = shift; 2565 my @gene_names; 2566 if ($gene_obj->{gene_name}) { 2567 push (@gene_names, $gene_obj->{gene_name}); 2568 } 2569 if (my @secondary_names = $gene_obj->get_secondary_gene_names()) { 2570 push (@gene_names, @secondary_names); 2571 } 2572 my $ret_gene_names = join ("$;" , @gene_names); 2573 return ($ret_gene_names); 2574} 2575 2576 2577 2578=over 4 2579 2580=item get_secondary_gene_names() 2581 2582B<Description:> Retrieves secondary gene names as a "$;" delimited string. 2583 2584B<Parameters:> none 2585 2586B<Returns:> string 2587 2588=back 2589 2590=cut 2591 2592 2593#### 2594sub get_secondary_gene_names { 2595 my ($gene_obj) = @_; 2596 return (sort @{$gene_obj->{secondary_gene_names}}); 2597} 2598 2599 2600 2601 2602=over 4 2603 2604=item get_product_names() 2605 2606B<Description:> Retrieves product name, with the primary product name followed by secondary product names, delimited by "$;" 2607 2608B<Parameters:> none 2609 2610B<Returns:> string 2611 2612 see $gene_obj->{com_name} for primary product name 2613 see $gene_obj->get_secondary_product_names() 2614 2615=back 2616 2617=cut 2618 2619 ; 2620 2621#### 2622sub get_product_names { 2623 my $gene_obj = shift; 2624 my @product_names; 2625 if ($gene_obj->{com_name}) { 2626 push (@product_names, $gene_obj->{com_name}); 2627 } 2628 if (my @secondary_names = $gene_obj->get_secondary_product_names()) { 2629 push (@product_names, @secondary_names); 2630 } 2631 my $ret_product_names = join ("$;", @product_names); 2632 return ($ret_product_names); 2633} 2634 2635 2636 2637=over 4 2638 2639=item get_secondary_product_names() 2640 2641B<Description:> Retrieves secondary product names, delimited by "$;" and sorted lexicographically. 2642 2643B<Parameters:> none 2644 2645B<Returns:> string 2646 2647=back 2648 2649=cut 2650 2651 2652#### 2653sub get_secondary_product_names { 2654 my ($gene_obj) = @_; 2655 return (sort @{$gene_obj->{secondary_product_names}}); 2656} 2657 2658 2659 2660=over 4 2661 2662=item get_gene_symbols() 2663 2664B<Description:> Retrieves primary gene symbol followed by secondary gene symbols, delimited by "$;" 2665 2666B<Parameters:> none 2667 2668B<Returns:> string 2669 2670 see $gene_obj->{gene_sym} 2671 see $gene_obj->get_secondary_gene_symbols() 2672 2673=back 2674 2675=cut 2676 2677 ; 2678 2679#### 2680sub get_gene_symbols { 2681 my $gene_obj = shift; 2682 my @gene_symbols; 2683 if ($gene_obj->{gene_sym}) { 2684 push (@gene_symbols, $gene_obj->{gene_sym}); 2685 } 2686 if (my @secondary_symbols = $gene_obj->get_secondary_gene_symbols()) { 2687 push (@gene_symbols, @secondary_symbols); 2688 } 2689 my $ret_gene_symbols = join ("$;", @gene_symbols); 2690 return ($ret_gene_symbols); 2691} 2692 2693 2694=over 4 2695 2696=item get_secondary_gene_symbols() 2697 2698B<Description:> Retrieves secondary gene symbols, delimited by "$;" and sorted lexicographically 2699 2700B<Parameters:> none 2701 2702B<Returns:> string 2703 2704=back 2705 2706=cut 2707 2708 2709#### 2710sub get_secondary_gene_symbols { 2711 my ($gene_obj) = @_; 2712 return (sort @{$gene_obj->{secondary_gene_symbols}}); 2713} 2714 2715 2716 2717=over 4 2718 2719=item get_ec_numbers() 2720 2721B<Description:> Retrieves primary EC number followed by secondary EC numbers, "$;" delimited 2722 2723B<Parameters:> none 2724 2725B<Returns:> string 2726 2727 see $gene_obj->{ec_num} 2728 see $gene_obj->get_secondary_ec_numbers() 2729 2730=back 2731 2732=cut 2733 2734 ; 2735 2736#### 2737sub get_ec_numbers { 2738 my $gene_obj = shift; 2739 my @ec_numbers; 2740 if ($gene_obj->{ec_num}) { 2741 push (@ec_numbers, $gene_obj->{ec_num}); 2742 } 2743 if (my @secondary_ec_numbers = $gene_obj->get_secondary_ec_numbers()) { 2744 push (@ec_numbers, @secondary_ec_numbers); 2745 } 2746 my $ret_ec_numbers = join ("$;", @ec_numbers); 2747 return ($ret_ec_numbers); 2748} 2749 2750 2751 2752=over 4 2753 2754=item get_secondary_ec_numbers() 2755 2756B<Description:> Retrieves secondary EC numbers, "$;" delimited and sorted lexicographically 2757 2758B<Parameters:> none 2759 2760B<Returns:> string 2761 2762 2763=back 2764 2765=cut 2766 2767 2768#### 2769sub get_secondary_ec_numbers { 2770 my ($gene_obj) = @_; 2771 return (sort @{$gene_obj->{secondary_ec_numbers}}); 2772} 2773 2774 2775 2776=over 4 2777 2778=item add_secondary_gene_names() 2779 2780B<Description:> Adds secondary gene name(s) 2781 2782B<Parameters:> (gene_name_1, gene_name_2, ....) 2783 2784Single gene name or list of gene names is allowed 2785 2786 2787B<Returns:> none 2788 2789=back 2790 2791=cut 2792 2793 2794 2795#### 2796sub add_secondary_gene_names { 2797 my ($gene_obj, @gene_names) = @_; 2798 push (@{$gene_obj->{secondary_gene_names}}, @gene_names); 2799} 2800 2801 2802=over 4 2803 2804=item add_secondary_product_names() 2805 2806B<Description:> Adds secondary product names 2807 2808B<Parameters:> (product_name_1, product_name_2, ...) 2809 2810Single or list of product names as parameter 2811 2812B<Returns:> none 2813 2814Primary gene name added directly as an attribute like so 2815 $gene_obj->{gene_name} = name 2816 2817=back 2818 2819=cut 2820 2821 2822#### 2823sub add_secondary_product_names { 2824 my ($gene_obj, @product_names) = @_; 2825 &trim_leading_trailing_ws(\@product_names); 2826 push (@{$gene_obj->{secondary_product_names}}, @product_names); 2827} 2828 2829 2830=over 4 2831 2832=item add_secondary_gene_symbols() 2833 2834B<Description:> Add secondary gene symbols 2835 2836B<Parameters:> (gene_symbol_1, gene_symbol_2, ...) 2837 2838String or list context 2839 2840B<Returns:> none 2841 2842Primary gene_symbol added directly as attribute like so: 2843 $gene_obj->{gene_sym} = symbol 2844 2845=back 2846 2847=cut 2848 2849 2850#### 2851sub add_secondary_gene_symbols { 2852 my ($gene_obj, @gene_symbols) = @_; 2853 &trim_leading_trailing_ws(\@gene_symbols); 2854 push (@{$gene_obj->{secondary_gene_symbols}}, @gene_symbols); 2855} 2856 2857 2858 2859 2860 2861=over 4 2862 2863=item add_secondary_ec_numbers() 2864 2865B<Description:> Add secondary Enzyme Commission (EC) numbers 2866 2867B<Parameters:> (EC_1, EC_2, ...) 2868 2869String or list context 2870 2871B<Returns:> none 2872 2873 2874Primary EC number added directly as an attribute like so: 2875 $gene_obj->{ec_num} = EC_number 2876 2877=back 2878 2879=cut 2880 2881 2882#### 2883sub add_secondary_ec_numbers { 2884 my ($gene_obj, @ec_numbers) = @_; 2885 &trim_leading_trailing_ws(\@ec_numbers); 2886 push (@{$gene_obj->{secondary_ec_numbers}}, @ec_numbers); 2887} 2888 2889#### 2890sub to_alignment_GFF3_format { 2891 my ($gene_obj, $id, $target, $source) = @_; 2892 2893 unless (defined $source) { 2894 $source = "."; 2895 } 2896 2897 ## Note, only examines gene_obj and doesn't go deeper into alt-splicing layers, ... send isoforms in as separate objs. 2898 2899 unless ( (ref $gene_obj) && defined($id) && defined($target)) { 2900 croak "Error, need gene_obj, id, and target names as params"; 2901 } 2902 2903 my $gff3_alignment_text = ""; 2904 2905 my $orient = $gene_obj->get_orientation(); 2906 my $scaff = $gene_obj->{asmbl_id}; 2907 2908 my @exons = sort {$a->{end5}<=>$b->{end5}} $gene_obj->get_exons(); 2909 2910 if ($orient eq '-') { 2911 @exons = reverse @exons; 2912 } 2913 2914 2915 my $match_lend = 0; 2916 2917 foreach my $exon (@exons) { 2918 2919 my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords(); 2920 2921 my $m_lend = $match_lend + 1; 2922 my $m_rend = $match_lend + ($rend - $lend + 1); 2923 2924 2925 $gff3_alignment_text .= join("\t", $scaff, $source, "match", $lend, $rend, "100", $orient, '.', # giving everything 100% identity since genome-based 2926 "ID=$id;Target=$target $m_lend $m_rend +") . "\n"; 2927 2928 2929 $match_lend = $m_rend; 2930 2931 2932 } 2933 2934 return($gff3_alignment_text); 2935} 2936 2937 2938 2939 2940#### 2941sub to_transcript_GTF_format { 2942 my ($gene_obj) = @_; 2943 2944 ## no worries about protein-coding regions. Only report transcripts and exons tied to a particular gene. 2945 ## used with cufflinks package for computing FPKM values 2946 2947 my $gtf_text = ""; 2948 2949 foreach my $gene ($gene_obj, $gene_obj->get_additional_isoforms()) { 2950 2951 my $gene_id = $gene->{TU_feat_name} || ""; 2952 my $transcript_id = $gene->{Model_feat_name} || ""; 2953 my $asmbl_id = $gene_obj->{asmbl_id}; 2954 my ($lend, $rend) = sort {$a<=>$b} $gene_obj->get_transcript_span(); 2955 my $orientation = $gene_obj->get_orientation(); 2956 2957 my $com_name = $gene_obj->{com_name} || ""; 2958 $com_name =~ s/;/_/g; 2959 $com_name =~ s/\"//g; 2960 2961 2962 if ($gene->{gene_type} eq "protein-coding") { 2963 my @exons = $gene->get_exons(); 2964 2965 $gtf_text .= join("\t", $asmbl_id, ".", "transcript", $lend, $rend, ".", $orientation, ".", 2966 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; name \"$com_name\";") . "\n"; 2967 2968 foreach my $exon (@exons) { 2969 my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords(); 2970 2971 $gtf_text .= join("\t", $asmbl_id, ".", "exon", $lend, $rend, ".", $orientation, ".", 2972 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\";") . "\n"; 2973 2974 } 2975 2976 } 2977 else { 2978 2979 ## non-protein-coding features 2980 $gtf_text .= join("\t", $asmbl_id, ".", $gene->{gene_type}, $lend, $rend, ".", $orientation, ".", 2981 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; name \"$com_name\";") . "\n"; 2982 2983 2984 } 2985 2986 $gtf_text .= "\n"; 2987 } 2988 2989 2990 return($gtf_text); 2991} 2992 2993 2994 2995=over 4 2996 2997=item to_GTF_format() 2998 2999B<Description:> Outputs text corresponding to the representation of the gene in GTF format. 3000 3001B<Parameters:> $genome_seq_ref, %preferences 3002 3003B<Returns:> string 3004 3005 3006GTF format is described in "Current Protocols in Bioinformatics(2003)" 4.8.1-4.8.19 3007in "Using TWINSCAN to Predict Gene Structures in Genomic DNA Sequences". 3008 3009Each line of the GTF format includes the following tab-delimited fields: 3010 3011[seqname] [source] [feature] [start] [end] [score] [strand] [frame] [attributes] 3012 3013This is further elaborated below: 3014 3015[feature] contains one of the following: start_codon, stop_codon, CDS 3016[attributes] contains 'gene_id' and 'transcript_id' fields. All features of the same transcript should share the same transcript_id value. By default, the TU_feat_name and model_feat_name are used as the gene_id and transcript_id, respectively. 3017 3018 3019 Using the %preferences input parameter, the preferred values or gene attributes can be used for seqname, source, gene_id, or transcript_id, each used as a key to the %preferences hash. Given the value of %preferences is a gene attribute, that attribute value will be used, otherwise, the raw value will be used. 3020 3021For example: %preferences = ( seqname => 'mySeqname', 3022 gene_id => 'pub_locus' ); 3023 3024Would result in 'mySeqname' used in the [seqname] field, and the $gene_obj->{pub_locus} value 3025 3026Here are the defaults: 3027[seqname] = asmbl_id 3028[source] = annotation 3029gene_id (TU_feat_name) 3030transcript_id (Model_feat_name) 3031 3032** Partial Genes are NOT Supported ** ( undef is returned ) 3033** Genes with split start or stop codons are unsupported ** (undef is returned) 3034 3035=back 3036 3037=cut 3038 3039 ; 3040 3041sub to_GTF_format { 3042 my $gene_obj = shift; 3043 my ($genome_seq_ref, %preferences) = @_; 3044 3045 unless (ref $genome_seq_ref) { 3046 confess "Error, need genome seq reference as param"; 3047 } 3048 3049 my $is_pseudogene = $gene_obj->is_pseudogene(); 3050 3051 3052 my $TU_feat_name = $gene_obj->{TU_feat_name}; 3053 my $model_feat_name = $gene_obj->{Model_feat_name}; 3054 3055 # rid whitespace in identifiers 3056 $TU_feat_name =~ s/\s+/_/g; 3057 $model_feat_name =~ s/\s+/_/g; 3058 3059 my $seqname = $preferences{seqname} || $gene_obj->{asmbl_id}; 3060 my $source = $preferences{source} || $gene_obj->{source} || "."; 3061 3062 my $gene_id; 3063 if (my $token = $preferences{gene_id}) { 3064 $gene_id = $gene_obj->{$token}; 3065 } else { 3066 $gene_id = $TU_feat_name; 3067 } 3068 3069 my $transcript_id; 3070 if (my $token = $preferences{model_id}) { 3071 $transcript_id = $gene_obj->{$token}; 3072 } else { 3073 $transcript_id = $model_feat_name; 3074 } 3075 3076 my @exons = $gene_obj->get_exons(); 3077 my $orientation = $gene_obj->get_orientation(); 3078 my @gtf_text; 3079 3080 my $gene_obj_for_gtf = $gene_obj; #if got stop codon, will need to strip it off. 3081 my $com_name = $gene_obj->{com_name}; 3082 $com_name =~ s/\s+$// if $com_name; 3083 $com_name =~ s/[\"\']//g if $com_name; 3084 3085 my $name_txt = ($com_name) ? "Name \"$com_name\";" : ""; 3086 3087 3088 ## Gene record 3089 unless ($preferences{'gene_record_already_done'}) { 3090 3091 my ($gene_lend, $gene_rend) = sort {$a<=>$b} $gene_obj->get_gene_span(); 3092 3093 push (@gtf_text, [$seqname, 3094 $source, 3095 "gene", 3096 $gene_lend, 3097 $gene_rend, 3098 "0", 3099 $orientation, 3100 ".", 3101 "gene_id \"$gene_id\"; $name_txt"]); 3102 } 3103 3104 ## Transcript record 3105 my ($trans_lend, $trans_rend) = sort {$a<=>$b} $gene_obj->get_transcript_span(); 3106 push (@gtf_text, [$seqname, 3107 $source, 3108 "transcript", 3109 $trans_lend, 3110 $trans_rend, 3111 "0", 3112 $orientation, 3113 ".", 3114 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]); 3115 3116 unless ($is_pseudogene) { 3117 $gene_obj->set_CDS_phases($genome_seq_ref); 3118 3119 3120 ## check for start and stop codons. 3121 my $cds_seq = uc $gene_obj->create_CDS_sequence($genome_seq_ref); 3122 my @stop_codons = &Nuc_translator::get_stop_codons(); 3123 3124 my $first_CDS_segment = $gene_obj->get_first_CDS_segment(); 3125 my $first_phase = $first_CDS_segment->get_phase(); 3126 my $cds_is_integral_codon_num = (length($cds_seq) % 3 == 0) ? 1 : 0; 3127 3128 ## examine start codon: 3129 my $init_codon = substr($cds_seq, 0, 3); 3130 if ($first_phase == 0 && $init_codon eq 'ATG') { # got start codon. 3131 my @start_coordsets = $gene_obj->get_start_codon_coordinates(); 3132 foreach my $start_pair (@start_coordsets) { 3133 my ($start_lend, $start_rend) = sort {$a<=>$b} @$start_pair; 3134 push (@gtf_text, [$seqname, 3135 $source, 3136 "start_codon", 3137 $start_lend, 3138 $start_rend, 3139 "0", 3140 $orientation, 3141 "0", 3142 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]); 3143 } 3144 } 3145 3146 my $candidate_stop_codon = uc substr($cds_seq, length($cds_seq) - 3, 3); 3147 my @found_stop = grep { $_ eq $candidate_stop_codon } @stop_codons; 3148 3149 if (@found_stop) { 3150 # got a stop codon. 3151 # check to see that the stop codon is in-frame. 3152 if ((length($cds_seq) - $first_phase) % 3 == 0) { # yes, stop is in frame. 3153 3154 my @stop_codon_coords = $gene_obj->get_stop_codon_coords(); 3155 foreach my $stop_pair (@stop_codon_coords) { 3156 my ($stop_lend, $stop_rend) = sort {$a<=>$b} @$stop_pair; 3157 3158 push (@gtf_text, [$seqname, 3159 $source, 3160 "stop_codon", 3161 $stop_lend, 3162 $stop_rend, 3163 "0", 3164 $orientation, 3165 "0", 3166 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]); 3167 } 3168 3169 $gene_obj_for_gtf = $gene_obj->clone_gene(); 3170 3171 $gene_obj_for_gtf->trim_stop_codon(); 3172 3173 } 3174 } 3175 3176 } 3177 3178 ## report the exons and CDS regions: 3179 foreach my $exon ($gene_obj_for_gtf->get_exons()) { 3180 3181 my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords(); 3182 3183 push (@gtf_text, [$seqname, 3184 $source, 3185 "exon", 3186 $exon_lend, 3187 $exon_rend, 3188 "0", 3189 $orientation, 3190 ".", 3191 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]); 3192 3193 3194 3195 my $cds = ($is_pseudogene) ? $exon : $exon->get_CDS_exon_obj(); 3196 3197 if ($cds) { 3198 my $phase = "."; 3199 unless ($is_pseudogene) { 3200 $phase = $cds->get_phase(); 3201 if ($phase) { 3202 $phase = ($phase == 1) ? 2 : 1; # reverse it according to GFF3 vs. GTF representation. 3203 } 3204 } 3205 3206 my ($cds_lend, $cds_rend) = sort {$a<=>$b} $cds->get_coords(); 3207 3208 push (@gtf_text, [$seqname, 3209 $source, 3210 "CDS", 3211 $cds_lend, 3212 $cds_rend, 3213 "0", 3214 $orientation, 3215 "$phase", 3216 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]); 3217 } 3218 3219 3220 } 3221 3222 unless ($is_pseudogene) { 3223 3224 ## Get UTR info: 3225 { 3226 for my $pair ($gene_obj->get_3prime_UTR_coords) { 3227 my ($lend,$rend) = sort {$a<=>$b} @$pair; 3228 push (@gtf_text, [$seqname, 3229 $source, 3230 "3UTR", 3231 $lend, 3232 $rend, 3233 "0", 3234 $orientation, 3235 "0", 3236 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"] ); 3237 } 3238 for my $pair ($gene_obj->get_5prime_UTR_coords) { 3239 my ($lend,$rend) = sort {$a<=>$b} @$pair; 3240 push (@gtf_text, [$seqname, 3241 $source, 3242 "5UTR", 3243 $lend, 3244 $rend, 3245 "0", 3246 $orientation, 3247 "0", 3248 "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt" ] ); 3249 } 3250 } 3251 3252 } 3253 3254 @gtf_text = sort {$a->[3] <=> $b->[3]} @gtf_text; 3255 3256 if ($orientation eq '-') { 3257 @gtf_text = reverse @gtf_text; 3258 } 3259 3260 my $GTF = ""; 3261 foreach my $gtf_row (@gtf_text) { 3262 $GTF .= join ("\t", @$gtf_row) . "\n"; 3263 } 3264 3265 foreach my $isoform ($gene_obj->get_additional_isoforms()) { 3266 my %iso_pref = %preferences; 3267 $iso_pref{'gene_record_already_done'} = 1; 3268 $GTF .= "\n" . $isoform->to_GTF_format($genome_seq_ref, %iso_pref); 3269 } 3270 3271 return ($GTF); 3272} 3273 3274 3275 3276 3277 3278#### 3279sub get_start_codon_coordinates { 3280 my $gene_obj = shift; 3281 3282 my $orient = $gene_obj->get_orientation(); 3283 3284 ## just want the coordinate pairs that define the first three CDS bases. 3285 3286 my @cds_coords; 3287 foreach my $exon ($gene_obj->get_exons()) { 3288 if (my $cds = $exon->get_CDS_exon_obj()) { 3289 my ($cds_end5, $cds_end3) = $cds->get_coords(); 3290 push (@cds_coords, [$cds_end5, $cds_end3]); 3291 } 3292 } 3293 3294 my @start_coords; 3295 my $start_len_want = 3; 3296 foreach my $cds_coordpair (@cds_coords) { 3297 my ($cds_end5, $cds_end3) = @$cds_coordpair; 3298 my $cds_seg_len = abs ($cds_end3 - $cds_end5) + 1; 3299 3300 my $extract_len = ($cds_seg_len < $start_len_want) ? $cds_seg_len : $start_len_want; 3301 if ($orient eq '+') { 3302 push (@start_coords, [$cds_end5, $cds_end5 + $extract_len - 1]); 3303 } 3304 else { 3305 push (@start_coords, [$cds_end5, $cds_end5 - $extract_len + 1]); 3306 } 3307 $start_len_want -= $extract_len; 3308 3309 if ($start_len_want <= 0) { last; } 3310 } 3311 3312 if ($start_len_want > 0) { 3313 confess "Error, trouble extracting start codon coordinates from cds coordsets: " . Dumper (\@cds_coords); 3314 } 3315 3316 return (@start_coords); 3317} 3318 3319 3320 3321 3322#### 3323sub get_stop_codon_coords { 3324 my $gene_obj = shift; 3325 3326 my $orient = $gene_obj->get_orientation(); 3327 3328 ## just want the coordinate pairs that define the last three CDS bases. 3329 3330 my @cds_coords; 3331 foreach my $exon (reverse $gene_obj->get_exons()) { 3332 if (my $cds = $exon->get_CDS_exon_obj()) { 3333 my ($cds_end5, $cds_end3) = $cds->get_coords(); 3334 push (@cds_coords, [$cds_end5, $cds_end3]); 3335 } 3336 } 3337 3338 my @stop_coords; 3339 my $stop_len_want = 3; 3340 foreach my $cds_coordpair (@cds_coords) { 3341 my ($cds_end5, $cds_end3) = @$cds_coordpair; 3342 my $cds_seg_len = abs ($cds_end3 - $cds_end5) + 1; 3343 3344 my $extract_len = ($cds_seg_len < $stop_len_want) ? $cds_seg_len : $stop_len_want; 3345 if ($orient eq '+') { 3346 push (@stop_coords, [$cds_end3 - $extract_len + 1, $cds_end3]); 3347 } 3348 else { 3349 push (@stop_coords, [$cds_end3, $cds_end3 + $extract_len - 1]); 3350 } 3351 $stop_len_want -= $extract_len; 3352 3353 if ($stop_len_want <= 0) { last; } 3354 } 3355 3356 if ($stop_len_want > 0) { 3357 confess "Error, trouble extracting stop codon coordinates from cds coordsets: " . Dumper (\@cds_coords); 3358 } 3359 3360 3361 return (@stop_coords); 3362 3363} 3364 3365 3366#### 3367sub trim_stop_codon { 3368 my $gene_obj = shift; 3369 3370 ## just trimming the last three bases from the CDS's, changing the current gene object. 3371 3372 my @exons = reverse $gene_obj->get_exons(); 3373 3374 my $orient = $gene_obj->get_orientation(); 3375 3376 my $stop_len_want = 3; 3377 foreach my $exon (@exons) { 3378 if (my $cds = $exon->get_CDS_exon_obj()) { 3379 3380 my ($cds_end5, $cds_end3) = $cds->get_coords(); 3381 my $cds_seg_len = abs ($cds_end3 - $cds_end5) + 1; 3382 3383 my $extract_len = ($cds_seg_len < $stop_len_want) ? $cds_seg_len : $stop_len_want; 3384 3385 if ($cds_seg_len == $extract_len) { 3386 # delete it! 3387 $exon->delete_CDS_exon_obj(); 3388 } 3389 else { 3390 ## truncate it by extract_len 3391 if ($orient eq '+') { 3392 $cds->{end3} -= $extract_len; 3393 } 3394 3395 else { 3396 $cds->{end3} += $extract_len; 3397 } 3398 } 3399 $stop_len_want -= $extract_len; 3400 3401 if ($stop_len_want <= 0) { last; } 3402 } 3403 } 3404 if ($stop_len_want > 0) { 3405 confess "Error, trouble extracting all stop codon coordinates from cds coordsets. " . $gene_obj->toString(); 3406 } 3407 3408 return; 3409 3410} 3411 3412 3413 3414=over 4 3415 3416=item to_GFF3_format() 3417 3418B<Description:> Outputs text corresponding to the representation of the gene in GFF3 format (still under development). 3419 3420B<Parameters:> 3421 3422B<Returns:> string 3423 3424GFF3 defined at: 3425http://song.sourceforge.net/gff3-jan04.shtml 3426 3427(some text lifted from above site provided below for reference purposes) 3428 3429The format consists of 9 columns, separated by tabs or spaces. The 3430following unescaped characters are allowed within fields: 3431[a-zA-Z0-9.:^*$@!+_?-]. All other characters must must be escaped 3432using the URL escaping conventions. Unescaped quotation marks, 3433backslashes and other ad-hoc escaping conventions that have been added 3434to the GFF format are explicitly forbidden. The =, ; and % characters 3435have reserved meanings as described below, and must be escaped when 3436used in other contexts. 3437 3438Undefined fields are replaced with the "." character, as described in 3439the original GFF spec. 3440 3441Column 1: "seqid" 3442 3443The ID of the landmark used to establish the coordinate system for the 3444current feature. IDs must contain alphanumeric characters. 3445Whitespace, if present, must be escaped using URL escaping rule 3446(e.g. space="%20" or "+"). Sequences must *NOT* begin with an 3447unescaped ">". 3448 3449Column 2: "source" 3450 3451The source of the feature. This is unchanged from the older GFF specs 3452and is not part of a controlled vocabulary. 3453 3454Column 3: "type" 3455 3456The type of the feature (previously called the "method"). This is 3457constrained to be either: (a) a term from the "lite" sequence 3458ontology, SOFA; or (b) a SOFA accession number. The latter 3459alternative is distinguished using the syntax SO:000000. 3460 3461Columns 4 & 5: "start" and "end" 3462 3463The start and end of the feature, in 1-based integer coordinates, 3464relative to the landmark given in column 1. Start is always less than 3465or equal to end. 3466 3467For zero-length features, such as insertion sites, start equals end 3468and the implied site is to the right of the indicated base. This 3469convention holds regardless of the strandedness of the feature. 3470 3471Column 6: "score" 3472 3473The score of the feature, a floating point number. As in earlier 3474versions of the format, the semantics of the score are ill-defined. 3475It is strongly recommended that E-values be used for sequence 3476similarity features, and that P-values be used for ab initio gene 3477prediction features. 3478 3479Column 7: "strand" 3480 3481The strand of the feature. + for positive strand (relative to the 3482landmark), - for minus strand, and . for features that are not 3483stranded. In addition, ? can be used for features whose strandedness 3484is relevant, but unknown. 3485 3486Column 8: "phase" 3487 3488For features of type "exon", the phase indicates where the feature 3489begins with reference to the reading frame. The phase is one of the 3490integers 0, 1,or 2, indicating that the first base of the feature 3491corresponds to the first, second or last base of the codon, 3492respectively. This is NOT to be confused with the frame, but relates 3493to the relative position of the translational start in whatever strand 3494the feature is in. 3495 3496Column 9: "attributes" 3497 3498A list of feature attributes in the format tag=value. Multiple 3499tag=value pairs are separated by semicolons. URL escaping rules are 3500used for tags or values containing the following characters: ",=;". 3501Whitespace should be replaced with the "+" character or the %20 URL 3502escape. This will allow the file to survive text processing programs 3503that convert tabs into spaces. 3504 3505These tags have predefined meanings: 3506 3507 ID Indicates the name of the feature. IDs must be unique 3508 within the scope of the GFF file. 3509 3510 Name Display name for the feature. This is the name to be 3511 displayed to the user. Unlike IDs, there is no requirement 3512 that the Name be unique within the file. 3513 3514 Alias A secondary name for the feature. It is suggested that 3515 this tag be used whenever a secondary identifier for the 3516 feature is needed, such as locus names and 3517 accession numbers. Unlike ID, there is no requirement 3518 that Alias be unique within the file. 3519 3520 Parent Indicates the parent of the feature. A parent ID can be 3521 used to group exons into transcripts, transcripts into 3522 genes, an so forth. A feature may have multiple parents. 3523 3524 Target Indicates the target of a nucleotide-to-nucleotide or 3525 protein-to-nucleotide alignment. The format of the 3526 value is "target_id+start+end". 3527 3528 Gap The alignment of the feature to the target if the two are 3529 not colinear (e.g. contain gaps). The alignment format is 3530 taken from the CIGAR format described in the 3531 Exonerate documentation. 3532 (http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/exonerate 3533 ?cvsroot=Ensembl). See "THE GAP ATTRIBUTE" for a description 3534 of this format. 3535 3536 Note A free text note. 3537 3538 Dbxref A database cross reference. See the section 3539 "Ontology Associations and Db Cross References" for 3540 details on the format. 3541 3542 Ontology_term A cross reference to an ontology term. See 3543 the section "Ontology Associations and Db Cross References" 3544 for details. 3545 3546Multiple attributes of the same type are indicated by separating the 3547values with the comma "," character, as in: 3548 3549 Parent=AF2312,AB2812,abc-3 3550 3551Note that attribute names are case sensitive. "Parent" is not the 3552same as "parent". 3553 3554All attributes that begin with an uppercase letter are reserved for 3555later use. Attributes that begin with a lowercase letter can be used 3556freely by applications. 3557 3558 3559 3560=back 3561 3562=cut 3563 3564 ; 3565 3566 3567 3568sub to_GFF3_format { 3569 my ($gene_obj, %preferences) = @_; 3570 3571 my $gene_id = $gene_obj->{TU_feat_name}; 3572 3573 my $strand = $gene_obj->get_orientation(); 3574 3575 my @noteText; 3576 3577 if ($gene_obj->{is_pseudogene}) { 3578 push (@noteText, "(pseudogene)"); 3579 } 3580 3581 ## parse preferences 3582 my $asmbl_id = $preferences{seqid} || $gene_obj->{asmbl_id}; 3583 my $source = $preferences{source} || $gene_obj->{source} || "."; 3584 3585 unless (defined $asmbl_id) { 3586 confess "Error, no asmbl_id from gene_obj\n"; 3587 } 3588 3589 3590 my ($gene_lend, $gene_rend) = sort {$a<=>$b} $gene_obj->get_gene_span(); 3591 my $com_name = $gene_obj->{com_name}; 3592 unless ($com_name =~ /\w/) { 3593 $com_name = ""; 3594 } 3595 3596 if ($com_name) { 3597 # uri escape it: 3598 $com_name = uri_escape($com_name); 3599 } 3600 3601 my $gene_alias = ""; 3602 if (my $pub_locus = $gene_obj->{pub_locus}) { 3603 $gene_alias = "Alias=$pub_locus;"; 3604 } 3605 3606 my $feat_type = ($gene_obj->{gene_type} eq "protein-coding") ? "gene" : $gene_obj->{gene_type}; 3607 3608 3609 my $gff3_text = "$asmbl_id\t$source\t$feat_type\t$gene_lend\t$gene_rend\t.\t$strand\t.\tID=$gene_id;Name=$com_name;$gene_alias\n"; ## note, non-coding gene features are currently represented by a simple single coordinate pair. 3610 3611 if ($gene_obj->{gene_type} eq "protein-coding") { 3612 3613 my $gene_obj_ref = $gene_obj; 3614 3615 foreach my $gene_obj ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms() ) { 3616 3617 my $model_id = $gene_obj->{Model_feat_name}; 3618 my $model_alias = ""; 3619 if (my $model_locus = $gene_obj->{Model_pub_locus}) { 3620 $model_alias = "Alias=$model_locus;"; 3621 } 3622 3623 my ($mrna_lend, $mrna_rend) = $gene_obj->get_transcript_span(); 3624 3625 $gff3_text .= "$asmbl_id\t$source\tmRNA\t$mrna_lend\t$mrna_rend\t.\t$strand\t.\tID=$model_id;Parent=$gene_id;Name=$com_name;$model_alias\n"; 3626 3627 ## mark the first and last CDS entries (for now, an unpleasant hack!) 3628 my @exons = $gene_obj->get_exons(); 3629 ## find the first cds 3630 foreach my $exon (@exons) { 3631 if (my $cds = $exon->get_CDS_obj()) { 3632 $cds->{first_cds} = 1; 3633 last; 3634 } 3635 } 3636 @exons = reverse @exons; 3637 foreach my $exon (@exons) { 3638 if (my $cds = $exon->get_CDS_obj()) { 3639 $cds->{last_cds} = 1; 3640 last; 3641 } 3642 } 3643 3644 my $prime5_partial = $gene_obj->is_5prime_partial(); 3645 my $prime3_partial = $gene_obj->is_3prime_partial(); 3646 3647 3648 ## annotate 5' utr 3649 if ($gene_obj->has_CDS() && $gene_obj->has_5prime_UTR()) { 3650 my @prime5_utr = $gene_obj->get_5prime_UTR_coords(); 3651 if (@prime5_utr) { 3652 my $utr_count = 0; 3653 foreach my $coordset (@prime5_utr) { 3654 my ($lend, $rend) = sort {$a<=>$b} @$coordset; 3655 $utr_count++; 3656 my $utr_id = "$model_id.utr5p$utr_count"; 3657 $gff3_text .= "$asmbl_id\t$source\tfive_prime_UTR\t$lend\t$rend\t.\t$strand\t.\tID=$utr_id;Parent=$model_id\n"; 3658 } 3659 } 3660 } 3661 3662 3663 my $exon_counter = 0; 3664 foreach my $exon ($gene_obj->get_exons()) { 3665 $exon_counter++; 3666 my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords(); 3667 my $exon_ID_string = ""; 3668 if (my $exon_feat_name = $exon->{feat_name}) { 3669 $exon_ID_string = "$exon_feat_name"; 3670 } 3671 else { 3672 $exon_ID_string = "$model_id.exon$exon_counter"; 3673 } 3674 $gff3_text .= "$asmbl_id\t$source\texon\t$exon_lend\t$exon_rend\t.\t$strand\t.\tID=${exon_ID_string};Parent=$model_id\n"; 3675 3676 if (my $cds_obj = $exon->get_CDS_obj()) { 3677 my ($cds_lend, $cds_rend) = sort {$a<=>$b} $cds_obj->get_coords(); 3678 my $phase = $cds_obj->{phase}; 3679 if (defined($phase)) { 3680 ## use GFF3 definition of phase, which is how many bases to trim before encountering first base of start 3681 if ($phase == 2) { 3682 $phase = 1; 3683 } 3684 elsif ($phase == 1) { 3685 $phase = 2; 3686 } 3687 # phase 0 remains 0 3688 } 3689 else { 3690 $phase = "."; #use phase info if avail 3691 } 3692 3693 3694 my $cds_ID_string = "cds.$model_id"; 3695 3696 # according to the GFF3 spec, CDS segments from the same coding region should have the same identifier. 3697 #if (my $cds_feat_name = $cds_obj->{feat_name}) { 3698 # $cds_ID_string = "$cds_feat_name"; 3699 #} 3700 #else { 3701 # $cds_ID_string = "$model_id.cds$exon_counter"; 3702 #} 3703 3704 my $partial_text = ""; 3705 if ($prime5_partial && $cds_obj->{first_cds}) { 3706 $partial_text .= ";5_prime_partial=true"; 3707 } 3708 if ($prime3_partial && $cds_obj->{last_cds}) { 3709 $partial_text .= ";3_prime_partial=true"; 3710 } 3711 3712 $gff3_text .= "$asmbl_id\t$source\tCDS\t$cds_lend\t$cds_rend\t.\t$strand\t$phase\tID=${cds_ID_string};Parent=$model_id$partial_text\n"; 3713 } 3714 } 3715 3716 ## annotate 3' utr 3717 if ($gene_obj->has_CDS() && $gene_obj->has_3prime_UTR()) { 3718 my @prime3_utr = $gene_obj->get_3prime_UTR_coords(); 3719 if (@prime3_utr) { 3720 my $utr_count = 0; 3721 foreach my $coordset (@prime3_utr) { 3722 my ($lend, $rend) = sort {$a<=>$b} @$coordset; 3723 $utr_count++; 3724 my $utr_id = "$model_id.utr3p$utr_count"; 3725 $gff3_text .= "$asmbl_id\t$source\tthree_prime_UTR\t$lend\t$rend\t.\t$strand\t.\tID=$utr_id;Parent=$model_id\n"; 3726 } 3727 } 3728 3729 } 3730 } 3731 3732 } ## end of protein-coding genes 3733 3734 3735 ## strip off any trailing whitespace and semicolons: 3736 my @lines = split (/\n/, $gff3_text); 3737 foreach my $line (@lines) { 3738 $line =~ s/\s+$//; 3739 $line =~ s/;$//; 3740 } 3741 3742 $gff3_text = join ("\n", @lines) . "\n"; 3743 3744 return ($gff3_text); 3745 3746} 3747 3748 3749 3750=over 4 3751 3752=item to_BED_format() 3753 3754B<Description:> describes gene in BED format 3755B<Parameters:> (uri_encode => 1|0) 3756B<Returns:> string 3757 3758 3759BED format described here: 3760http://genome.ucsc.edu/FAQ/FAQformat.html#format1 3761 3762 BED format 3763 3764 3765 3766 3767BED format provides a flexible way to define the data lines that are displayed in an annotation track. BED lines have three required fields and nine additional optional fields. The number of fields per line must be consistent throughout any single set of data in an annotation track. The order of the optional fields is binding: lower-numbered fields must always be populated if higher-numbered fields are used. 3768 3769The first three required BED fields are: 3770 37711. chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671). 3772 37732. chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. 3774 37753. chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. 3776 3777The 9 additional optional BED fields are: 3778 37794. name - Defines the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode. 3780 37815. score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). This table shows the Genome Browsers translation of BED score values into shades of gray 3782 37836. strand - Defines the strand - either '+' or '-'. 3784 37857. thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays). 3786 37878. thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays). 3788 37899. itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser. 3790 379110. blockCount - The number of blocks (exons) in the BED line. 3792 379311. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. 3794 379512. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. 3796 3797Example: 3798Heres an example of an annotation track that uses a complete BED definition: 3799track name=pairedReads description="Clone Paired Reads" useScore=1 3800chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512 3801chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601 3802 3803 3804 3805 3806=cut 3807 3808 3809sub to_BED_format { 3810 my $self = shift; 3811 my %params = @_; 3812 3813 my $strand = $self->get_strand(); 3814 3815 my ($coding_lend, $coding_rend) = sort {$a<=>$b} $self->get_CDS_span(); 3816 3817 my $scaffold = $self->{asmbl_id}; 3818 3819 my $gene_id = $self->{TU_feat_name}; 3820 my $trans_id = $self->{Model_feat_name}; 3821 3822 my $com_name = $self->{com_name} || ""; 3823 3824 my $score = $params{score} || 0; 3825 3826 if (my $alias = $self->{pub_locus}) { 3827 $com_name = "Alias=$alias;$com_name"; 3828 } 3829 3830 3831 if ($gene_id) { 3832 $com_name = "$gene_id;$com_name"; 3833 } 3834 3835 if ($trans_id) { 3836 $com_name = "ID=$trans_id;$com_name"; 3837 } 3838 else { 3839 $com_name = "ID=$com_name"; 3840 } 3841 3842 if ($params{uri_encode}) { 3843 $com_name = uri_escape($com_name); 3844 } 3845 3846 3847 my @exons = sort {$a->{end5}<=>$b->{end5}} $self->get_exons(); 3848 3849 my @exon_coords; 3850 foreach my $exon (@exons) { 3851 3852 my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords(); 3853 push (@exon_coords, [$exon_lend, $exon_rend]); 3854 } 3855 3856 3857 my @starts; 3858 my @lengths; 3859 3860 my $gene_lend = $exon_coords[0]->[0]; 3861 my $gene_rend = $exon_coords[$#exon_coords]->[1]; 3862 3863 foreach my $exon_coordset (@exon_coords) { 3864 my ($exon_lend, $exon_rend) = @$exon_coordset; 3865 3866 my $start = $exon_lend - $gene_lend; 3867 push (@starts, $start); 3868 3869 my $length = $exon_rend - $exon_lend + 1; 3870 push (@lengths, $length); 3871 } 3872 3873 3874 ## construct bed output. 3875 3876 $com_name =~ s/ /_/g; 3877 3878 my $bed_line = join("\t", $scaffold, 3879 $gene_lend-1, $gene_rend, 3880 $com_name, 3881 $score, 3882 $strand, 3883 $coding_lend-1, $coding_rend, 3884 "0", # rgb info - use '.' to allow user customization in IGV. Need 0 for compatibility with UCSC browser. 3885 scalar(@lengths), 3886 join(",", @lengths), 3887 join(",", @starts) 3888 ) . "\n"; 3889 3890 foreach my $isoform ($self->get_additional_isoforms()) { 3891 $bed_line .= $isoform->to_BED_format(%params); 3892 } 3893 3894 return($bed_line); 3895} 3896 3897 3898 3899# static method, returns gene object. 3900sub BED_line_to_gene_obj { 3901 my ($bed_line) = @_; 3902 3903 if (ref $bed_line) { 3904 confess "Error, static method, just provide bed text line, returns gene_obj"; 3905 } 3906 3907 3908 my @x = split(/\t/, $bed_line); 3909 3910 my $scaff = $x[0]; 3911 my $gene_lend = $x[1] + 1; 3912 my $gene_rend = $x[2]; 3913 3914 my $com_name = $x[3]; 3915 3916 my $score = $x[4]; 3917 my $orient = $x[5]; 3918 3919 if ($orient eq '*') { 3920 $orient = '+'; 3921 } 3922 3923 3924 my $coding_lend = $x[6] + 1; 3925 my $coding_rend = $x[7]; 3926 3927 my $rgb_color = $x[8]; 3928 3929 my $num_exons = $x[9]; 3930 3931 my $lengths_text = $x[10]; 3932 my $exon_relative_starts_text = $x[11]; 3933 3934 my @lengths = split(/,/, $lengths_text); 3935 my @exon_relative_starts = split(/,/, $exon_relative_starts_text); 3936 3937 my @exons; 3938 3939 while (@lengths) { 3940 my $len = shift @lengths; 3941 my $start = shift @exon_relative_starts; 3942 3943 my $exon_lend = $gene_lend + $start; 3944 my $exon_rend = $exon_lend + $len - 1; 3945 3946 3947 print "Len: $len, start=$start ====> $exon_lend - $exon_rend\n" if $DEBUG; 3948 3949 push (@exons, [$exon_lend, $exon_rend]); 3950 3951 } 3952 3953 3954 print "Coding: $coding_lend-$coding_rend, Exons: " . Dumper (\@exons) if $DEBUG; 3955 3956 my $gene_obj = new Gene_obj(); 3957 $gene_obj->build_gene_obj_exons_n_cds_range(\@exons, $coding_lend, $coding_rend, $orient); 3958 3959 $gene_obj->{com_name} = $com_name; 3960 $gene_obj->{asmbl_id} = $scaff; 3961 3962 $com_name =~ s/\s+/\|/g; # reformat as an identifier with no whitespace 3963 3964 $gene_obj->{TU_feat_name} = "$com_name"; 3965 $gene_obj->{Model_feat_name} = "m.$com_name"; 3966 3967 return($gene_obj); 3968 3969 3970} 3971 3972 3973 3974 3975 3976 3977## Private, remove leading and trailing whitespace characters: 3978sub trim_leading_trailing_ws { 3979 my ($ref) = @_; 3980 if (ref $ref eq "SCALAR") { 3981 $$ref =~ s/^\s+|\s+$//g; 3982 } elsif (ref $ref eq "ARRAY") { 3983 foreach my $element (@$ref) { 3984 $element =~ s/^\s+|\s+$//g; 3985 } 3986 } else { 3987 my $type = ref $ref; 3988 die "Currently don't support trim_leading_trailing_ws(ref type: $type)\n"; 3989 } 3990} 3991 3992 3993 3994=over 4 3995 3996=item to_GTF2_format() 3997 3998B<Description:> provides gene in GTF2 format 3999 4000B<Parameters:> genome_seq_ref, [properties_href] 4001 4002B<Returns:> text 4003 4004 4005properties_href encodes preferences like so 4006 4007 properties_href = { 4008 seqname => tigr_asmbl_id_1000, # by default, asmbl_id is used as encoded in gene_obj 4009 4010 source => MyGenePrediction, # by default, set to "TIGR" 4011 4012 include_comments => 0, # turned on by default, indicating partial or pseudogenes with preceding comment lines 4013 4014 } 4015 4016 4017 4018The GTF2 format is described here: 4019http://genes.cs.wustl.edu/GTF2.html 4020 4021as follows: 4022 4023GTF2 format (Revised Ensembl GTF) 4024Gene transfer format. This borrows from GFF, but has additional structure that warrants a separate definition and format name. 4025NEW! Validating Parser for GTF 4026 4027Structure is as GFF, so the fields are: 4028<seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments] 4029 4030Here is a simple example with 3 translated exons. Order of rows is not important. 4031 4032AB000381 Twinscan CDS 380 401 . + 0 gene_id "001"; transcript_id "001.1"; 4033AB000381 Twinscan CDS 501 650 . + 2 gene_id "001"; transcript_id "001.1"; 4034AB000381 Twinscan CDS 700 707 . + 2 gene_id "001"; transcript_id "001.1"; 4035AB000381 Twinscan start_codon 380 382 . + 0 gene_id "001"; transcript_id "001.1"; 4036AB000381 Twinscan stop_codon 708 710 . + 0 gene_id "001"; transcript_id "001.1"; 4037 4038The whitespace in this example is provided only for readability. In GTF, fields must be separated by a single TAB and no white space. 4039 4040<seqname> 4041The FPC contig ID from the Golden Path. 4042 4043<source> 4044The source column should be a unique label indicating where the annotations came from --- typically the name of either a prediction program or a public database. 4045 4046<feature> 4047The following feature types are required: "CDS", "start_codon", "stop_codon". The feature "exon" is optional, since this project will not evaluate predicted splice sites outside of protein coding regions. All other features will be ignored. 4048 4049CDS represents the coding sequence starting with the first translated codon and proceeding to the last translated codon. Unlike Genbank annotation, the stop codon is not included in the CDS for the terminal exon. 4050 4051<start> <end> 4052Integer start and end coordinates of the feature relative to the beginning of the sequence named in <seqname>. <start> must be less than or equal to <end>. Sequence numbering starts at 1. Values of <start> and <end> that extend outside the reference sequence are technically acceptable, but they are discouraged for purposes of this project. 4053 4054<score> 4055The score field will not be used for this project, so you can either provide a meaningful float or replace it by a dot. 4056 4057<frame> 40580 indicates that the first whole codon of the reading frame is located at 5'-most base. 1 means that there is one extra base before the first codon and 2 means that there are two extra bases before the first codon. Note that the frame is not the length of the CDS mod 3. 4059 4060Here are the details excised from the GFF spec. Important: Note comment on reverse strand. 4061 4062 '0' indicates that the specified region is in frame, i.e. that its first base corresponds to the first base of a codon. '1' indicates that there is one extra base, i.e. that the second base of the region corresponds to the first base of a codon, and '2' means that the third base of the region is the first base of a codon. If the strand is '-', then the first base of the region is value of <end>, because the corresponding coding region will run from <end> to <start> on the reverse strand. 4063 4064[attributes] 4065All four features have the same two mandatory attributes at the end of the record: 4066 4067 * gene_id value; A globally unique identifier for the genomic source of the transcript 4068 * transcript_id value; A globally unique identifier for the predicted transcript. 4069 4070These attributes are designed for handling multiple transcripts from the same genomic region. Any other attributes or comments must appear after these two and will be ignored. 4071 4072Attributes must end in a semicolon which must then be separated from the start of any subsequent attribute by exactly one space character (NOT a tab character). 4073 4074Textual attributes should be surrounded by doublequotes. 4075 4076Here is an example of a gene on the negative strand. Larger coordinates are 5' of smaller coordinates. Thus, the start codon is 3 bp with largest coordinates among all those bp that fall within the CDS regions. Similarly, the stop codon is the 3 bp with coordinates just less than the smallest coordinates within the CDS regions. 4077 4078AB000123 Twinscan CDS 193817 194022 . - 2 gene_id "AB000123.1"; transcript_id "AB00123.1.2"; 4079AB000123 Twinscan CDS 199645 199752 . - 2 gene_id "AB000123.1"; transcript_id "AB00123.1.2"; 4080AB000123 Twinscan CDS 200369 200508 . - 1 gene_id "AB000123.1"; transcript_id "AB00123.1.2"; 4081AB000123 Twinscan CDS 215991 216028 . - 0 gene_id "AB000123.1"; transcript_id "AB00123.1.2"; 4082AB000123 Twinscan start_codon 216026 216028 . - . gene_id "AB000123.1"; transcript_id "AB00123.1.2"; 4083AB000123 Twinscan stop_codon 193814 193816 . - . gene_id "AB000123.1"; transcript_id "AB00123.1.2"; 4084 4085Note the frames of the coding exons. For example: 4086 4087 1. The first CDS (from 216028 to 215991) always has frame zero. 4088 2. Frame of the 1st CDS =0, length =38. (frame - length) % 3 = 1, the frame of the 2nd CDS. 4089 3. Frame of the 2nd CDS=1, length=140. (frame - length) % 3 = 2, the frame of the 3rd CDS. 4090 4. Frame of the 3rd CDS=2, length=108. (frame - length) % 3 = 2, the frame of the terminal CDS. 4091 5. Alternatively, the frame of terminal CDS can be calculated without the rest of the gene. Length of the terminal CDS=206. length % 3 =2, the frame of the terminal CDS. 4092 4093Here is an example in which the "exon" feature is used. It is a 5 exon gene with 3 translated exons. 4094 4095AB000381 Twinscan exon 150 200 . + . gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4096AB000381 Twinscan exon 300 401 . + . gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4097AB000381 Twinscan CDS 380 401 . + 0 gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4098AB000381 Twinscan exon 501 650 . + . gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4099AB000381 Twinscan CDS 501 650 . + 2 gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4100AB000381 Twinscan exon 700 800 . + . gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4101AB000381 Twinscan CDS 700 707 . + 2 gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4102AB000381 Twinscan exon 900 1000 . + . gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4103AB000381 Twinscan start_codon 380 382 . + 0 gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4104AB000381 Twinscan stop_codon 708 710 . + 0 gene_id "AB000381.000"; transcript_id "AB000381.000.1"; 4105 4106 4107 4108 4109 4110=back 4111 4112=cut 4113 4114 4115 4116sub to_GTF2_format () { 4117 my $self = shift; 4118 my $genomic_seq_ref = shift; 4119 4120 my $properties_href = shift; 4121 unless ($properties_href) { 4122 $properties_href = {}; 4123 } 4124 4125 4126 ## need to adjust my frame definition so it's consistent with requirements above in spec. 4127 my $frame_convert = sub { 4128 my $phase = shift; 4129 4130 my %frame = ( 0 => 0, 4131 1 => 2, 4132 2 => 1 ); 4133 return ($frame{$phase}); 4134 }; 4135 4136 4137 my $gtf2_text = ""; 4138 4139 my $gene_obj = $self; 4140 4141 my $asmbl_id = $properties_href->{seqname} || $gene_obj->{asmbl_id} || die "Error, no asmbl_id as gene_obj att"; 4142 4143 my $source = $properties_href->{source} || "TIGR"; 4144 4145 my $gene_id = $gene_obj->{TU_feat_name}; 4146 my $model_id = $gene_obj->{Model_feat_name}; 4147 my $strand = $gene_obj->get_orientation(); 4148 4149 4150 my $comment_line = ""; 4151 if ($gene_obj->is_pseudogene()) { 4152 $comment_line .= "$model_id=pseudogene "; 4153 } 4154 4155 if ( $gene_obj->{gene_type} eq "protein-coding") { 4156 4157 if (! $gene_obj->is_pseudogene()) { 4158 4159 $gene_obj->set_CDS_phases($genomic_seq_ref); 4160 # also resets the 5' and 3' partiality attributes based on the longest orf. 4161 4162 4163 if ($gene_obj->is_5prime_partial()) { 4164 $comment_line .= "$model_id=5'partial "; 4165 } 4166 else { 4167 $gene_obj->validate_start_codon(); 4168 } 4169 4170 if ($gene_obj->is_3prime_partial() ) { 4171 $comment_line .= "$model_id=3'partial "; 4172 } 4173 else { 4174 $gene_obj->validate_stop_codon(); 4175 } 4176 } 4177 4178 my @stop_codon_objs; 4179 my @start_codons; 4180 4181 if (! $gene_obj->is_pseudogene()) { 4182 4183 if (! $gene_obj->is_3prime_partial()) { 4184 @stop_codon_objs = $gene_obj->_remove_stop_codons(); 4185 4186 unless (@stop_codon_objs) { 4187 confess $gene_obj->toString() . "Error, no stop codon objs retrieved for non 3' partial gene"; 4188 } 4189 } 4190 if (! $gene_obj->is_5prime_partial()) { 4191 @start_codons = $self->_extract_start_codons(); 4192 4193 unless (@start_codons) { 4194 confess $gene_obj->toString() . "Error, no start codon extracted for non 5'partial gene."; 4195 } 4196 } 4197 4198 } 4199 4200 foreach my $start_codon (@start_codons) { 4201 my ($start_lend, $start_rend) = sort {$a<=>$b} $start_codon->get_coords(); 4202 my $phase = &$frame_convert($start_codon->{phase}); 4203 $gtf2_text .= "$asmbl_id\t$source\tstart_codon\t$start_lend\t$start_rend\t.\t$strand\t$phase\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n"; 4204 } 4205 4206 4207 foreach my $exon ($gene_obj->get_exons()) { 4208 my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords(); 4209 $gtf2_text .= "$asmbl_id\t$source\texon\t$exon_lend\t$exon_rend\t.\t$strand\t.\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n"; 4210 4211 if ($gene_obj->is_pseudogene()) { next; } # don't bother trying to report nonsensical CDSs. 4212 4213 if (my $cds_obj = $exon->get_CDS_obj()) { 4214 my ($cds_lend, $cds_rend) = sort {$a<=>$b} $cds_obj->get_coords(); 4215 my $phase = $cds_obj->{phase}; 4216 unless (defined($phase)) { 4217 die "Error, no phase defined for cds($cds_lend-$cds_rend) of gene" . $gene_obj->toString(); 4218 } 4219 $phase = &$frame_convert($phase); 4220 4221 $gtf2_text .= "$asmbl_id\t$source\tCDS\t$cds_lend\t$cds_rend\t.\t$strand\t$phase\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n"; 4222 } 4223 } 4224 4225 foreach my $stop_codon (@stop_codon_objs) { 4226 my ($stop_lend, $stop_rend) = sort {$a<=>$b} $stop_codon->get_coords(); 4227 my $phase = &$frame_convert($stop_codon->{phase}); 4228 $gtf2_text .= "$asmbl_id\t$source\tstop_codon\t$stop_lend\t$stop_rend\t.\t$strand\t$phase\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n"; 4229 } 4230 4231 foreach my $isoform ($gene_obj->get_additional_isoforms() ) { 4232 $gtf2_text .= $isoform->to_GTF2_format($genomic_seq_ref, $properties_href); 4233 } 4234 } 4235 4236 if ($comment_line) { 4237 # prefix with \# to actually comment it in the file 4238 $comment_line = "#$comment_line\n"; 4239 } 4240 4241 my $comment_flag = $properties_href->{include_comments}; 4242 if (defined ($comment_flag) && $comment_flag == 0) { 4243 $comment_line = ""; # clear it 4244 } 4245 4246 4247 return ($comment_line . $gtf2_text); 4248} 4249 4250 4251sub _extract_start_codons { 4252 my $self = shift; 4253 4254 ## 5' partiality attribute is trusted here !!! 4255 4256 if ($self->is_5prime_partial()) { 4257 return(); 4258 } 4259 4260 my @exons = $self->get_exons(); 4261 my $orientation = $self->get_orientation(); 4262 4263 my @start_codons; 4264 4265 my $found_cds_flag = 0; 4266 4267 for (my $i = 0; $i <= $#exons; $i++) { 4268 if (my $cds = $exons[$i]->get_CDS_obj()) { 4269 # found first cds 4270 $found_cds_flag = 1; 4271 my ($cds_end5, $cds_end3) = $cds->get_coords(); 4272 my $cds_len = $cds->length(); 4273 if ($cds_len >= 3) { 4274 ## got start codon in entirety 4275 if ($orientation eq '+') { 4276 push (@start_codons, CDS_exon_obj->new($cds_end5, $cds_end5+2)->set_phase(0)); 4277 last; 4278 } 4279 else { 4280 push (@start_codons, CDS_exon_obj->new($cds_end5, $cds_end5-2)->set_phase(0)); 4281 last; 4282 } 4283 } 4284 else { 4285 ## split start codon 4286 push (@start_codons, $cds); # add current cds as start codon part 4287 my $missing_length = 3 - $cds_len; 4288 4289 ## examine next cds exon for part of it: 4290 my $next_cds = $exons[$i+1]->get_CDS_obj(); 4291 unless (ref $next_cds) { 4292 die "Error, no next cds for split start codon" . $self->toString(); 4293 } 4294 4295 my ($next_cds_end5, $next_cds_end3) = $next_cds->get_coords(); 4296 my $next_cds_len = $next_cds->length(); 4297 4298 if ($next_cds_len >= $missing_length) { 4299 # great, this has everything we need 4300 if ($orientation eq '+') { 4301 push (@start_codons, 4302 CDS_exon_obj->new($next_cds_end5, $next_cds_end5 + $missing_length-1)->set_phase($next_cds->{phase})); 4303 last; 4304 } 4305 else { 4306 push (@start_codons, 4307 CDS_exon_obj->new($next_cds_end5, $next_cds_end5 - $missing_length + 1)->set_phase($next_cds->{phase})); 4308 last; 4309 } 4310 } 4311 else { 4312 ## another split start codon portion. Just add the current cds, and get the first bp from the next cds 4313 push (@start_codons, $next_cds); 4314 4315 my $final_cds = $exons[$i+2]->get_CDS_obj(); 4316 unless (ref $final_cds) { 4317 die "Error getting final cds of three-part split start codon"; 4318 } 4319 unless ($final_cds->{phase} == 2) { 4320 die "Error, final cds of three-part stop codon is not in phase 2 "; 4321 } 4322 my ($final_cds_end5, $final_cds_end3) = $final_cds->get_coords(); 4323 push (@start_codons, 4324 CDS_exon_obj->new($final_cds_end5, $final_cds_end5)->set_phase(2)); 4325 last; 4326 } 4327 } # end of split start codon 4328 4329 } # end of found cds 4330 } # end of foreach exon 4331 4332 unless ($found_cds_flag) { 4333 die "Error, no cds exon found in search of start codon"; 4334 } 4335 4336 unless (@start_codons) { 4337 die "Error, no start codons found"; 4338 } 4339 ## ensure start codons sum to 3 4340 my $sum_len = 0; 4341 foreach my $start_codon (@start_codons) { 4342 $sum_len += $start_codon->length(); 4343 } 4344 unless ($sum_len == 3) { 4345 print "Error, sum len of start codons != 3 ( = $sum_len, instead) " . $self->toString() . "starts:\n"; 4346 my $i=0; 4347 foreach my $start (@start_codons) { 4348 $i++; 4349 print "start($i): " . $start->toString(); 4350 } 4351 die; 4352 } 4353 4354 return (@start_codons); 4355 4356} 4357 4358 4359 4360sub _remove_stop_codons { 4361 my $self = shift; 4362 4363 ## 3' partiality attribute is trusted here !!! 4364 4365 if ($self->is_3prime_partial()) { 4366 return (); 4367 } 4368 4369 my $orientation = $self->get_orientation(); 4370 my @exons = reverse $self->get_exons(); # examining exons in reverse order, starting from stop codon direction. 4371 4372 my @stop_codons; 4373 4374 my $found_cds_flag = 0; 4375 4376 ## find first exon 4377 for (my $i=0; $i <= $#exons; $i++) { 4378 if (my $cds = $exons[$i]->get_CDS_obj()) { 4379 4380 $found_cds_flag = 1; 4381 4382 my ($cds_end5, $cds_end3) = $cds->get_coords(); 4383 4384 my $cds_length = $cds->length(); 4385 if ($cds_length > 3) { 4386 ## cds exon encodes more than just the stop codon 4387 if ($orientation eq '+') { 4388 $cds->{end3} -= 3; 4389 push (@stop_codons, CDS_exon_obj->new($cds_end3 - 2, $cds_end3)->set_phase(0)); 4390 } 4391 else { 4392 $cds->{end3} += 3; 4393 push (@stop_codons, CDS_exon_obj->new($cds_end3 + 2, $cds_end3)->set_phase(0)); 4394 } 4395 last; 4396 4397 } 4398 elsif ($cds_length == 3) { 4399 ## Just a stop codon exon. We can remove it. 4400 push (@stop_codons, $cds); 4401 $exons[$i]->{CDS_exon_obj} = 0; # nullified 4402 last; 4403 } 4404 4405 else { 4406 ## cds exon encodes a split stop codon 4407 push (@stop_codons, $cds); # just add the last portion of stop codon 4408 $exons[$i]->{CDS_exon_obj} = 0; # nullified 4409 4410 ## check next portion of cds exon to see if it contains the rest of the stop 4411 my $next_exon = $exons[$i+1]; 4412 unless (ref $next_exon) { 4413 die "Error, incomplete stop codon and not enough exons! "; 4414 } 4415 my $missing_stop_length = 3 - $cds_length; 4416 my $next_cds_obj = $next_exon->get_CDS_obj(); 4417 unless (ref $next_cds_obj) { 4418 die "Error, next cds obj is missing!"; 4419 } 4420 4421 my $next_cds_length = $next_cds_obj->length(); 4422 my ($cds_end5, $cds_end3) = $next_cds_obj->get_coords(); 4423 if ($next_cds_length <= $missing_stop_length) { 4424 ## encodes only the second part of the stop codon 4425 # add and nullify 4426 push (@stop_codons, $next_cds_obj); 4427 $next_exon->{CDS_exon_obj} = 0; 4428 4429 ## get the very last part of the stop 4430 $missing_stop_length -= $next_cds_length; 4431 if ($missing_stop_length > 0) { 4432 ## must be still missing the first bp of the stop codon 4433 if ($missing_stop_length != 1) { 4434 die "Error, too much of the stop codon is left (missing_length = $missing_stop_length). Should only be 1 "; 4435 } 4436 my $next_exon = $exons[$i+2]; 4437 unless (ref $next_exon) { 4438 die "Error, second next exon is unavail "; 4439 } 4440 my $next_cds_obj = $next_exon->get_CDS_obj(); 4441 unless (ref $next_cds_obj) { 4442 die "Error, second next cds obj is unavail"; 4443 } 4444 my $cds_length = $next_cds_obj->length(); 4445 my ($cds_end5, $cds_end3) = $next_cds_obj->get_coords(); 4446 if ($cds_length > 1) { 4447 if ($orientation eq '+') { 4448 $next_cds_obj->{end3}-=1; 4449 push (@stop_codons, CDS_exon_obj->new($cds_end3, $cds_end3)->set_phase(0)); 4450 } 4451 else { 4452 $next_cds_obj->{end3}+=1; 4453 push (@stop_codons, CDS_exon_obj->new($cds_end3, $cds_end3)->set_phase(0)); 4454 } 4455 } 4456 } 4457 } 4458 else { 4459 # split stop codon 4460 #missing length of cds exon is present in the second portion of the stop 4461 if ($orientation eq '+') { 4462 $next_cds_obj->{end3} -= $missing_stop_length; 4463 push (@stop_codons, CDS_exon_obj->new($cds_end3 - $missing_stop_length + 1, $cds_end3)->set_phase(0)); 4464 } 4465 else { 4466 $next_cds_obj->{end3} += $missing_stop_length; 4467 push (@stop_codons, CDS_exon_obj->new($cds_end3 + $missing_stop_length -1, $cds_end3)->set_phase(0)); 4468 } 4469 } 4470 } # end of split stop codon 4471 4472 last; 4473 4474 } # end of found cds obj 4475 4476 4477 } # end of foreach exon 4478 4479 unless ($found_cds_flag) { 4480 die "Error, no cds exon was found. "; 4481 } 4482 4483 4484 unless (@stop_codons) { 4485 die "Error, no stop codons extracted from non-partial gene."; 4486 } 4487 4488 @stop_codons = reverse @stop_codons; # reorder according to gene direction 4489 4490 ## make sure sum (stop_codons) length == 3 4491 my $sum_len = 0; 4492 foreach my $stop_codon (@stop_codons) { 4493 $sum_len += $stop_codon->length(); 4494 } 4495 if ($sum_len != 3) { 4496 print "Error, stop codons sum length != 3 ( = $sum_len, instead) " . $self->toString(); 4497 my $i=0; 4498 foreach my $stop_codon (@stop_codons) { 4499 print "stop($i): " . $stop_codon->toString(); 4500 } 4501 4502 die; 4503 } 4504 4505 return (@stop_codons); 4506 4507} 4508 4509 4510 4511sub has_CDS { 4512 my $self = shift; 4513 4514 foreach my $exon ($self->get_exons()) { 4515 if (ref ($exon->get_CDS_obj())) { 4516 return (1); 4517 } 4518 } 4519 4520 return (0); # no cds entry found 4521} 4522 4523 4524#### 4525sub set_CDS_phases_from_init_phase { 4526 my ($self, $init_phase) = @_; 4527 4528 my @exons = $self->get_exons(); 4529 4530 my $curr_cds_len = $init_phase; 4531 4532 foreach my $exon (@exons) { 4533 if (my $cds = $exon->get_CDS_obj()) { 4534 $cds->set_phase($curr_cds_len % 3); 4535 my $cds_len = $cds->length(); 4536 $curr_cds_len += $cds_len; 4537 } 4538 } 4539 4540 return; 4541} 4542 4543 4544 4545sub set_CDS_phases { 4546 my ($self, $genomic_seq_ref) = @_; 4547 4548 4549 my $start_pos = 1; 4550 if ($self->has_CDS() && ! $self->is_pseudogene()) { 4551 4552 $self->create_all_sequence_types($genomic_seq_ref); 4553 4554 my $cds_sequence = $self->get_CDS_sequence(); 4555 my $protein_seq = $self->get_protein_sequence(); 4556 4557 ## first, clear the partial attributes: 4558 $self->set_5prime_partial(0); 4559 $self->set_3prime_partial(0); 4560 4561 4562 if ($protein_seq !~ /^M/) { 4563 # lacks start codon 4564 $self->set_5prime_partial(1); 4565 } 4566 if ($protein_seq !~ /\*$/) { 4567 # lacks stop codon 4568 $self->set_3prime_partial(1); 4569 } 4570 4571 $start_pos = $self->_get_cds_start_pos($cds_sequence); 4572 4573 4574 my $first_phase = $start_pos - 1; 4575 4576 my @exons = $self->get_exons(); 4577 my @cds_objs; 4578 foreach my $exon (@exons) { 4579 my $cds = $exon->get_CDS_obj(); 4580 if (ref $cds) { 4581 push (@cds_objs, $cds); 4582 } 4583 } 4584 4585 my $cds_obj = shift @cds_objs; 4586 $cds_obj->{phase} = $first_phase; 4587 my $cds_length = abs ($cds_obj->{end3} - $cds_obj->{end5}) + 1; 4588 $cds_length -= $first_phase; 4589 4590 while (@cds_objs) { 4591 my $next_cds_obj = shift @cds_objs; 4592 $next_cds_obj->{phase} = $cds_length % 3; 4593 $cds_length += abs ($next_cds_obj->{end3} - $next_cds_obj->{end5}) + 1; 4594 } 4595 } 4596 4597 foreach my $isoform ($self->get_additional_isoforms()) { 4598 $isoform->set_CDS_phases($genomic_seq_ref); 4599 } 4600 4601 return; 4602 4603} 4604 4605 4606sub get_first_CDS_segment { 4607 my $gene_obj = shift; 4608 my @exons = $gene_obj->get_exons(); 4609 4610 foreach my $exon (@exons) { 4611 if (my $cds = $exon->get_CDS_exon_obj()) { 4612 return ($cds); 4613 } 4614 } 4615 4616 return undef; 4617} 4618 4619sub _get_cds_start_pos { 4620 my ($self, $cds_sequence) = @_; 4621 my $cds_length = length($cds_sequence); 4622 # if cds is set of triplets, assume translate at codon pos 1. 4623 my $codon_start; 4624 4625 ## must determine where translation starts: 4626 my $new_orfFinder = new Longest_orf(); 4627 $new_orfFinder->allow_partials(); 4628 $new_orfFinder->forward_strand_only(); 4629 4630 my $longest_orf = $new_orfFinder->get_longest_orf($cds_sequence); 4631 4632 unless (ref $longest_orf) { 4633 die "No longest ORF found in sequence"; 4634 } 4635 4636 ## examine the first three ORFs, prefer long orf with stop codon. 4637 my $orfPos = $longest_orf->{start}; #init to first, longest orf. 4638 4639 unless (defined $orfPos) { 4640 die "Error, orfPos not defined! " . Dumper ($longest_orf); 4641 } 4642 4643 my $bestOrfPos; 4644 my @allOrfs = $new_orfFinder->orfs(); 4645 4646 for my $orfIndex (0..2) { 4647 my $orf = $allOrfs[$orfIndex]; 4648 if ($orf) { 4649 my $start = $orf->{start}; 4650 my $length = $orf->{length}; 4651 my $protein = $orf->{protein}; 4652 if ($length > $cds_length - 3 && $start <= 3 && $protein =~ /\*$/) { 4653 unless ($bestOrfPos) { 4654 $bestOrfPos = $start; 4655 } 4656 } 4657 } 4658 } 4659 4660 if ($bestOrfPos && $bestOrfPos != $orfPos) { 4661 $orfPos = $bestOrfPos; 4662 } 4663 4664 if ($orfPos >3) { 4665 confess "Error, longest ORF is found at position $orfPos, and should be between 1 and 3. What's wrong with your gene?" . $self->toString(); 4666 } 4667 $codon_start = $orfPos; 4668 4669 return ($codon_start); 4670} 4671 4672 4673=over 4 4674 4675=item dispose() 4676 4677B<Description:> Sets all attributes = 0, hopefully to faciliate targeting for garbage collection. (experimental method) 4678 4679B<Parameters:> none 4680 4681B<Returns:> none 4682 4683=back 4684 4685=cut 4686 4687sub dispose { 4688 my $self = shift; 4689 foreach my $att (keys %$self) { 4690 $self->{$att} = 0; 4691 } 4692} 4693 4694 4695 4696sub DESTROY { 4697 my $self = shift; 4698 4699 warn "DESTROYING gene_obj: " . $self->{TU_feat_name} . "," . $self->{Model_feat_name} . "\n" if $main::DEBUG; 4700 4701} 4702 4703 4704sub validate_start_codon { 4705 ## requires that you have the CDS sequence already set 4706 my $self = shift; 4707 4708 my $cds_sequence = $self->get_CDS_sequence() or confess "Error, cannot get CDS sequence. It must be built prior to calling this method"; 4709 ## currently, only trust Met start codons. 4710 my $start_codon = uc substr($cds_sequence, 0, 3); 4711 if ($start_codon ne "ATG") { 4712 die $self->toString() . "Error, start codon is not M (codon $start_codon instead)!"; 4713 # call within an eval block to catch exception 4714 } 4715} 4716 4717 4718sub validate_stop_codon { 4719 ## requires that you have the CDS sequence already set 4720 my $self = shift; 4721 4722 my $cds_sequence = $self->get_CDS_sequence() or confess "Error, cannot get CDS sequence. It must be built prior to calling this method"; 4723 4724 my @stop_codons = &Nuc_translator::get_stop_codons(); 4725 4726 my $curr_stop_codon = substr($cds_sequence, length($cds_sequence)-3, 3); 4727 4728 my $found_stop_codon_flag = 0; 4729 foreach my $stop (@stop_codons) { 4730 if ($stop eq $curr_stop_codon) { 4731 $found_stop_codon_flag = 1; 4732 last; 4733 } 4734 } 4735 4736 unless ($found_stop_codon_flag) { 4737 die $self->toString() . "Error, stop codon $curr_stop_codon is not an acceptable stop codon: [@stop_codons]\n"; 4738 } 4739 4740} 4741 4742 4743 4744 4745###################################################################################################################################### 4746###################################################################################################################################### 4747 4748 4749=head1 NAME 4750 4751package mRNA_exon_obj 4752 4753=cut 4754 4755=head1 DESCRIPTION 4756 4757 The mRNA_exon_obj represents an individual spliced mRNA exon of a gene. The coordinates of the exon can be manipulated, and the mRNA_exon_obj can contain a single CDS_exon_obj. A mRNA_exon_obj lacking a CDS_exon_obj component is an untranslated (UTR) exon. 4758 4759 A mature Gene_obj is expected to have at least one mRNA_exon_obj component. 4760 4761=cut 4762 4763 4764package mRNA_exon_obj; 4765 4766use strict; 4767use warnings; 4768use Storable qw (store retrieve freeze thaw dclone); 4769 4770=over 4 4771 4772=item new() 4773 4774B<Description:> Instantiates an mRNA_exon_obj 4775 4776B<Parameters:> <(end5, end3)> 4777 4778The end5 and end3 coordinates can be optionally passed into the constructor to set these attributes. Alternatively, the set_coords() method can be used to set these values. 4779 4780B<Returns:> $mRNA_exon_obj 4781 4782=back 4783 4784=cut 4785 4786 4787 ; 4788 4789sub new { 4790 shift; 4791 my $self = { end5 => 0, # stores end5 of mRNA exon 4792 end3 => 0, # stores end3 of mRNA exon 4793 CDS_exon_obj => 0, # stores object reference to CDS_obj 4794 feat_name => 0, # stores TIGR temp id 4795 strand => undef, # +|- 4796 }; 4797 4798 # end5 and end3 can be included as parameters in constructor. 4799 if (@_) { 4800 my ($end5, $end3) = @_; 4801 if (defined($end5) && defined($end3)) { 4802 $self->{end5} = $end5; 4803 $self->{end3} = $end3; 4804 } 4805 } 4806 4807 bless ($self); 4808 return ($self); 4809} 4810 4811 4812 4813=over 4 4814 4815=item get_CDS_obj() 4816 4817B<Description:> Retrieves the CDS_exon_obj component of this mRNA_exon_obj 4818 4819B<Parameters:> none 4820 4821B<Returns:> $cds_exon_obj 4822 4823If no CDS_exon_obj is attached, returns 0 4824 4825=back 4826 4827=cut 4828 4829 ; 4830 4831sub get_CDS_obj { 4832 my $self = shift; 4833 return ($self->{CDS_exon_obj}); 4834} 4835 4836 4837## alias 4838sub get_CDS_exon_obj { 4839 my $self = shift; 4840 return ($self->get_CDS_obj()); 4841} 4842 4843 4844=over 4 4845 4846=item get_mRNA_exon_end5_end3() 4847 4848B<Description:> Retrieves the end5, end3 coordinates of the exon 4849 4850**Method Deprecated**, use get_coords() 4851 4852B<Parameters:> none 4853 4854B<Returns:> (end5, end3) 4855 4856=back 4857 4858=cut 4859 4860 4861sub get_mRNA_exon_end5_end3 { 4862 my $self = shift; 4863 return ($self->{end5}, $self->{end3}); 4864} 4865 4866 4867 4868=over 4 4869 4870=item set_CDS_exon_obj() 4871 4872B<Description:> Sets the CDS_exon_obj of the mRNA_exon_obj 4873 4874B<Parameters:> $cds_exon_obj 4875 4876B<Returns:> none 4877 4878=back 4879 4880=cut 4881 4882 ; 4883sub set_CDS_exon_obj { 4884 my $self = shift; 4885 my $ref = shift; 4886 if (ref($ref)) { 4887 $self->{CDS_exon_obj} = $ref; 4888 } 4889} 4890 4891 4892 4893#### 4894sub delete_CDS_exon_obj { 4895 my $self = shift; 4896 $self->{CDS_exon_obj} = undef; 4897 return; 4898} 4899 4900 4901=over 4 4902 4903=item add_CDS_exon_obj() 4904 4905B<Description:> Instantiates and adds a new CDS_exon_obj to the mRNA_exon_obj given the CDS coordinates. 4906 4907B<Parameters:> (end5, end3) 4908 4909B<Returns:> none 4910 4911=back 4912 4913=cut 4914 4915 4916sub add_CDS_exon_obj { 4917 my $self = shift; 4918 my ($end5, $end3) = @_; 4919 my $cds_obj = CDS_exon_obj->new ($end5, $end3); 4920 $self->set_CDS_exon_obj($cds_obj); 4921} 4922 4923 4924=over 4 4925 4926=item set_feat_name() 4927 4928B<Description:> Sets the feat_name attribute of the mRNA_exon_obj 4929 4930B<Parameters:> $feat_name 4931 4932B<Returns:> none 4933 4934=back 4935 4936=cut 4937 4938 4939 4940sub set_feat_name { 4941 my $self = shift; 4942 my $feat_name = shift; 4943 $self->{feat_name} = $feat_name; 4944} 4945 4946 4947=over 4 4948 4949=item clone_exon() 4950 4951B<Description:> Creates a deep clone of this mRNA_exon_obj, using dclone() of Storable.pm 4952 4953B<Parameters:> none 4954 4955B<Returns:> $mRNA_exon_obj 4956 4957=back 4958 4959=cut 4960 4961 4962 4963sub clone_exon { 4964 my $self = shift; 4965 4966 my $clone_exon = dclone($self); 4967 4968 return ($clone_exon); 4969} 4970 4971 4972 4973=over 4 4974 4975=item get_CDS_end5_end3 () 4976 4977B<Description:> Retrieves end5, end3 of the CDS_exon_obj component of this mRNA_exon_obj 4978 4979B<Parameters:> none 4980 4981B<Returns:> (end5, end3) 4982 4983An empty array is returned if no CDS_exon_obj is attached. 4984 4985=back 4986 4987=cut 4988 4989 4990sub get_CDS_end5_end3 { 4991 my $self = shift; 4992 my $cds_obj = $self->get_CDS_obj(); 4993 if ($cds_obj) { 4994 return ($cds_obj->get_CDS_end5_end3()); 4995 } else { 4996 return ( () ); 4997 } 4998} 4999 5000 5001=over 4 5002 5003=item get_coords() 5004 5005B<Description:> Retrieves the end5, end3 coordinates of this mRNA_exon_obj 5006 5007B<Parameters:> none 5008 5009B<Returns:> (end5, end3) 5010 5011=back 5012 5013=cut 5014 5015 5016sub get_coords { 5017 my $self = shift; 5018 return ($self->get_mRNA_exon_end5_end3()); 5019} 5020 5021 5022=over 4 5023 5024=item set_coords() 5025 5026B<Description:> Sets the end5, end3 coordinates of the mRNA_exon_obj 5027 5028B<Parameters:> (end5, end3) 5029 5030B<Returns:> none 5031 5032=back 5033 5034=cut 5035 5036 5037## simpler coord setting (end5, end3) 5038sub set_coords { 5039 my $self = shift; 5040 my $end5 = shift; 5041 my $end3 = shift; 5042 $self->{end5} = $end5; 5043 $self->{end3} = $end3; 5044} 5045 5046 5047=over 4 5048 5049=item get_strand() 5050 5051B<Description:> Retrieves the orientation of the mRNA_exon_obj based on gene models transcribed orientation. 5052 5053B<Parameters:> none 5054 5055B<Returns:> +|-|undef 5056 5057If end5 == end3, strand orientation cannot be inferred based on coordinates alone, so undef is returned. 5058 5059=back 5060 5061=cut 5062 5063 5064 ; 5065 5066sub get_orientation { 5067 # determine positive or reverse orientation 5068 my $self = shift; 5069 return ($self->{strand}); 5070} 5071 5072 5073sub get_strand { ## preferred 5074 my $self = shift; 5075 return($self->get_orientation()); 5076} 5077 5078 5079#### 5080sub merge_exon { 5081 my $self = shift; 5082 my $other_exon = shift; 5083 5084 my $cds = $self->get_CDS_exon_obj(); 5085 5086 my $other_cds = $other_exon->get_CDS_exon_obj(); 5087 5088 if ($other_cds) { 5089 if ($cds) { 5090 $cds->merge_CDS($other_cds); 5091 } 5092 else { 5093 # current exon lacks cds. Set this one to it. 5094 $self->set_CDS_exon_obj($other_cds); 5095 } 5096 } 5097 5098 5099 ## merge the exons. 5100 my @coords = sort {$a<=>$b} ($self->get_coords(), $other_exon->get_coords()); 5101 my $lend = shift @coords; 5102 my $rend = pop @coords; 5103 5104 my ($new_end5, $new_end3) = ($self->get_orientation() eq '+') ? ($lend, $rend) : ($rend, $lend); 5105 5106 $self->set_coords($new_end5, $new_end3); 5107 5108 return; 5109} 5110 5111 5112 5113 5114 5115 5116=over 4 5117 5118=item toString() 5119 5120B<Description:> Provides a textual description of the mRNA_exon_obj 5121 5122B<Parameters:> none 5123 5124B<Returns:> $text 5125 5126=back 5127 5128=cut 5129 5130 ; 5131 5132 5133sub toString { 5134 my $self = shift; 5135 my @coords = $self->get_mRNA_exon_end5_end3(); 5136 my $feat_name = $self->{feat_name}; 5137 my $text = ""; 5138 if ($feat_name) { 5139 $text .= "feat_name: $feat_name\t"; 5140 } 5141 $text .= "end5 " . $coords[0] . "\tend3 " . $coords[1] . "\n"; 5142 return ($text); 5143} 5144 5145 5146sub length { 5147 my $self = shift; 5148 5149 my $len = abs ($self->{end5} - $self->{end3}) + 1; 5150 5151 return($len); 5152} 5153 5154 5155 5156 5157 5158########################################################################################################################## 5159########################################################################################################################## 5160 5161 5162 5163=head1 NAME 5164 5165package CDS_exon_obj 5166 5167=cut 5168 5169 5170=head1 DESCRIPTION 5171 5172 The CDS_exon_obj represents the protein-coding portion of an mRNA_exon_obj. 5173 5174=cut 5175 5176 5177 5178package CDS_exon_obj; 5179 5180use strict; 5181use warnings; 5182use Storable qw (store retrieve freeze thaw dclone); 5183use Carp; 5184 5185 5186=over 4 5187 5188=item new() 5189 5190B<Description:> Cosntructor for the CDS_exon_obj 5191 5192B<Parameters:> <(end5, end3)> 5193 5194The (end5, end3) parameter is optional. Alternatively, the set_coords() method can be used to set these values. 5195 5196B<Returns:> $cds_exon_obj 5197 5198=back 5199 5200=cut 5201 5202 ; 5203 5204sub new { 5205 shift; 5206 my $self = { end5 => 0, #stores end5 of cds exon 5207 end3 => 0, #stores end3 of cds exon 5208 phase => undef, #must set if to output in gff3 format. 5209 feat_name => 0, #tigr's temp id 5210 strand => undef, # +|- 5211 }; 5212 5213 5214 # end5 and end3 are allowed constructor parameters 5215 if (@_) { 5216 my ($end5, $end3) = @_; 5217 if (defined ($end5) && defined ($end3)) { 5218 $self->{end5} = $end5; 5219 $self->{end3} = $end3; 5220 } 5221 } 5222 bless ($self); 5223 return ($self); 5224} 5225 5226 5227 5228=over 4 5229 5230=item set_feat_name() 5231 5232B<Description:> Sets the feat_name attribute value of the CDS_exon_obj 5233 5234B<Parameters:> $feat_name 5235 5236B<Returns:> none 5237 5238=back 5239 5240=cut 5241 5242 5243sub set_feat_name { 5244 my $self = shift; 5245 my $feat_name = shift; 5246 $self->{feat_name} = $feat_name; 5247} 5248 5249 5250=over 4 5251 5252=item get_CDS_end5_end3() 5253 5254B<Description:> Retrieves the end5, end3 coordinates of the CDS_exon_obj 5255 5256** Method deprecated **, use get_coords() 5257 5258 5259B<Parameters:> none 5260 5261B<Returns:> (end5, end3) 5262 5263=back 5264 5265=cut 5266 5267 5268sub get_CDS_end5_end3 { 5269 my $self = shift; 5270 return ($self->{end5}, $self->{end3}); 5271} 5272 5273 5274 5275=over 4 5276 5277=item set_coords() 5278 5279B<Description:> Sets the (end5, end3) values of the CDS_exon_obj 5280 5281B<Parameters:> (end5, end3) 5282 5283B<Returns:> none 5284 5285=back 5286 5287=cut 5288 5289 5290 5291sub set_coords { 5292 my $self = shift; 5293 my $end5 = shift; 5294 my $end3 = shift; 5295 $self->{end5} = $end5; 5296 $self->{end3} = $end3; 5297} 5298 5299=over 4 5300 5301=item get_coords() 5302 5303B<Description:> Retrieves the (end5, end3) coordinates of the CDS_exon_obj 5304 5305B<Parameters:> none 5306 5307B<Returns:> (end5, end3) 5308 5309 5310The get_coords() method behaves similarly among Gene_obj, mRNA_exon_obj, and CDS_exon_obj, and is generally preferred to other existing methods for extracting these coordinate values. Other methods persist for backwards compatibility with older applications, but have been largely deprecated. 5311 5312 5313=back 5314 5315=cut 5316 5317 5318 5319sub get_coords { 5320 my $self = shift; 5321 return ($self->get_CDS_end5_end3()); 5322} 5323 5324 5325=over 4 5326 5327=item get_orientation() 5328 5329B<Description:> Retrieves the orientation of the CDS_exon_obj based on gene models orientation. 5330 5331B<Parameters:> none 5332 5333B<Returns:> +|-|undef 5334 5335undef returned if end5 == end3 5336 5337=back 5338 5339=cut 5340 5341 ; 5342 5343sub get_orientation { 5344 # determine positive or reverse orientation 5345 my $self = shift; 5346 return ($self->{strand}); 5347} 5348 5349 5350sub get_strand { ## preferred 5351 my $self = shift; 5352 return($self->get_orientation()); 5353} 5354 5355 5356=over 4 5357 5358=item toString() 5359 5360B<Description:> Retrieves a textual description of the CDS_exon_obj 5361 5362B<Parameters:> none 5363 5364B<Returns:> $text 5365 5366=back 5367 5368=cut 5369 5370 5371 5372=over 4 5373 5374=item clone_cds() 5375 5376B<Description:> Creates a deep clone of this CDS_exon_obj, using dclone() of Storable.pm 5377 5378B<Parameters:> none 5379 5380B<Returns:> $mRNA_exon_obj 5381 5382=back 5383 5384=cut 5385 5386 5387 5388sub clone_cds { 5389 my $self = shift; 5390 5391 my $clone_cds = dclone($self); 5392 5393 return ($clone_cds); 5394} 5395 5396 5397=over 4 5398 5399=item length() 5400 5401B<Description:> length of this cds segment 5402 5403B<Parameters:> none 5404 5405B<Returns:> int 5406 5407=back 5408 5409=cut 5410 5411 5412sub length { 5413 my $self = shift; 5414 my $length = abs ($self->{end3} - $self->{end5}) + 1; 5415 return ($length); 5416} 5417 5418 5419 5420=over 4 5421 5422=item set_phase() 5423 5424B<Description:> set phase of the CDS incident bp 5425 5426B<Parameters:> [012] 5427 5428B<Returns:> self 5429 5430 5431phase 0 = first bp of codon 5432phase 1 = second bp of codon 5433phase 2 = third bp of codon 5434 5435 5436=back 5437 5438=cut 5439 5440 5441sub set_phase { 5442 my $self = shift; 5443 my $phase = shift; 5444 $self->{phase} = $phase; 5445 return($self); 5446} 5447 5448=over 4 5449 5450=item get_phase() 5451 5452B<Description:> gets phase of the CDS incident bp 5453 5454B<Parameters:> none 5455 5456B<Returns:> [012] or undef if not set 5457 5458 5459phase 0 = first bp of codon 5460phase 1 = second bp of codon 5461phase 2 = third bp of codon 5462 5463 5464=back 5465 5466=cut 5467 5468 5469 5470 5471sub get_phase { 5472 my $self = shift; 5473 my $phase = $self->{phase}; 5474 return($phase); 5475} 5476 5477 5478 5479#### 5480sub merge_CDS { 5481 my $self = shift; 5482 my $other_cds = shift; 5483 5484 my $orientation = $self->get_orientation(); 5485 unless ($orientation) { 5486 confess "Error, self CDS lacks orientation\n"; 5487 } 5488 5489 my @coords = sort {$a<=>$b} ($self->get_coords(), $other_cds->get_coords()); 5490 my $lend = shift @coords; 5491 my $rend = pop @coords; 5492 5493 unless ($lend && $rend) { 5494 confess "Error, trying to merge CDSs but coordinates are not available: \n" 5495 . "self: " . $self->toString() 5496 . "\n" 5497 . "other: " . $other_cds->toString() . "\n"; 5498 } 5499 5500 my ($end5, $end3) = ($orientation eq '+') ? ($lend, $rend) : ($rend, $lend); 5501 5502 $self->set_coords($end5, $end3); 5503} 5504 5505sub toString { 5506 my $self = shift; 5507 my @coords = $self->get_CDS_end5_end3(); 5508 my $feat_name = $self->{feat_name}; 5509 my $text = ""; 5510 if ($feat_name) { 5511 $text .= "feat_name: $feat_name\t"; 5512 } 5513 $text .= "end5 " . $coords[0] . "\tend3 " . $coords[1] . "\n"; 5514 return ($text); 5515} 5516 5517 55181; 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533