1#!/usr/bin/env perl
2
3package main;
4our $DEBUG;
5
6package Gene_obj;
7use strict;
8use Nuc_translator;
9#use Gene_ontology;
10use Longest_orf;
11use Storable qw (store retrieve freeze thaw dclone);
12use warnings;
13use Data::Dumper;
14use Carp qw (croak cluck confess);
15use URI::Escape;
16
17=head1 NAME
18
19package Gene_obj
20
21=cut
22
23
24
25=head1 DESCRIPTION
26
27    Gene_obj(s) encapsulate the elements of both gene structure and gene function. The gene structure is stored in a hierarchical fashion as follows:
28
29    Gene  =========================================================
30
31    Exon  =========     =========         =========        ========
32
33    CDS      ======     =========         ======
34
35
36    where a Gene is a container for Exon(s), and each Exon is a container for a CDS, and an Exon can contain a single CDS component.  An Exon lacking a CDS exon is an untranslated exon or UTR exon.  The region of an Exon which extends beyond the CDS is also considered a UTR.
37
38
39    There are several ways to instantiate gene objects.  A simple example is described:
40
41    Exon and CDS component coordinates can be assigned as hashes.
42
43    ie.
44
45    my %mrna = ( 100 => 200,
46	         300 => 500 );
47
48    my %CDS = ( 150=>200,
49		300=>450);
50
51    my $sequence = "GACTACATTTAATAGGGCCC"; #string representing the genomic sequence
52    my $gene = new Gene_obj();
53
54    $gene->{com_name} = "hypothetical protein";
55
56    $gene->populate_gene_obj(\%CDS, \%mRNA, \$sequence);
57    print $gene->toString();
58
59
60
61    Alternatively, the individual components of genes (Exons and CDSs) can be instantiated separately and used to build the Gene from the ground up (See packages mRNA_exon_obj and CDS_exon_obj following this Gene_obj documentation).
62
63    my $cds_exon = new CDS_exon_obj (150, 200);
64
65    my $mRNA_exon = new mRNA_exon_obj (100, 200);
66
67    $mRNA_exon->set_CDS_exon_obj($cds_exon);
68
69    my $gene_obj = new Gene_obj ();
70
71    $gene_obj->{gene_name} = "hypothetical gene";
72    $gene_obj->{com_name} = "hypothetical protein";
73
74    $gene_obj->add_mRNA_exon_obj($mRNA_exon);
75
76    $gene_obj->refine_gene_object();
77
78    $gene_obj->create_all_sequence_types (\$sequence);  #ref to genomic sequence string.
79
80    print $gene_obj->toString();
81
82
83    The API below describes useful functions for navigating and manipulating the Gene object along with all of its attributes.
84
85
86
87=cut
88
89
90
91
92
93
94=over 4
95
96=item new()
97
98B<Description:> Constructor for Gene_obj
99
100B<Parameters:> none
101
102B<Returns:> $gene_obj
103
104
105The Gene_obj contains several attributes which can be manipulated directly (or by get/set methods if they exist).  These attributes include:
106
107    asmbl_id # identifier for the genomic contig for which this gene is anchored.
108    TU_feat_name #feat_names are TIGR temporary identifiers.
109    Model_feat_name # temp TIGR identifier for gene models
110    locus  #identifier for a gene (TU) ie. T2P3.5
111    pub_locus  #another identifier for a gene (TU)   ie. At2g00010
112    model_pub_locus #identifier for a gene model (model)  ie. At2g00010.1
113    model_locus #analagous to locus, but for model rather than gene (TU)
114    alt_locus   #alternative locus
115    gene_name # name for gene
116    com_name  # name for gene product
117    comment #internal comment
118    pub_comment #comment related to gene
119    ec_num   # enzyme commission number
120    gene_sym  # gene symbol
121    is_5prime_partial # 0|1  missing start codon.
122    is_3prime_partial # 0|1  missing stop codon.
123    is_pseudogene # 0|1
124    curated_com_name # 0|1
125    curated_gene_structure # 0|1
126
127    ## Other attributes set internally  Access-only, do not set directly.
128
129    gene_length  # length of gene span (int).
130    mid_pt  # holds midpoint of gene-span
131    strand  # [+-]
132    protein_seq # holds protein sequence
133    protein_seq_length
134    CDS_sequence  #holds CDS sequence (translated to protein); based on CDS_exon coordinates
135    CDS_seq_length
136    cDNA_sequence  #holds cDNA sequence; based on mRNA exon coordinates.
137    cDNA_seq_length
138    gene_sequence #holds unspliced transcript
139    gene_sequence_length #length of unspliced transcript
140    gene_type # "protein-coding", #default type for gene object.  Could be changed to "rRNA|snoRNA|snRNA|tRNA" to accommodate other gene or feature types.
141    num_additional_isoforms # int
142
143
144=back
145
146=cut
147
148
149
150sub new {
151    shift;
152    my $self = { asmbl_id => 0, #genomic contig ID
153                 locus => undef,       #text
154                 pub_locus => undef,   #text  ie. At2g00010
155                 model_pub_locus =>undef, #text ie. At2g00010.1
156                 model_locus => undef, #text ie. F12G15.1
157                 alt_locus => undef,   #text
158                 gene_name => undef, #text
159                 com_name => undef,    #text
160                 comment => undef,
161                 curated_com_name => 0,
162                 curated_gene_structure => 0,
163                 pub_comment => undef, #text
164                 ec_num => undef, #text (enzyme commission number)
165                 gene_type => "protein-coding", #default type for gene object.  Could be changed to "rRNA|snoRNA|snRNA|tRNA" to accomodate other gene or feature types.
166                 gene_sym => undef, #text (gene symbol)
167                 mRNA_coords => 0, #assigned to anonymous hash of end5->end3 relative to the parent sequence
168                 CDS_coords => 0,  #assigned to anonymous hash of end5->end3 relative to the parent sequence
169                 mRNA_exon_objs => 0,  # holds arrayref to mRNA_obj, retrieve only thru method: get_exons()
170                 num_exons => 0,      # number of exons in this gene_obj
171                 model_span => [],     # holds array ref to (end5,end3) for CDS range of gene.
172                 gene_span => [],      # holds array ref to (end5,end3) for mRNA range of gene.
173                 gene_length => 0,     # length of gene span (int).
174                 mid_pt => 0,         # holds midpoint of gene-span
175                 strand => 0,      # [+-]
176                 gi => undef,          #text
177                 prot_acc => undef,     #text
178                 is_pseudogene => 0, # toggle indicating pseudogene if 1.
179                 is_5prime_partial => 0, #boolean indicating missing 5' part of gene.
180                 is_3prime_partial => 0, #boolean
181                 protein_seq => undef,    # holds protein sequence
182                 protein_seq_length => 0,
183                 CDS_sequence => undef,    #holds CDS sequence (translated to protein); based on CDS_exon coordinates
184                 CDS_seq_length => 0,
185                 cDNA_sequence => undef,   #holds cDNA sequence; based on mRNA exon coordinates.
186                 cDNA_seq_length => 0,
187                 gene_sequence => undef, #holds unspliced transcript
188                 gene_sequence_length => 0, #length of unspliced transcript
189                 TU_feat_name => undef,    #feat_names are TIGR temporary identifiers.
190                 Model_feat_name =>undef,
191                 classification => 'annotated_genes', #type of seq_element.
192                 gene_synonyms => [],    #list of synonymous model feat_names
193                 GeneOntology=>[], #list of Gene_ontology assignment objects.  ...see GeneOntology.pm
194
195                 ## Additional functional attributes:
196                 secondary_gene_names => [],
197                 secondary_product_names => [],
198                 secondary_gene_symbols => [],
199                 secondary_ec_numbers =>[],
200
201
202                 ## Alternative splicing support.
203                 num_additional_isoforms => 0,  # number of additional isoforms stored in additonal_isoform list below
204                 additional_isoforms => [] # stores list of Gene_objs corresponding to the additional isoforms.
205
206                 };
207    bless($self);
208    return ($self);
209}
210
211
212
213
214=over 4
215
216=item erase_gene_structure()
217
218B<Description:> Removes the structural components of a gene (ie. exons, CDSs, coordinate spans, any corresponding sequences)
219
220B<Parameters:> none
221
222B<Returns:> none
223
224=back
225
226=cut
227
228
229## erase gene structure
230sub erase_gene_structure {
231    my $self = shift;
232    $self->{mRNA_exon_objs} = 0;
233    $self->{num_exons} = 0;
234    $self->{model_span} = [];
235    $self->{gene_span} = [];
236    $self->{gene_length} = 0;
237    $self->{strand} = 0;
238    $self->{protein_seq} = 0;
239    $self->{CDS_sequence} = 0;
240    $self->{CDS_seq_length} = 0;
241    $self->{cDNA_sequence} = 0;
242    $self->{cDNA_seq_length} = 0;
243}
244
245
246=over 4
247
248=item clone_gene()
249
250B<Description:> Clones this Gene_obj by copying attributes from this Gene to a new gene.  Does NOT do a deep clone for all attributes.  See dclone() for a more rigorous cloning method.  This method is safer because all references are not cloned, only the critical ones.
251
252B<Parameters:> none
253
254B<Returns:> new Gene_obj
255
256=back
257
258=cut
259
260
261
262## all objects are cloned.  References to data only are not.
263sub clone_gene {
264    my $self = shift;
265    my $clone = new Gene_obj();
266
267
268    ## Copy over the non-ref attribute values.
269    foreach my $key (keys %$self) {
270        my $value = $self->{$key};
271        if (defined $value) {
272            ## Not copying over refs.
273            if (ref $value) {
274                next;
275            }
276
277            ## Not copying over attributes of length > 200, such as protein/nucleotide sequences
278            my $length = length($value);
279            if ($length > 200) { next;}
280        }
281
282        # passed tests above, copying attribute.
283        $clone->{$key} = $value;
284
285    }
286
287    ## copy over the gene synonyms.
288    my @gene_syns = @{$self->{gene_synonyms}};
289    $clone->{gene_synonyms} = \@gene_syns;
290
291
292    ## copy the GO assignments:
293    my @GO_assignments = $self->get_gene_ontology_objs();
294    if (@GO_assignments) {
295        foreach my $go_assignment (@GO_assignments) {
296            my $go_clone = dclone($go_assignment);
297            $clone->add_gene_ontology_objs($go_clone);
298        }
299    }
300
301
302    ## copy gene structure.
303    my @exons = $self->get_exons();
304    foreach my $exon (@exons) {
305        $clone->add_mRNA_exon_obj($exon->clone_exon());
306    }
307
308    foreach my $isoform ($self->get_additional_isoforms()) {
309        my $isoform_clone = $isoform->clone_gene();
310        $clone->add_isoform($isoform_clone);
311    }
312
313    $clone->refine_gene_object();
314
315    return ($clone);
316}
317
318
319
320
321=over 4
322
323=item deep_clone()
324
325B<Description:> Provides a deep clone of a gene_obj.  Only references supported in Gene_obj documentation are supported.  Those added in a rogue way are undef()d
326
327B<Parameters:> none
328
329B<Returns:> $gene_obj
330
331uses the Storable dclone() function to deep clone the Gene_obj
332
333=back
334
335=cut
336
337
338    ;
339## all objects are cloned.  References to data only are not.
340sub deep_clone {
341    my $self = shift;
342    my $clone = dclone($self);
343
344    my %supported_refs = (model_span => 1,
345                          gene_span => 1,
346                          gene_synonyms => 1,
347                          Gene_ontology => 1,
348                          additional_isoforms=>1,
349                          mRNA_exon_objs => 1);
350
351    foreach my $gene_obj ($clone, $clone->get_additional_isoforms()) {
352
353        my @keys = keys %$gene_obj;
354        foreach my $key (@keys) {
355            my $value = $gene_obj->{$key};
356            if (ref $value && !$supported_refs{$key}) {
357                $gene_obj->{$key} = undef;
358            }
359        }
360    }
361
362    return ($clone);
363}
364
365
366=over 4
367
368=item populate_gene_obj()
369
370B<Description:> Given CDS and mRNA coordinates stored in hash form, a gene object is populated with mRNA and CDS exons.  This is one available way to populate a newly instantiated Gene_obj.
371
372B<Parameters:> $cds_hash_ref, $mRNA_hash_ref, <$seq_ref>
373
374$mRNA_hash_ref is a reference to a hash holding the end5 => end3 coordinates of the Exons
375
376$cds_hash_ref same as mRNA_has_ref except holds the CDS end5 => end3 coordinates.
377
378$seq_ref is a reference to a string containing the genomic sequence.  This is an optional parameter.
379
380
381B<Returns:> none
382
383=back
384
385=cut
386
387    ;
388
389## Do several things at once: assign CDS and mRNA coordinates, and build gene sequences.
390## The \$seq_ref is optional in case you want to create the sequence types.
391sub populate_gene_obj {
392    my ($self, $cds_ref, $mRNA_ref, $seq_ref) = @_;
393    $self->set_CDS_coords ($cds_ref);
394    $self->set_mRNA_coords ($mRNA_ref);
395    $self->refine_gene_object();
396    if (ref $seq_ref) {
397        $self->create_all_sequence_types($seq_ref);
398    }
399    ## reinitialize the hashrefs:
400    $self->{mRNA_coords} = 0;
401    $self->{CDS_coords} = 0;
402
403
404}
405
406
407# alias above
408sub populate_gene_object {
409    my $self = shift;
410    $self->populate_gene_obj(@_);
411}
412
413
414####
415sub populate_gene_object_via_CDS_coords {
416	my $self = shift;
417	my @coordsets = @_;
418
419	foreach my $coordset (@coordsets) {
420		my ($end5, $end3) = @$coordset;
421		my $mrna_exon_obj = mRNA_exon_obj->new($end5, $end3);
422		my $cds_obj = CDS_exon_obj->new($end5, $end3);
423		$mrna_exon_obj->{CDS_exon_obj} = $cds_obj;
424		$self->add_mRNA_exon_obj($mrna_exon_obj);
425	}
426
427	$self->refine_gene_object();
428	return;
429}
430
431
432sub build_gene_obj_exons_n_cds_range {
433	my $self = shift;
434	my ($exons_aref, $cds_lend, $cds_rend, $orient) = @_;
435
436	my @exon_coords;
437	foreach my $exon_aref (@$exons_aref) {
438		my ($exon_lend, $exon_rend) = sort {$a<=>$b} @$exon_aref;
439		push (@exon_coords, [$exon_lend, $exon_rend] );
440	}
441	@exon_coords = sort {$a->[0]<=>$b->[0]} @exon_coords;
442
443
444	unless ($orient =~ /^[\+\-]$/) {
445		confess "Error, orient not [+-] ";
446	}
447
448	## build the CDS coordinates.
449
450	my @cds_range;
451
452    if ($cds_lend > 0 && $cds_rend > 0) {
453
454        ($cds_lend, $cds_rend) = sort {$a<=>$b} ($cds_lend, $cds_rend);
455
456        foreach my $exon_coords_aref (@exon_coords) {
457            my ($exon_lend, $exon_rend) = @$exon_coords_aref;
458
459            if ($exon_rend >= $cds_lend && $exon_lend <= $cds_rend) {
460
461                ## got overlap
462                my $cds_exon_lend = ($cds_lend < $exon_lend) ? $exon_lend : $cds_lend;
463
464                my $cds_exon_rend = ($cds_rend > $exon_rend) ? $exon_rend : $cds_rend;
465
466                push (@cds_range, [$cds_exon_lend, $cds_exon_rend]);
467            }
468        }
469
470        unless (@cds_range) {
471            confess "Error, no CDS exon coords built based on exon overlap";
472        }
473    }
474	## all coordinate sets are ordered left to right.
475	# build the coordinates href
476
477	my %exon_coords;
478	my %cds_coords;
479	foreach my $exon_coords_aref (@exon_coords) {
480		my ($exon_lend, $exon_rend) = @$exon_coords_aref;
481		my ($exon_end5, $exon_end3) = ($orient eq '+') ? ($exon_lend, $exon_rend) : ($exon_rend, $exon_lend);
482		$exon_coords{$exon_end5} = $exon_end3;
483	}
484	foreach my $cds_coords_aref (@cds_range) {
485		my ($cds_lend, $cds_rend) = @$cds_coords_aref;
486		my ($cds_end5, $cds_end3) = ($orient eq '+') ? ($cds_lend, $cds_rend) : ($cds_rend, $cds_lend);
487		$cds_coords{$cds_end5} = $cds_end3;
488	}
489
490	# print Dumper (\%cds_coords) . Dumper (\%exon_coords);
491
492	$self->populate_gene_obj(\%cds_coords, \%exon_coords);
493
494	return ($self);
495}
496
497
498####
499sub join_adjacent_exons {
500    my $self = shift;
501
502    my @exons = $self->get_exons();
503
504    my $strand = $self->get_orientation();
505
506    my $first_exon = shift @exons;
507    my @new_exons = ($first_exon);
508
509    while (@exons) {
510        my $prev_exon = $new_exons[$#new_exons];
511        my ($prev_end5, $prev_end3) = $prev_exon->get_coords();
512
513        my $next_exon = shift @exons;
514        my ($next_end5, $next_end3) = $next_exon->get_coords();
515
516        if ( ($strand eq '+' && $prev_end3 == $next_end5 - 1)  # adjacent
517             ||
518             ($strand eq '-' && $prev_end3 == $next_end5 + 1) ) {
519
520            $prev_exon->merge_exon($next_exon);
521        }
522        else {
523            push (@new_exons, $next_exon);
524        }
525    }
526
527    $self->{mRNA_exon_objs} = [@new_exons];
528
529    $self->refine_gene_object();
530
531    return;
532}
533
534
535
536
537=over 4
538
539=item AAToNucleotideCoords()
540
541B<Description:> Converts an amino acid -based coordinate to a genomic sequence -based coordinate.
542
543B<Parameters:> $aa_coord
544
545B<Returns:> $genomic_coord
546
547undef is returned if the aa_coord could not be converted.
548
549
550=back
551
552=cut
553
554    ;
555
556sub AAToNucleotideCoords{
557    my($self) = shift;
558    my($aacoord) = shift;
559    my($debug) = shift;
560    my($PCDS_coords) = {};
561    my($A2NMapping) = {};
562    my($currAA) = 1;
563    my $strand = $self->{strand};
564    my @exons = $self->get_exons();
565    my($cds_count)=0;
566    my($translated_bp)=-1;
567    my($lastcarryover)=0;
568    my($end_bp);
569    foreach my $exon (sort {
570        if($strand eq "+"){
571            $a->{end5}<=>$b->{end5};
572        }
573        else{
574            $b->{end5}<=>$a->{end5};
575        }
576    } @exons) {
577        my $cds = $exon->get_CDS_obj();
578        if ($cds) {
579            my @cds_coords = $cds->get_CDS_end5_end3();
580            my($bpspread) = abs($cds_coords[0]-$cds_coords[1]);
581            $bpspread+=$lastcarryover;
582            my($nextAA) = int($bpspread/3); # last complete AA in CDS
583            $lastcarryover = $bpspread%3;
584            $PCDS_coords->{$currAA} = $currAA+$nextAA-1;
585            if($strand eq "+"){
586                $A2NMapping->{$currAA} = $cds_coords[0]<$cds_coords[1]?$cds_coords[0]:$cds_coords[1];
587            }
588            else{
589                $A2NMapping->{$currAA} = $cds_coords[0]<$cds_coords[1]?$cds_coords[1]:$cds_coords[0];
590            }
591            print "DEBUG: $strand $cds_count AA range ($currAA - $PCDS_coords->{$currAA}) nucleotide start($A2NMapping->{$currAA})\n" if($debug);
592            $currAA = $currAA+$nextAA;
593            $cds_count++;
594            if($strand eq "+"){
595                $end_bp = $cds_coords[0]<$cds_coords[1]?$cds_coords[1]:$cds_coords[0];
596            }
597            else{
598                $end_bp = $cds_coords[0]<$cds_coords[1]?$cds_coords[0]:$cds_coords[1];
599            }
600        }
601    }
602    # PCDS_coords key/value are start/stop aa counts for each cds;
603    # A2NMapping stores cds AA start key to cds nucleotide start
604    $cds_count=0;
605    foreach my $PCDS_end5 (sort {
606        $a<=>$b;
607	}(keys %$PCDS_coords)) {
608        my($PCDS_end3) = $PCDS_coords->{$PCDS_end5};
609	    if($aacoord>=$PCDS_end5 && $aacoord<=$PCDS_end3){
610            my($nucleotide_start) = $A2NMapping->{$PCDS_end5};
611            my($aa_offset) = $aacoord - $PCDS_end5;
612            my($nucleotide_offset) = $aa_offset*3;
613            print "DEBUG: CDS offset $aa_offset AA $nucleotide_offset bp\n" if($debug);
614            if($strand eq "+"){
615                $translated_bp = $nucleotide_start+$nucleotide_offset;
616            }
617            else{
618                $translated_bp = $nucleotide_start-$nucleotide_offset;
619            }
620            print "DEBUG: Mapping $aacoord to $translated_bp in cds $cds_count\n" if($debug);
621            print "DEBUG: CDS $PCDS_end5 - $PCDS_end3 nucleotide start $A2NMapping->{$PCDS_end5}, nuc offset $nucleotide_offset\n" if($debug);
622	    }
623
624        $cds_count++;
625	}
626    #}
627    if($translated_bp == -1){
628        $translated_bp = undef;
629        print STDERR "Unable to translate AA coordinate: $aacoord. Off end. Using undef\n" if($debug);
630    }
631    return $translated_bp;
632}
633
634
635
636## private method, used by populate_gene_obj()
637# sets CDS_coords instance member to a hash reference of CDS coordinates.   $hash{end5} = end3
638sub set_CDS_coords {
639    my $self = shift;
640    my $hash_ref = shift;
641    if (ref ($hash_ref) eq 'HASH') {
642        $self->{CDS_coords} = $hash_ref;
643    } else {
644        print STDERR "Cannot set CDS_coords, must have hash reference\n";
645    }
646}
647
648
649
650
651=over 4
652
653=item get_gene_span()
654
655B<Description:> Retrieves the coordinates which span the length of the gene along the genomic sequence.
656
657B<Parameters:> none
658
659B<Returns:> (end5, end3)
660
661These coordinates represent the minimal and maximal exonic coordinates of the gene.  Orientation can be inferred by the relative values of end5 and end3.
662
663
664=back
665
666=cut
667
668    ;
669
670## All return gene end5, end3 ###
671sub get_gene_span {
672    my $self = shift;
673    return (@{$self->{gene_span}});
674}
675
676
677
678
679## private
680sub get_seq_span {
681    my $self = shift;
682    return ($self->get_gene_span());
683}
684
685
686
687=over 4
688
689=item get_coords()
690
691B<Description:> See get_gene_span()
692
693B<Parameters:> none
694
695B<Returns:> (end5, end3)
696
697=back
698
699=cut
700
701
702sub get_coords {
703    my $self = shift;
704    return ($self->get_gene_span());
705}
706
707
708
709=over 4
710
711=item get_model_span()
712
713B<Description:> Retrieves the coordinates spanned by the protein-coding region of the gene along the genomic sequence.
714
715B<Parameters:> none
716
717B<Returns:> (end5, end3)
718
719These coordinates are determined by the min and max of the CDS components of the gene.
720
721=back
722
723=cut
724
725
726
727
728sub get_model_span {
729    my $self = shift;
730    return (@{$self->{model_span}});
731}
732
733
734sub get_CDS_span { # preferred
735	my $self = shift;
736	return($self->get_model_span());
737}
738
739
740=over 4
741
742=item get_transcript_span()
743
744B<Description:> Retrieves the coordinates spanned by the exonic regions of the gene along the genomic sequence.
745
746B<Parameters:> none
747
748B<Returns:> (lend, rend)
749
750These coordinates are determined by the min and max of the CDS components of the gene.
751
752=back
753
754=cut
755
756
757sub get_transcript_span {
758	my $self = shift;
759
760	my @coords;
761	my @exons = $self->get_exons();
762	foreach my $exon (@exons) {
763		push (@coords, $exon->get_coords());
764	}
765	@coords = sort {$a<=>$b} @coords;
766
767	my $lend = shift @coords;
768	my $rend = pop @coords;
769
770	return($lend, $rend);
771}
772
773
774sub is_pseudogene {
775    my $self = shift;
776    return ($self->{is_pseudogene});
777}
778
779sub set_pseudogene {
780    my $self = shift;
781    my $pseudogene_val = shift;
782    unless ($pseudogene_val =~ /[01]/) {
783        confess "Error, can set pseudogene to zero or one only.\n";
784    }
785
786    foreach my $gene ($self, $self->get_additional_isoforms()) {
787        $gene->{is_pseudogene} = $pseudogene_val;
788    }
789
790    return;
791}
792
793
794
795#private
796# sets mRNA_coords instance member to a hash reference of CDS coordinates.   $hash{end5} = end3
797sub set_mRNA_coords {
798    my $self = shift;
799    my $hash_ref = shift;
800    if (ref ($hash_ref) eq 'HASH') {
801        $self->{mRNA_coords} = $hash_ref;
802    } else {
803        print STDERR "Cannot set CDS_coords, must have hash reference\n";
804    }
805}
806
807
808=over 4
809
810=item refine_gene_object()
811
812B<Description:> This method performs some data management operations and should be called at any time modifications have been made to the gene structure (ie. exons added or modified, model isoforms added, etc).  It performs the following orientations:
813
814    -Sets (or resets)  gene span and model span coordinates, strand orientation, gene length, mid-point.
815
816B<Parameters:> none
817
818B<Returns:> none
819
820=back
821
822=cut
823
824## Once mRNA_coords and CDS_coords have been assigned, this will populate the remaining elements in the gene object.
825
826sub refine_gene_object {
827    my ($self) = shift;
828    #check to see if mRNA_coords field is populated.  If not, initialize.
829    if ($self->{mRNA_coords} == 0) {
830        $self->{mRNA_coords} = {};
831    }
832    my ($CDS_coords, $mRNA_coords) = ($self->{CDS_coords},  $self->{mRNA_coords});
833
834    unless ($CDS_coords && $mRNA_coords) {
835        #maybe created exon objects already
836        if ($self->{mRNA_exon_objs}) {
837            $self->trivial_refinement();
838        }
839        return;
840    }
841    # intialize mRNA_exon_objs to array ref.
842    $self->{mRNA_exon_objs} = [];
843    #retrieve coordinate data.
844    my %mRNA = %$mRNA_coords;
845    my %CDS = %$CDS_coords;
846    my @mRNAcoords = keys %mRNA;
847    my @CDScoords = keys %CDS;
848    my (%new_mRNA, %new_CDS);
849    ## if correlation between mRNA exons and CDS exons, then map CDS's to mRNA's, otherwise, replicate CDSs as mRNAs
850    if ($#mRNAcoords >= $#CDScoords) {
851
852        foreach my $mRNA_end5 (keys %mRNA) {
853            my $mRNA_end3 = $mRNA{$mRNA_end5};
854            #find overlapping cds exon to mRNA exon
855            #easy to compare if in same orientation for all comparisons
856            my ($m1, $m2) = ($mRNA_end5 < $mRNA_end3) ? ($mRNA_end5, $mRNA_end3) : ($mRNA_end3, $mRNA_end5);
857            #create mRNA_exon_obj
858            my $mRNA_exon_obj = mRNA_exon_obj->new ($mRNA_end5, $mRNA_end3);
859            $new_mRNA{$mRNA_end5} = $mRNA_end3;
860            foreach my $CDS_end5 (keys %CDS) {
861                my $CDS_end3 = $CDS{$CDS_end5};
862                my ($c1, $c2) = ($CDS_end5 < $CDS_end3) ? ($CDS_end5, $CDS_end3) : ($CDS_end3, $CDS_end5);
863                ## do overlap comparison; CDS must be contained within mRNA exon
864                if ( ($c1 >= $m1) && ($c2 <= $m2)) {
865                    # found the contained CDS
866                    $mRNA_exon_obj->{CDS_exon_obj} = CDS_exon_obj->new ($CDS_end5, $CDS_end3);
867                    $new_CDS{$CDS_end5} = $CDS_end3;
868                    last;
869                }
870            }
871            $self->add_mRNA_exon_obj($mRNA_exon_obj);
872        }
873    } else { # remap CDSs to mRNAS
874        print STDERR "ERROR: mRNA exons < CDS exons.  Copying all CDS exons into mRNA exons. \n\n";
875        foreach my $CDS_end5 (keys %CDS) {
876            my $CDS_end3 = $CDS{$CDS_end5};
877            my $mRNA_exon_obj = mRNA_exon_obj->new ($CDS_end5, $CDS_end3);
878            $mRNA_exon_obj->{CDS_exon_obj} = CDS_exon_obj->new ($CDS_end5, $CDS_end3);
879            $self->add_mRNA_exon_obj($mRNA_exon_obj);
880            $new_mRNA{$CDS_end5} = $CDS_end3;
881            $new_CDS{$CDS_end5} = $CDS_end3;
882        }
883    }
884
885    $self->trivial_refinement();
886
887    ## assign orientation to all children exon and CDS components.
888    my $strand = $self->get_orientation();
889    foreach my $exon ($self->get_exons()) {
890        $exon->{strand} = $strand;
891        if (my $cds = $exon->get_CDS_exon_obj()) {
892            $cds->{strand} = $strand;
893        }
894    }
895    return;
896
897}
898
899
900## alias
901sub refine_gene_obj {
902    my $self = shift;
903    $self->refine_gene_object();
904}
905
906
907=over 4
908
909=item get_exons()
910
911B<Description:>Retrieves a list of exons belonging to this Gene_obj
912
913B<Parameters:> none
914
915B<Returns:> @exons
916
917@exons is an ordered list of mRNA_exon_obj; the first exon of the list corresponds to the first exon of the spliced gene.
918
919=back
920
921=cut
922
923    ;
924
925sub get_exons {
926    my ($self) = shift;
927    if ($self->{mRNA_exon_objs} != 0) {
928        my @exons = (@{$self->{mRNA_exon_objs}});
929        @exons = sort {$a->{end5}<=>$b->{end5}} @exons;
930        if ($self->{strand} eq '-') {
931            @exons = reverse (@exons);
932        }
933        return (@exons);
934    } else {
935        my @x = ();
936        return (@x); #empty array
937    }
938}
939
940
941## private
942sub get_segments {
943    my $self = shift;
944    return ($self->get_exons());
945}
946
947
948
949=over 4
950
951=item number_of_exons()
952
953B<Description:> Provides the number of exons contained by the Gene
954
955B<Parameters:> none
956
957B<Returns:> int
958
959=back
960
961=cut
962
963
964
965sub number_of_exons {
966    my $self = shift;
967    my $exon_number = $#{$self->{mRNA_exon_objs}} + 1;
968    return ($exon_number);
969}
970
971
972
973
974
975
976
977=over 4
978
979=item get_intron_coordinates()
980
981B<Description:> Provides an ordered list of intron coordinates
982
983B<Parameters:> none
984
985B<Returns:> ( [end5,end3], ....)
986
987A list of arrayRefs are returned providing the coordinates of introns, ordered from first intron to last intron within the gene.
988
989=back
990
991=cut
992
993    ;
994
995sub get_intron_coordinates {
996    my $gene_obj = shift;
997    my $strand = $gene_obj->get_orientation();
998    my @exons = $gene_obj->get_exons();
999    ## exon list should already be sorted.
1000    my @introns = ();
1001
1002    my $num_exons = $#exons + 1;
1003    if ($num_exons > 1) { #only genes with multiple exons will have introns.
1004        if ($strand eq '+') {
1005            my $first_exon = shift @exons;
1006            while (@exons) {
1007                my $next_exon = shift @exons;
1008                my ($first_end5, $first_end3) = $first_exon->get_coords();
1009                my ($next_end5, $next_end3) = $next_exon->get_coords();
1010                my $intron_end5 = $first_end3 + 1;
1011                my $intron_end3 = $next_end5 -1;
1012                if ($intron_end5 < $intron_end3) {
1013                    push (@introns, [$intron_end5, $intron_end3]);
1014                }
1015                $first_exon = $next_exon;
1016            }
1017        } elsif ($strand eq '-') {
1018            my $first_exon = shift @exons;
1019            while (@exons) {
1020                my $next_exon = shift @exons;
1021                my ($first_end5, $first_end3) = $first_exon->get_coords();
1022                my ($next_end5, $next_end3) = $next_exon->get_coords();
1023                my $intron_end5 = $first_end3 - 1;
1024                my $intron_end3 = $next_end5 +1;
1025                if ($intron_end5 > $intron_end3) {
1026                    push (@introns, [$intron_end5, $intron_end3]);
1027                }
1028                $first_exon = $next_exon;
1029            }
1030
1031        } else {
1032            die "Strand for gene_obj is not specified." . $gene_obj->toString();
1033        }
1034    }
1035    return (@introns);
1036}
1037
1038
1039
1040
1041
1042#private
1043sub trivial_refinement {
1044    my $self = shift;
1045    my @exons = $self->get_exons();
1046    $self->{num_exons} = scalar(@exons);
1047    my (%mRNAexons, %CDSexons);
1048    foreach my $exon (@exons) {
1049        my ($exon_end5, $exon_end3) = $exon->get_mRNA_exon_end5_end3();
1050        $mRNAexons{$exon_end5} = $exon_end3;
1051        my $cds;
1052        if ($cds = $exon->get_CDS_obj()) {
1053            my ($cds_end5, $cds_end3) = $cds->get_CDS_end5_end3();
1054            $CDSexons{$cds_end5} = $cds_end3;
1055        }
1056    }
1057    my @mRNAexonsEnd5s = sort {$a<=>$b} keys %mRNAexons;
1058    my @CDSexonsEnd5s = sort {$a<=>$b} keys %CDSexons;
1059    my $strand = 0; #initialize.
1060    foreach my $mRNAend5 (@mRNAexonsEnd5s) {
1061        my $mRNAend3 = $mRNAexons{$mRNAend5};
1062        if ($mRNAend5 == $mRNAend3) {next;}
1063        $strand = ($mRNAend5 < $mRNAend3) ? '+':'-';
1064        last;
1065    }
1066    $self->{strand} = $strand;
1067
1068    ## determine gene and model boundaries:
1069    my ($gene_end5, $gene_end3, $model_end5, $model_end3);
1070    my @gene_coords = sort {$a<=>$b} %mRNAexons;
1071    my @model_coords = sort {$a<=>$b} %CDSexons;
1072    my $gene_lend = shift @gene_coords;
1073    my $gene_rend = pop @gene_coords;
1074    ## bound gene by transcript span
1075    ($gene_end5, $gene_end3) = ($strand eq "+") ? ($gene_lend, $gene_rend) : ($gene_rend, $gene_lend);
1076    if (@model_coords) {
1077        ## bound model by protein coding span
1078        my $model_lend = shift @model_coords;
1079        my $model_rend = pop @model_coords;
1080        ($model_end5, $model_end3) = ($strand eq "+") ? ($model_lend, $model_rend) : ($model_rend, $model_lend);
1081    } else {
1082        ## give it gene boundaries instead:
1083        ($model_end5, $model_end3) = ($gene_end5, $gene_end3);
1084    }
1085
1086    $self->{gene_span} = [$gene_end5, $gene_end3];
1087    $self->{gene_length} = abs ($gene_end3 - $gene_end5) + 1;
1088    $self->{mid_pt} = int (($gene_end5 + $gene_end3)/2);
1089    $self->{model_span} = [$model_end5, $model_end3];
1090
1091    ## Refine isoforms if they exist.
1092    if (my @isoforms = $self->get_additional_isoforms()) {
1093        my @gene_span_coords = $self->get_gene_span();
1094        foreach my $isoform (@isoforms) {
1095            $isoform->refine_gene_object();
1096            push (@gene_span_coords, $isoform->get_gene_span());
1097        }
1098        @gene_span_coords = sort {$a<=>$b} @gene_span_coords;
1099        my $lend = shift @gene_span_coords;
1100        my $rend = pop @gene_span_coords;
1101        my $strand = $self->{strand};
1102        if ($strand eq '-') {
1103            ($lend, $rend) = ($rend, $lend);
1104        }
1105        my $gene_length = abs ($lend -$rend) + 1;
1106        foreach my $gene ($self, @isoforms) {
1107            $gene->{gene_span} = [$lend, $rend];
1108            $gene->{gene_length} = $gene_length;
1109        }
1110    }
1111
1112}
1113
1114
1115
1116
1117=over 4
1118
1119=item add_mRNA_exon_obj()
1120
1121B<Description:> Used to add a single mRNA_exon_obj to the Gene_obj
1122
1123B<Parameters:> mRNA_exon_obj
1124
1125B<Returns:> none
1126
1127=back
1128
1129=cut
1130
1131    ;
1132
1133sub add_mRNA_exon_obj {
1134    my ($self) = shift;
1135    my ($mRNA_exon_obj) = shift;
1136    if (!ref($self->{mRNA_exon_objs})) {
1137        $self->{mRNA_exon_objs} = [];
1138    }
1139    my $index = $#{$self->{mRNA_exon_objs}};
1140    $index++;
1141    $self->{mRNA_exon_objs}->[$index] = $mRNA_exon_obj;
1142}
1143
1144#private
1145## forcibly set protein sequence value
1146
1147
1148sub set_protein_sequence {
1149    my $self = shift;
1150    my $protein = shift;
1151    if ($protein) {
1152        $self->{protein_seq} = $protein;
1153        $self->{protein_seq_length} = length ($protein);
1154    } else {
1155        print STDERR "No incoming protein sequence to set to.\n" . $self->toString();
1156    }
1157}
1158
1159#private
1160## forcibly set CDS sequence value
1161sub set_CDS_sequence {
1162    my $self = shift;
1163    my $cds_seq = shift;
1164    if ($cds_seq) {
1165        $self->{CDS_sequence} = $cds_seq;
1166        $self->{CDS_sequence_length} = length ($cds_seq);
1167    } else {
1168        print STDERR "No incoming CDS sequence to set to\n" . $self->toString();
1169    }
1170}
1171
1172#private
1173sub set_cDNA_sequence {
1174    my $self = shift;
1175    my $cDNA_seq = shift;
1176    if ($cDNA_seq) {
1177        $self->{cDNA_sequence} = $cDNA_seq;
1178        $self->{cDNA_sequence_length} = length($cDNA_seq);
1179    } else {
1180        print STDERR "No incoming cDNA sequence to set to.\n" . $self->toString();
1181    }
1182}
1183
1184#private
1185sub set_gene_sequence {
1186    my $self = shift;
1187    my $seq = shift;
1188    if ($seq) {
1189        $self->{gene_sequence} = $seq;
1190        $self->{gene_sequence_length} = length ($seq);
1191    } else {
1192        print STDERR "No incoming gene sequence to set to\n" . $self->toString();
1193    }
1194}
1195
1196
1197=over 4
1198
1199=item create_all_sequence_types()
1200
1201B<Description:> Given a scalar reference to the genomic sequence, the CDS, cDNA, unspliced transcript and protein sequences are constructed and populated within the Gene_obj
1202
1203B<Parameters:> $genomic_seq_ref, [%params]
1204
1205B<Returns:> 0|1
1206
1207returns 1 upon success, 0 upon failure
1208
1209By default, the protein and CDS sequence are populated.  If you want the unspliced genomic sequence, you need to specify this in the attributes:
1210
1211    %params = ( potein => 1,
1212		CDS => 1,
1213		cDNA => 1,
1214		unspliced_transcript => 0)
1215
1216
1217=back
1218
1219=cut
1220
1221
1222## Create all gene sequences (protein, cds, cdna, genomic)
1223sub create_all_sequence_types {
1224    my $self = shift;
1225    my $big_seq_ref = shift;
1226    my %atts = @_;
1227
1228    unless (ref($big_seq_ref) eq 'SCALAR') {
1229        print STDERR "I require a sequence reference to create sequence types\n";
1230        return (undef());
1231    }
1232    $self->create_cDNA_sequence($big_seq_ref) unless (exists($atts{cDNA}) && $atts{cDNA});
1233    $self->create_gene_sequence($big_seq_ref, 1) if ($atts{unspliced_transcript}); #highlight exons by default.
1234
1235    if ($self->is_coding_gene()) {
1236        $self->create_CDS_sequence ($big_seq_ref) unless (exists ($atts{CDS}) && $atts{CDS});
1237        $self->create_protein_sequence($big_seq_ref) unless (exists ($atts{protein}) && $atts{protein});
1238    }
1239
1240    if (my @isoforms = $self->get_additional_isoforms()) {
1241        foreach my $isoform (@isoforms) {
1242            $isoform->create_all_sequence_types($big_seq_ref, %atts);
1243        }
1244    }
1245    return(1);
1246}
1247
1248#private
1249## Create cDNA sequence
1250sub create_cDNA_sequence {
1251    my $self = shift;
1252    my $seq_ref = shift;
1253    my $sequence_ref;
1254    unless ($seq_ref) {
1255        print STDERR "The parent sequence must be specified for the cDNA creation method\n";
1256        return;
1257    }
1258    ## hopefully the sequence came in as a reference.  If not, make one to it.
1259    ## Don't want to pass chromosome sequences in by value!
1260    if (ref($seq_ref)) {
1261        $sequence_ref = $seq_ref;
1262    } else {
1263        $sequence_ref = \$seq_ref;
1264    }
1265    my @exons = $self->get_exons();
1266    my $strand = $self->{strand};
1267    my $cDNA_seq = "";
1268    foreach my $exon_obj (sort {$a->{end5}<=>$b->{end5}} @exons) {
1269        my $c1 = $exon_obj->{end5};
1270        my $c2 = $exon_obj->{end3};
1271        ## sequence retrieval coordinates must be in forward orientation
1272        my ($coord1, $coord2) = ($strand eq '+') ? ($c1, $c2) : ($c2, $c1);
1273        $cDNA_seq .= substr ($$sequence_ref, ($coord1 - 1), ($coord2 - $coord1 + 1));
1274    }
1275    if ($strand eq '-') {
1276        $cDNA_seq = &reverse_complement($cDNA_seq);
1277    }
1278    $self->set_cDNA_sequence($cDNA_seq);
1279    return ($cDNA_seq);
1280}
1281
1282#private
1283## create a CDS sequence, and populate the protein field.
1284sub create_CDS_sequence {
1285    my $self = shift;
1286    my $seq_ref = shift;
1287    my $sequence_ref;
1288    unless ($seq_ref) {
1289        print STDERR "The parent sequence must be specified for the CDS creation method\n";
1290        return;
1291    }
1292
1293    unless ($self->is_coding_gene()) {
1294        print STDERR "Warning: No coding region specified for gene: " . $self->toString();
1295        return("");
1296    }
1297
1298
1299    ## hopefully the sequence came in as a reference.  If not, make one to it.
1300    ## Don't want to pass chromosome sequences in by value!
1301    if (ref($seq_ref)) {
1302        $sequence_ref = $seq_ref;
1303    } else {
1304        $sequence_ref = \$seq_ref;
1305    }
1306    my @exons = $self->get_exons();
1307    my $strand = $self->{strand};
1308    my $cds_seq = "";
1309    foreach my $exon_obj (sort {$a->{end5}<=>$b->{end5}} @exons) {
1310        my $CDS_obj = $exon_obj->get_CDS_obj();
1311        if (ref $CDS_obj) {
1312            my ($c1, $c2) = $CDS_obj->get_CDS_end5_end3();
1313            ## sequence retrieval coordinates must be in forward orientation
1314            my ($coord1, $coord2) = ($strand eq '+') ? ($c1, $c2) : ($c2, $c1);
1315            $cds_seq .= substr ($$sequence_ref, ($coord1 - 1), ($coord2 - $coord1 + 1));
1316        }
1317    }
1318    if ($strand eq '-') {
1319        $cds_seq = &reverse_complement($cds_seq);
1320    }
1321    $self->set_CDS_sequence($cds_seq);
1322
1323    return ($cds_seq);
1324}
1325
1326
1327
1328sub is_coding_gene {
1329    my $self = shift;
1330
1331    if ($self->get_CDS_length()) {
1332        return(1);
1333    }
1334    else {
1335        return(0);
1336    }
1337}
1338
1339
1340
1341#private
1342## Translation requires parent nucleotide sequence (bac, chromosome, whatever).
1343sub create_protein_sequence {
1344    my $self = shift;
1345    my $seq_ref = shift; # optional
1346
1347    unless ($self->is_coding_gene()) {
1348        print STDERR "Warning: No coding sequence for gene: " . $self->toString();
1349        return("");
1350    }
1351
1352    my $cds_sequence = $self->get_CDS_sequence();
1353    unless ($cds_sequence) {
1354
1355        ## if has a CDS, then try to translate it if the genome sequence is available.
1356
1357        unless (ref($seq_ref) eq 'SCALAR') {
1358            print STDERR "I require an assembly sequence ref if the CDS is unavailable\n";
1359            return;
1360        }
1361        $cds_sequence = $self->create_CDS_sequence($seq_ref);
1362    }
1363    my $protein = &Nuc_translator::get_protein ($cds_sequence);
1364    $self->set_protein_sequence($protein);
1365    return ($protein);
1366}
1367
1368#private
1369## Create the unspliced nucleotide transcript
1370sub create_gene_sequence {
1371    my $self = shift;
1372    my $big_seq_ref = shift;
1373    my $highlight_exons_flag = shift; #upcases exons, lowcases introns.
1374    unless (ref ($big_seq_ref) eq 'SCALAR') {
1375        print STDERR "I require a reference to the assembly sequence!!\n";
1376        return (undef());
1377    }
1378    my $strand = $self->{strand};
1379    my ($gene_seq);
1380    if ($highlight_exons_flag) {
1381        my @exons = sort {$a->{end5}<=>$b->{end5}} $self->get_exons();
1382        my $exon = shift @exons;
1383        my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords();
1384        $gene_seq = uc (substr ($$big_seq_ref, $lend - 1, $rend - $lend + 1));
1385        my $prev_rend = $rend;
1386        while (@exons) {
1387            $exon = shift @exons;
1388            ## Add intron, then exon
1389            my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords();
1390            $gene_seq .= lc (substr ($$big_seq_ref, $prev_rend, $lend - $prev_rend-1));
1391            $gene_seq .= uc (substr ($$big_seq_ref, $lend - 1, $rend - $lend + 1));
1392            $prev_rend = $rend;
1393        }
1394
1395    } else { #just get the sequence spanned by min and max coords
1396        my ($coord1, $coord2) = sort {$a<=>$b} $self->get_gene_span();
1397        $gene_seq = substr ($$big_seq_ref, ($coord1 - 1), ($coord2 - $coord1 + 1));
1398    }
1399
1400    $gene_seq = &reverse_complement($gene_seq) if ($strand eq '-');
1401    $self->set_gene_sequence($gene_seq);
1402    return ($gene_seq);
1403}
1404
1405## retrieving the sequences
1406
1407=over 4
1408
1409=item get_protein_sequence()
1410
1411B<Description:> Retrieves the protein sequence
1412
1413B<Parameters:> none
1414
1415B<Returns:> $protein
1416
1417Note: You must have called create_all_sequence_types($genomic_ref) before protein sequence is available for retrieval.
1418
1419
1420=back
1421
1422=cut
1423
1424    ;
1425
1426sub get_protein_sequence {
1427    my $self = shift;
1428    return ($self->{protein_seq});
1429}
1430
1431## alias
1432sub get_protein_seq {
1433    my $self = shift;
1434    return ($self->get_protein_sequence());
1435}
1436
1437
1438
1439=over 4
1440
1441=item get_CDS_sequence()
1442
1443B<Description:> Retrieves the CDS sequence.  The CDS sequence is the protein-coding nucleotide sequence.
1444
1445B<Parameters:> none
1446
1447B<Returns:> $cds
1448
1449Note: You must have called create_all_sequence_types($genomic_ref) before protein sequence is available for retrieval.
1450
1451=back
1452
1453=cut
1454
1455
1456sub get_CDS_sequence {
1457    my $self = shift;
1458    return ($self->{CDS_sequence});
1459}
1460
1461=over 4
1462
1463=item get_cDNA_sequence()
1464
1465B<Description:> Retrieves the tentative cDNA sequence for the Gene.  The cDNA includes the CDS with potential UTR extensions.
1466
1467B<Parameters:> none
1468
1469B<Returns:> $cdna
1470
1471Note: You must have called create_all_sequence_types($genomic_ref) before protein sequence is available for retrieval.
1472
1473
1474=back
1475
1476=cut
1477
1478
1479
1480sub get_cDNA_sequence {
1481    my $self = shift;
1482    return ($self->{cDNA_sequence});
1483}
1484
1485
1486
1487sub get_CDS_length {
1488    my $self = shift;
1489
1490    my $cds_length = 0;
1491
1492    my @exons = $self->get_exons();
1493    foreach my $exon (@exons) {
1494        if (my $cds = $exon->get_CDS_obj()) {
1495            $cds_length += $cds->length();
1496        }
1497    }
1498
1499
1500    return ($cds_length);
1501}
1502
1503sub get_cDNA_length {
1504    my $self = shift;
1505
1506    my $cdna_length = 0;
1507
1508    my @exons = $self->get_exons();
1509    foreach my $exon (@exons) {
1510        $cdna_length += $exon->length();
1511    }
1512
1513    return($cdna_length);
1514
1515}
1516
1517
1518
1519
1520
1521
1522
1523=over 4
1524
1525=item get_gene_sequence()
1526
1527B<Description:> Retrieves the unspliced transcript of the gene.
1528
1529B<Parameters:> none
1530
1531B<Returns:> $unspliced_transcript
1532
1533=back
1534
1535=cut
1536
1537
1538sub get_gene_sequence {
1539    my $self = shift;
1540    return ($self->{gene_sequence});
1541}
1542
1543
1544
1545
1546=over 4
1547
1548=item get_gene_synonyms()
1549
1550B<Description:> Retrieves the Model_feat_name(s) for the synonomous gene models found on other BACs or contigs.
1551
1552B<Parameters:> none
1553
1554B<Returns:> @model_feat_names
1555
1556
1557For Arabidopsis, gene models are found within overlapping regions of BAC sequences, in which the gene models are annotated on both corresponding BACs.  Given a Gene_obj for a model on one BAC, the synomous gene on the overlapping BAC can be identified via this method.
1558
1559
1560=back
1561
1562=cut
1563
1564
1565sub get_gene_synonyms {
1566    my $self = shift;
1567    return (@{$self->{gene_synonyms}});
1568}
1569
1570
1571
1572=over 4
1573
1574=item clear_sequence_info()
1575
1576B<Description:> Clears the sequence fields stored within a Gene_obj, including the CDS, cDNA, gene_sequence, and protein sequence.  Often, these sequence fields, when populated, can consume large amounts of memory in comparison to the coordinate and functional annotation data.  This method is useful to clear this memory when the sequences are not needed.  The create_all_sequence_types($genomic_seq_ref) can be called again later to repopulate these sequences when they are needed.
1577
1578B<Parameters:> none
1579
1580B<Returns:> none
1581
1582=back
1583
1584=cut
1585
1586
1587## sequences consume huge amounts of memory in comparison to other gene features.
1588## want to clear them from time to time to save memory.
1589
1590    ;
1591
1592sub clear_sequence_info {
1593    my $self = shift;
1594    $self->{protein_seq} = undef;
1595    $self->{CDS_sequence} = undef;
1596    $self->{cDNA_sequence} = undef;
1597    $self->{gene_sequence} = undef;
1598}
1599
1600
1601=over 4
1602
1603=item set_gene_type()
1604
1605B<Description:> Sets the type of gene.  Expected types include:
1606
1607    protein-coding #default setting
1608    rRNA
1609    snoRNA
1610    snRNA
1611    tRNA
1612
1613    ...or others as needed.  Nothing is restricted.
1614
1615B<Parameters:> $type
1616
1617B<Returns:> none
1618
1619=back
1620
1621=cut
1622
1623
1624####
1625sub set_gene_type {
1626    my ($self) = shift;
1627    my ($gene_type) = shift;
1628    $self->{gene_type} = $gene_type;
1629}
1630
1631
1632=over 4
1633
1634=item adjust_gene_coordinates()
1635
1636B<Description:> Used to add or subtract a specified number of bases from each gene component coordinate.
1637
1638B<Parameters:> $adj_amount
1639
1640$adj_amoount is a positive or negative integer.
1641
1642B<Returns:> none
1643
1644=back
1645
1646=cut
1647
1648
1649    ;
1650
1651####
1652# add value to all gene component coordinates
1653sub adjust_gene_coordinates {
1654    my $self = shift;
1655    my $adj_amount = shift;
1656    my @exons = $self->get_exons();
1657    foreach my $exon (@exons) {
1658        my ($end5, $end3) = $exon->get_coords();
1659        $exon->set_coords($end5 + $adj_amount, $end3 + $adj_amount);
1660        my $cds = $exon->get_CDS_obj();
1661        if (ref $cds) {
1662            my ($end5, $end3) = $cds->get_coords();
1663            $cds->set_coords($end5 + $adj_amount, $end3 + $adj_amount);
1664        }
1665    }
1666
1667    ## don't forget about alt splicing isoforms!
1668    my @isoforms = $self->get_additional_isoforms();
1669    foreach my $isoform (@isoforms) {
1670        $isoform->adjust_gene_coordinates($adj_amount);
1671    }
1672    $self->refine_gene_object();
1673}
1674
1675
1676
1677
1678=over 4
1679
1680=item toString()
1681
1682B<Description:> Textually describes the Gene_obj including coordinates and attributes.
1683
1684B<Parameters:> <%attributes_list>
1685
1686%attributes_list is optional and can control whether certain attributes are included in the textual output
1687
1688Default settings are:
1689
1690    %attributes_list = (
1691			 -showIsoforms => 1,  #set to 0 to avoid isoform info to the text output.
1692			 -showSeqs => 0  #set to 1 for avoiding protein, cds, genomic, cdna seqs as output.
1693			 )
1694
1695B<Returns:> $text
1696
1697=back
1698
1699=cut
1700
1701    ;
1702
1703
1704## retrieve text output describing the gene.
1705sub toString {
1706    my $self = shift;
1707    my %atts = @_;
1708    # atts defaults:
1709    #       -showIsoforms=>1
1710    #       -showSeqs => 0
1711
1712    my $output = "";
1713    foreach my $key (keys %$self) {
1714        my $value = $self->{$key};
1715        unless (defined $value) { next;}
1716        if (ref $value) {
1717            if ($key =~ /secondary/ && ref $value eq "ARRAY") {
1718                foreach my $val (@$value) {
1719                    $output .= "\t\t$key\t$val\n";
1720                }
1721            }
1722
1723
1724        } else {
1725            if ($self->{is_pseudogene} && $key =~ /cds|cdna|protein/i && $key =~ /seq/) {
1726                next;
1727            }
1728            if ((!$atts{-showSeqs}) && $key =~/seq/) { next; }
1729            if ( ($value eq '0' || !defined($value)) && $key !~/^is_/) { next;} #dont print unpopulated info.
1730            $output .= "\t$key:\t$value\n";
1731        }
1732    }
1733    $output .= "\tgene_synonyms: @{$self->{gene_synonyms}}\n";
1734
1735    $output .=  "\tmRNA_coords\t";
1736
1737    if (ref ($self->{mRNA_coords}) eq "HASH") {
1738        foreach my $end5 (sort {$a<=>$b} keys %{$self->{mRNA_coords}}) {
1739            $output .=  "$end5-$self->{mRNA_coords}->{$end5} ";
1740        }
1741    }
1742    $output .= "\n"
1743        . "\tCDS_coords\t";
1744    if (ref ($self->{CDS_coords}) eq "HASH") {
1745        foreach my $end5 (sort {$a<=>$b} keys %{$self->{CDS_coords}}) {
1746            $output .= "$end5-$self->{CDS_coords}->{$end5} ";
1747        }
1748    }
1749
1750    my @exons = $self->get_exons();
1751    foreach my $exon (@exons) {
1752        $output .=  "\n\t\tRNA-exon: $exon->{end5}, $exon->{end3}\t";
1753        my $cds = $exon->{CDS_exon_obj};
1754        if ($cds) {
1755            $output .= "CDS-exon: $cds->{end5}, $cds->{end3}";
1756        }
1757    }
1758
1759    if (ref $self->{gene_span}) {
1760        my ($gene_end5, $gene_end3) = @{$self->{gene_span}};
1761        $output .= "\n\tgene_span: $gene_end5-$gene_end3";
1762    }
1763    if (ref $self->{model_span}) {
1764        my ($model_end5, $model_end3) = @{$self->{model_span}};
1765        $output .= "\n\tmodel_span: $model_end5-$model_end3";
1766    }
1767    my @gene_ontology_objs = $self->get_gene_ontology_objs();
1768    if (@gene_ontology_objs) {
1769        $output .= "\n\tGene Ontology Assignments:\n";
1770        foreach my $go_assignment (@gene_ontology_objs) {
1771            $output .= "\t" . $go_assignment->toString();
1772        }
1773    }
1774
1775    unless (defined ($atts{-showIsoforms}) && $atts{-showIsoforms} == 0) {
1776        foreach my $isoform ($self->get_additional_isoforms()) {
1777            $output .= "\n\n\tISOFORM:\n" . $isoform->toString();
1778        }
1779    }
1780    $output .= "\n\n"; #spacer at terminus
1781    return ($output);
1782}
1783
1784
1785####
1786## Splice site validation section
1787####
1788
1789=over 4
1790
1791=item validate_splice_sites()
1792
1793B<Description:> Validates the presence of consensus splice sites
1794
1795B<Parameters:> $genomic_seq_ref
1796
1797$genomic_seq_ref is a scalar reference to the string containing the genomic sequence.
1798
1799B<Returns:> $errors
1800
1801If the empty string ("") is returned, then no inconsistencies were identified.
1802
1803=back
1804
1805=cut
1806
1807    ;
1808
1809####
1810sub validate_splice_sites {
1811    my $self = shift;
1812    my $asmbl_seq_ref = shift;
1813    unless (ref ($asmbl_seq_ref)) {
1814        print STDERR "I require a sequence reference\n";
1815        return (undef());
1816    }
1817    my $error_string = "";
1818    my $strand = $self->{strand};
1819    my @exons = $self->get_exons();
1820    my $num_exons = $#exons + 1;
1821    if ($num_exons == 1) {
1822        #no splice sites to confirm.
1823        return ("");
1824    }
1825    for (my $i = 1; $i <= $num_exons; $i++) {
1826        my $exon_type;
1827        if ($i == 1) {
1828            $exon_type = "initial";
1829        } elsif ($i == $num_exons) {
1830            $exon_type = "terminal";
1831        } else {
1832            $exon_type = "internal";
1833        }
1834        my $exon = $exons[$i - 1];
1835        my ($exon_end5, $exon_end3) = $exon->get_mRNA_exon_end5_end3();
1836        my ($coord1, $coord2) = sort {$a<=>$b} ($exon_end5, $exon_end3);
1837        ## get two coordinate sets corresponding to potential splice sites
1838        my $splice_1_start = $coord1-2-1;
1839        my $splice_2_start = $coord2-1+1;
1840        #print "confirming splice sites at "  . ($splice_1_start +1) . " and " . ($splice_2_start + 1) . "\n"if $SEE;
1841        my $splice_1 = substr ($$asmbl_seq_ref, $splice_1_start, 2);
1842        my $splice_2 = substr ($$asmbl_seq_ref, $splice_2_start, 2);
1843        my ($acceptor, $donor) = ($strand eq '+') ? ($splice_1, $splice_2) : (&reverse_complement($splice_2), &reverse_complement($splice_1));
1844        my $check_acceptor = ($acceptor =~ /ag/i);
1845        my $check_donor = ($donor =~ /gt|gc/i);
1846        ## associate results of checks with exon type.
1847        if ($exon_type eq "initial" || $exon_type eq "internal") {
1848            unless ($check_donor) {
1849                $error_string .= "non-consensus $donor donor splice site at $coord1\n";
1850            }
1851        }
1852
1853        if ($exon_type eq "internal" || $exon_type eq "terminal") {
1854            unless ($check_acceptor) {
1855                $error_string .=  "\tnon-consensus $acceptor acceptor splice site at $coord2\n";
1856            }
1857        }
1858    }
1859    return ($error_string);
1860}
1861
1862
1863
1864=over 4
1865
1866=item get_annot_text()
1867
1868B<Description:> Provides basic functional annotation for a Gene_obj
1869
1870B<Parameters:> none
1871
1872B<Returns:> $string
1873
1874$string includes locus, pub_locus, com_name, and pub_comment
1875
1876=back
1877
1878=cut
1879
1880
1881    ;
1882
1883####
1884sub get_annot_text {
1885    my $self = shift;
1886    my $locus = $self->{locus};
1887    my $pub_locus = $self->{pub_locus};
1888    my $com_name = $self->{com_name};
1889    my $pub_comment = $self->{pub_comment};
1890    my $text = "";
1891    foreach my $token ($locus, $pub_locus, $com_name, $pub_comment) {
1892        if ($token) {
1893            $text .= "$token ";
1894        }
1895    }
1896    return ($text);
1897}
1898
1899
1900
1901=over 4
1902
1903=item add_isoform()
1904
1905B<Description:> Adds a Gene_obj to an existing Gene_obj as an alternative splicing variant.
1906
1907B<Parameters:> Gene_obj
1908
1909B<Returns:> none
1910
1911=back
1912
1913=cut
1914
1915    ;
1916sub add_isoform {
1917    my $self = shift;
1918    my @gene_objs = @_;
1919    foreach my $gene_obj (@gene_objs) {
1920        $self->{num_additional_isoforms}++;
1921        push (@{$self->{additional_isoforms}}, $gene_obj);
1922    }
1923}
1924
1925
1926
1927
1928
1929=over 4
1930
1931=item has_additional_isoforms()
1932
1933B<Description:> Provides number of additional isoforms.  Typically used as a boolean.
1934
1935B<Parameters:> none
1936
1937B<Returns:> number of additional isoforms (int)
1938
1939If no additional isoforms exist, returns 0
1940
1941
1942boolean usage:
1943
19440 = false (has no more)
1945nonzero = true (has additional isoforms)
1946
1947=back
1948
1949=cut
1950
1951sub has_additional_isoforms {
1952    my $self = shift;
1953    return ($self->{num_additional_isoforms});
1954}
1955
1956
1957
1958=over 4
1959
1960=item delete_isoforms()
1961
1962B<Description:> removes isoforms stored in this Gene_obj (assigning to a new anonymous arrayref)
1963
1964B<Parameters:> Gene_obj
1965
1966B<Returns:> none
1967
1968=back
1969
1970=cut
1971
1972sub delete_isoforms {
1973    my $self = shift;
1974    $self->{num_additional_isoforms} = 0;
1975    $self->{additional_isoforms} = [];
1976}
1977
1978
1979
1980
1981
1982=over 4
1983
1984=item get_additional_isoforms()
1985
1986B<Description:> Retrieves the additional isoforms for a given Gene_obj
1987
1988B<Parameters:> none
1989
1990B<Returns:> @Gene_objs
1991
1992If no additional isoforms exist, an empty array is returned.
1993
1994=back
1995
1996=cut
1997
1998
1999sub get_additional_isoforms {
2000    my $self = shift;
2001    return (@{$self->{additional_isoforms}});
2002}
2003
2004
2005
2006=over 4
2007
2008=item get_orientation()
2009
2010B<Description:> Retrieves the strand orientation of the Gene_obj
2011
2012B<Parameters:> none
2013
2014B<Returns:> +|-
2015
2016=back
2017
2018=cut
2019
2020
2021sub get_orientation {
2022    my $self = shift;
2023    return ($self->{strand});
2024}
2025
2026
2027
2028sub get_strand { ## preferred
2029	my $self = shift;
2030	return($self->get_orientation());
2031}
2032
2033
2034
2035=over 4
2036
2037=item add_gene_ontology_objs()
2038
2039B<Description:> Adds a list of Gene_ontology objects to a Gene_obj
2040
2041B<Parameters:> @Gene_ontology_objs
2042
2043@Gene_ontology_objs is a list of objects instantiated from Gene_ontology.pm
2044
2045B<Returns:> none
2046
2047=back
2048
2049=cut
2050
2051
2052sub add_gene_ontology_objs {
2053    my ($self, @ontology_objs) = @_;
2054    push (@{$self->{GeneOntology}}, @ontology_objs);
2055}
2056
2057
2058
2059=over 4
2060
2061=item get_gene_ontology_objs()
2062
2063B<Description:> Retrieves Gene_ontology objs assigned to the Gene_obj
2064
2065B<Parameters:> none
2066
2067B<Returns:> @Gene_ontology_objs
2068
2069@Gene_ontology_objs are objects instantiated from package Gene_ontology  (See Gene_ontology.pm)
2070
2071=back
2072
2073=cut
2074
2075    ;
2076
2077sub get_gene_ontology_objs {
2078    my $self = shift;
2079    if (ref ($self->{GeneOntology})) {
2080        return (@{$self->{GeneOntology}});
2081    } else {
2082        return (());
2083    }
2084}
2085
2086
2087=over 4
2088
2089=item set_5prime_partial()
2090
2091B<Description:> Sets the status of the is_5prime_partial attribute
2092
2093B<Parameters:> 1|0
2094
2095B<Returns:> none
2096
2097
20985prime partials are partial on their 5prime end and lack start codons.
2099
2100
2101=back
2102
2103=cut
2104
2105sub set_5prime_partial() {
2106    my $self = shift;
2107    my $value = shift;
2108    $self->{is_5prime_partial} = $value;
2109}
2110
2111
2112
2113=over 4
2114
2115=item set_3prime_partial()
2116
2117B<Description:> Sets the is_3prime_partial status
2118
2119B<Parameters:> 1|0
2120
2121B<Returns:> none
2122
21233prime partials are partial on their 3prime end and lack stop codons.
2124
2125=back
2126
2127=cut
2128
2129
2130sub set_3prime_partial() {
2131    my $self = shift;
2132    my $value = shift;
2133    $self->{is_3prime_partial} = $value;
2134}
2135
2136
2137
2138=over 4
2139
2140=item is_5prime_partial()
2141
2142B<Description:> Retrieves the 5-prime partial status of the gene.
2143
2144B<Parameters:> none
2145
2146B<Returns:> 1|0
2147
2148=back
2149
2150=cut
2151
2152
2153sub is_5prime_partial() {
2154    my $self = shift;
2155    return ($self->{is_5prime_partial});
2156}
2157
2158
2159=over 4
2160
2161=item is_3prime_partial()
2162
2163B<Description:> Retrieves the 3-prime partial status of the gene.
2164
2165B<Parameters:> none
2166
2167B<Returns:> 1|0
2168
2169=back
2170
2171=cut
2172
2173
2174sub is_3prime_partial() {
2175    my $self = shift;
2176    return ($self->{is_3prime_partial});
2177}
2178
2179=over 4
2180
2181=item get_5prime_UTR_coords
2182
2183
2184B<Description:> returns a list of coordinate pairs corresponding to the 5\' UTR coordinates
2185
2186B<Parameters:> none
2187
2188B<Returns:> ([end5,end3], ...) or empty list if none exist
2189
2190=back
2191
2192=cut
2193
2194
2195    ;
2196
2197sub get_5prime_UTR_coords {
2198    my $self = shift;
2199
2200    my $strand = $self->get_orientation();
2201
2202    my @exons = $self->get_exons();
2203
2204    my $seen_CDS_flag = 0;
2205
2206    my @utr_coords;
2207    foreach my $exon (@exons) { #relying on a sorted list
2208        my ($exon_end5, $exon_end3) = $exon->get_coords();
2209        if (my $cds = $exon->get_CDS_obj()) {
2210            my ($cds_end5, $cds_end3) = $cds->get_coords();
2211            if ($exon_end5 != $cds_end5) {
2212                my $adj_utr_end3_coord = ($strand eq '+') ? ($cds_end5 -1) : ($cds_end5 +1);
2213                push (@utr_coords, [$exon_end5, $adj_utr_end3_coord]);
2214            }
2215
2216            $seen_CDS_flag = 1;
2217
2218        } else {
2219            push (@utr_coords, [$exon_end5, $exon_end3]);
2220        }
2221
2222        if ($seen_CDS_flag) {
2223            last;
2224        }
2225
2226    }
2227
2228    return (@utr_coords);
2229}
2230
2231
2232
2233=over 4
2234
2235=item get_3prime_UTR_coords
2236
2237
2238B<Description:> returns a list of coordinate pairs corresponding to the 3\' UTR coordinates
2239
2240B<Parameters:> none
2241
2242B<Returns:> ([end5,end3], ...) or empty list if none exist
2243
2244=back
2245
2246=cut
2247
2248    ;
2249
2250sub get_3prime_UTR_coords {
2251    my $self = shift;
2252
2253    my $strand = $self->get_orientation();
2254
2255    my @exons = reverse $self->get_exons();
2256
2257    my @utr_coords;
2258    my $seen_CDS_flag = 0;
2259    foreach my $exon (@exons) { #relying on a reverse sorted list (3' exons should come first)
2260        my ($exon_end5, $exon_end3) = $exon->get_coords();
2261        if (my $cds = $exon->get_CDS_obj()) {
2262            $seen_CDS_flag = 1;
2263            my ($cds_end5, $cds_end3) = $cds->get_coords();
2264            if ($exon_end3 != $cds_end3) {
2265                my $adj_utr_end5_coord = ($strand eq '+') ? ($cds_end3 +1) : ($cds_end3 -1);
2266                push (@utr_coords, [$adj_utr_end5_coord, $exon_end3]);
2267            }
2268
2269        } else {
2270            push (@utr_coords, [$exon_end5, $exon_end3]);
2271        }
2272        if ($seen_CDS_flag) {
2273            last;
2274        }
2275    }
2276
2277    if (@utr_coords) {
2278        @utr_coords = reverse @utr_coords;
2279    }
2280
2281    return (@utr_coords);
2282}
2283
2284
2285
2286
2287
2288=over 4
2289
2290=item has_UTRs()
2291
2292B<Description:> indicates presence of UTR annotated in Gene
2293
2294B<Parameters:> none
2295
2296B<Returns:> ( has_5prime_UTR() || has_3prime_UTR() )
2297
2298=back
2299
2300=cut
2301
2302sub has_UTRs {
2303    my $self = shift;
2304    return ( ($self->has_5prime_UTR() || $self->has_3prime_UTR() ) );
2305}
2306
2307
2308
2309####
2310sub has_5prime_UTR {
2311    my $self = shift;
2312    return ($self->get_5prime_UTR_length() > 2);
2313}
2314
2315####
2316sub has_3prime_UTR {
2317    my $self = shift;
2318    return($self->get_3prime_UTR_length() > 2);
2319}
2320
2321
2322####
2323sub get_5prime_UTR_length {
2324    my $self = shift;
2325    my @prime5_UTR_coords = $self->get_5prime_UTR_coords();
2326
2327    my $len = 0;
2328    for my $coordset (@prime5_UTR_coords) {
2329        my ($lend, $rend) = sort {$a<=>$b} @$coordset;
2330        $len += ($rend - $lend) + 1;
2331    }
2332    return($len);
2333}
2334
2335
2336####
2337sub get_3prime_UTR_length {
2338    my $self = shift;
2339    my @prime3_UTR_coords = $self->get_3prime_UTR_coords();
2340
2341    my $len = 0;
2342    for my $coordset (@prime3_UTR_coords) {
2343        my ($lend, $rend) = sort {$a<=>$b} @$coordset;
2344        $len += ($rend - $lend) + 1;
2345    }
2346    return($len);
2347}
2348
2349
2350
2351=over 4
2352
2353=item get_5prime_UTR_sequence()
2354
2355B<Description:> retrieves 5prime UTR sequence
2356
2357B<Parameters:> genome sequence reference
2358
2359B<Returns:> string
2360
2361=back
2362
2363=cut
2364
2365####
2366sub get_5prime_UTR_sequence {
2367    my $self = shift;
2368    my ($genome_seq_ref) = @_;
2369    unless (ref $genome_seq_ref eq "SCALAR") {
2370        confess "error, require genome sequence string reference";
2371    }
2372
2373    unless ($self->has_5prime_UTR()) {
2374        return "";
2375    }
2376
2377    my $orientation = $self->get_orientation();
2378    my @coords = $self->get_5prime_UTR_coords();
2379
2380    @coords = sort {$a->[0]<=>$b->[0]} @coords;
2381
2382    my $UTR_seq = "";
2383    foreach my $coordset (@coords) {
2384        my ($lend, $rend) = sort {$a<=>$b} @$coordset;
2385
2386        my $length = $rend - $lend + 1;
2387        $UTR_seq .= substr($$genome_seq_ref, $lend - 1, $length);
2388    }
2389
2390    if ($orientation eq '-') {
2391        $UTR_seq = &reverse_complement($UTR_seq);
2392    }
2393
2394    ## verify:
2395    $self->create_all_sequence_types($genome_seq_ref);
2396    my $cDNA = $self->get_cDNA_sequence();
2397
2398
2399    unless (index($cDNA, $UTR_seq) == 0) {
2400        confess "Error, couldn't find UTR in cDNA";
2401    }
2402
2403
2404    return ($UTR_seq);
2405}
2406
2407
2408=over 4
2409
2410=item get_3prime_UTR_sequence()
2411
2412B<Description:> retrieves 5prime UTR sequence
2413
2414B<Parameters:> genome sequence reference
2415
2416B<Returns:> string
2417
2418=back
2419
2420=cut
2421
2422####
2423sub get_3prime_UTR_sequence {
2424    my $self = shift;
2425    my ($genome_seq_ref) = @_;
2426    unless (ref $genome_seq_ref eq "SCALAR") {
2427        confess "error, require genome sequence string reference";
2428    }
2429
2430    unless ($self->has_3prime_UTR()) {
2431        return "";
2432    }
2433
2434    my $orientation = $self->get_orientation();
2435    my @coords = $self->get_3prime_UTR_coords();
2436
2437    @coords = sort {$a->[0]<=>$b->[0]} @coords;
2438
2439    my $UTR_seq = "";
2440    foreach my $coordset (@coords) {
2441        my ($lend, $rend) = sort {$a<=>$b} @$coordset;
2442
2443        my $length = $rend - $lend + 1;
2444        $UTR_seq .= substr($$genome_seq_ref, $lend - 1, $length);
2445    }
2446
2447    if ($orientation eq '-') {
2448        $UTR_seq = &reverse_complement($UTR_seq);
2449    }
2450
2451    ## verify:
2452    $self->create_all_sequence_types($genome_seq_ref);
2453    my $cDNA = $self->get_cDNA_sequence();
2454    my $cDNA_length = length($cDNA);
2455    my $utr_length = length($UTR_seq);
2456
2457    my $utr_start_pos = $cDNA_length - $utr_length + 1;
2458
2459    unless ((my $cDNA_utr =  lc substr($cDNA, $utr_start_pos - 1, $utr_length)) eq lc $UTR_seq) {
2460        confess "Error, 3' UTR extracted from cDNA is different from UTR sequence extracted from genome.\n"
2461            . "cDNA_utr:\n$cDNA_utr\nUTR_from_genome:\n$UTR_seq\n\n";
2462    }
2463
2464
2465    return ($UTR_seq);
2466}
2467
2468
2469
2470
2471=over 4
2472
2473=item trim_UTRs()
2474
2475B<Description:> Trims the UTR of the Gene_obj so that the Exon coordinates are identical to the CDS coordinates.  Exons which lack CDS components and are completely UTR are removed.
2476
2477B<Parameters:> none
2478
2479B<Returns:> none
2480
2481=back
2482
2483=cut
2484
2485    ;
2486
2487sub trim_UTRs {
2488    my $self = shift;
2489
2490    ## adjust exon coordinates to CDS coordinates.
2491    ## if cds doesn't exist, rid exon:
2492
2493    my @new_exons;
2494
2495    my @exons = $self->get_exons();
2496    foreach my $exon (@exons) {
2497        if (my $cds = $exon->get_CDS_obj()) {
2498            my ($exon_end5, $exon_end3) = $exon->get_coords();
2499            my ($cds_end5, $cds_end3) = $cds->get_coords();
2500
2501            if ($exon_end5 != $cds_end5 || $exon_end3 != $cds_end3) {
2502                $exon->set_coords($cds_end5, $cds_end3);
2503            }
2504            push (@new_exons, $exon);
2505        }
2506    }
2507    $self->{mRNA_exon_objs} = 0; #clear current gene structure
2508    $self->{mRNA_exon_objs} = \@new_exons; #replace gene structure
2509    $self->refine_gene_object(); #update
2510    return ($self);
2511}
2512
2513
2514
2515
2516=over 4
2517
2518=item remove_CDS_exon()
2519
2520B<Description:> Removes any existing CDS_exon_obj from this mRNA_exon_obj
2521
2522B<Parameters:> none
2523
2524B<Returns:> none
2525
2526=back
2527
2528=cut
2529
2530sub remove_CDS_exon {
2531    my $self = shift;
2532    $self->{CDS_exon_obj} = 0;
2533}
2534
2535
2536
2537
2538
2539=over 4
2540
2541=item get_gene_names()
2542
2543B<Description:> Retrieves gene names  (primary gene name followed by secondary gene names, "$;" delimited.
2544
2545B<Parameters:> none
2546
2547B<Returns:> string
2548
2549     see $gene_obj->{gene_name}
2550     see $gene_obj->get_secondary_names()
2551
2552secondary gene names sorted lexicographically
2553
2554
2555=back
2556
2557=cut
2558
2559
2560
2561
2562####
2563sub get_gene_names {
2564    my $gene_obj = shift;
2565    my @gene_names;
2566    if ($gene_obj->{gene_name}) {
2567        push (@gene_names, $gene_obj->{gene_name});
2568    }
2569    if (my @secondary_names = $gene_obj->get_secondary_gene_names()) {
2570        push (@gene_names, @secondary_names);
2571    }
2572    my $ret_gene_names = join ("$;" , @gene_names);
2573    return ($ret_gene_names);
2574}
2575
2576
2577
2578=over 4
2579
2580=item get_secondary_gene_names()
2581
2582B<Description:> Retrieves secondary gene names as a "$;" delimited string.
2583
2584B<Parameters:> none
2585
2586B<Returns:> string
2587
2588=back
2589
2590=cut
2591
2592
2593####
2594sub get_secondary_gene_names {
2595    my ($gene_obj) = @_;
2596    return (sort @{$gene_obj->{secondary_gene_names}});
2597}
2598
2599
2600
2601
2602=over 4
2603
2604=item get_product_names()
2605
2606B<Description:> Retrieves product name, with the primary product name followed by secondary product names, delimited by "$;"
2607
2608B<Parameters:> none
2609
2610B<Returns:> string
2611
2612    see $gene_obj->{com_name} for primary product name
2613    see $gene_obj->get_secondary_product_names()
2614
2615=back
2616
2617=cut
2618
2619    ;
2620
2621####
2622sub get_product_names {
2623    my $gene_obj = shift;
2624    my @product_names;
2625    if ($gene_obj->{com_name}) {
2626        push (@product_names, $gene_obj->{com_name});
2627    }
2628    if (my @secondary_names = $gene_obj->get_secondary_product_names()) {
2629        push (@product_names, @secondary_names);
2630    }
2631    my $ret_product_names = join ("$;", @product_names);
2632    return ($ret_product_names);
2633}
2634
2635
2636
2637=over 4
2638
2639=item get_secondary_product_names()
2640
2641B<Description:> Retrieves secondary product names, delimited by "$;" and sorted lexicographically.
2642
2643B<Parameters:> none
2644
2645B<Returns:> string
2646
2647=back
2648
2649=cut
2650
2651
2652####
2653sub get_secondary_product_names {
2654    my ($gene_obj) = @_;
2655    return (sort @{$gene_obj->{secondary_product_names}});
2656}
2657
2658
2659
2660=over 4
2661
2662=item get_gene_symbols()
2663
2664B<Description:> Retrieves primary gene symbol followed by secondary gene symbols, delimited by "$;"
2665
2666B<Parameters:> none
2667
2668B<Returns:> string
2669
2670    see $gene_obj->{gene_sym}
2671    see $gene_obj->get_secondary_gene_symbols()
2672
2673=back
2674
2675=cut
2676
2677    ;
2678
2679####
2680sub get_gene_symbols {
2681    my $gene_obj = shift;
2682    my @gene_symbols;
2683    if ($gene_obj->{gene_sym}) {
2684        push (@gene_symbols, $gene_obj->{gene_sym});
2685    }
2686    if (my @secondary_symbols = $gene_obj->get_secondary_gene_symbols()) {
2687        push (@gene_symbols, @secondary_symbols);
2688    }
2689    my $ret_gene_symbols = join ("$;", @gene_symbols);
2690    return ($ret_gene_symbols);
2691}
2692
2693
2694=over 4
2695
2696=item get_secondary_gene_symbols()
2697
2698B<Description:> Retrieves secondary gene symbols, delimited by "$;" and sorted lexicographically
2699
2700B<Parameters:> none
2701
2702B<Returns:> string
2703
2704=back
2705
2706=cut
2707
2708
2709####
2710sub get_secondary_gene_symbols {
2711    my ($gene_obj) = @_;
2712    return (sort @{$gene_obj->{secondary_gene_symbols}});
2713}
2714
2715
2716
2717=over 4
2718
2719=item get_ec_numbers()
2720
2721B<Description:> Retrieves primary EC number followed by secondary EC numbers, "$;" delimited
2722
2723B<Parameters:> none
2724
2725B<Returns:> string
2726
2727    see $gene_obj->{ec_num}
2728    see $gene_obj->get_secondary_ec_numbers()
2729
2730=back
2731
2732=cut
2733
2734    ;
2735
2736####
2737sub get_ec_numbers {
2738    my $gene_obj = shift;
2739    my @ec_numbers;
2740    if ($gene_obj->{ec_num}) {
2741        push (@ec_numbers, $gene_obj->{ec_num});
2742    }
2743    if (my @secondary_ec_numbers = $gene_obj->get_secondary_ec_numbers()) {
2744        push (@ec_numbers, @secondary_ec_numbers);
2745    }
2746    my $ret_ec_numbers = join ("$;", @ec_numbers);
2747    return ($ret_ec_numbers);
2748}
2749
2750
2751
2752=over 4
2753
2754=item get_secondary_ec_numbers()
2755
2756B<Description:> Retrieves secondary EC numbers, "$;" delimited and sorted lexicographically
2757
2758B<Parameters:> none
2759
2760B<Returns:> string
2761
2762
2763=back
2764
2765=cut
2766
2767
2768####
2769sub get_secondary_ec_numbers {
2770    my ($gene_obj) = @_;
2771    return (sort @{$gene_obj->{secondary_ec_numbers}});
2772}
2773
2774
2775
2776=over 4
2777
2778=item add_secondary_gene_names()
2779
2780B<Description:> Adds secondary gene name(s)
2781
2782B<Parameters:> (gene_name_1, gene_name_2, ....)
2783
2784Single gene name or list of gene names is allowed
2785
2786
2787B<Returns:> none
2788
2789=back
2790
2791=cut
2792
2793
2794
2795####
2796sub add_secondary_gene_names {
2797    my ($gene_obj, @gene_names) = @_;
2798    push (@{$gene_obj->{secondary_gene_names}}, @gene_names);
2799}
2800
2801
2802=over 4
2803
2804=item add_secondary_product_names()
2805
2806B<Description:> Adds secondary product names
2807
2808B<Parameters:> (product_name_1, product_name_2, ...)
2809
2810Single or list of product names as parameter
2811
2812B<Returns:> none
2813
2814Primary gene name added directly as an attribute like so
2815    $gene_obj->{gene_name} = name
2816
2817=back
2818
2819=cut
2820
2821
2822####
2823sub add_secondary_product_names {
2824    my ($gene_obj, @product_names) = @_;
2825    &trim_leading_trailing_ws(\@product_names);
2826    push (@{$gene_obj->{secondary_product_names}}, @product_names);
2827}
2828
2829
2830=over 4
2831
2832=item add_secondary_gene_symbols()
2833
2834B<Description:> Add secondary gene symbols
2835
2836B<Parameters:> (gene_symbol_1, gene_symbol_2, ...)
2837
2838String or list context
2839
2840B<Returns:> none
2841
2842Primary gene_symbol added directly as attribute like so:
2843    $gene_obj->{gene_sym} = symbol
2844
2845=back
2846
2847=cut
2848
2849
2850####
2851sub add_secondary_gene_symbols {
2852    my ($gene_obj, @gene_symbols) = @_;
2853    &trim_leading_trailing_ws(\@gene_symbols);
2854    push (@{$gene_obj->{secondary_gene_symbols}}, @gene_symbols);
2855}
2856
2857
2858
2859
2860
2861=over 4
2862
2863=item add_secondary_ec_numbers()
2864
2865B<Description:> Add secondary Enzyme Commission (EC) numbers
2866
2867B<Parameters:> (EC_1, EC_2, ...)
2868
2869String or list context
2870
2871B<Returns:> none
2872
2873
2874Primary EC number added directly as an attribute like so:
2875    $gene_obj->{ec_num} = EC_number
2876
2877=back
2878
2879=cut
2880
2881
2882####
2883sub add_secondary_ec_numbers {
2884    my ($gene_obj, @ec_numbers) = @_;
2885    &trim_leading_trailing_ws(\@ec_numbers);
2886    push (@{$gene_obj->{secondary_ec_numbers}}, @ec_numbers);
2887}
2888
2889####
2890sub to_alignment_GFF3_format {
2891    my ($gene_obj, $id, $target, $source) = @_;
2892
2893    unless (defined $source) {
2894        $source = ".";
2895    }
2896
2897    ## Note, only examines gene_obj and doesn't go deeper into alt-splicing layers, ... send isoforms in as separate objs.
2898
2899    unless ( (ref $gene_obj)  && defined($id) && defined($target)) {
2900        croak "Error, need gene_obj, id, and target names as params";
2901    }
2902
2903    my $gff3_alignment_text = "";
2904
2905    my $orient = $gene_obj->get_orientation();
2906    my $scaff = $gene_obj->{asmbl_id};
2907
2908    my @exons = sort {$a->{end5}<=>$b->{end5}} $gene_obj->get_exons();
2909
2910    if ($orient eq '-') {
2911        @exons = reverse @exons;
2912    }
2913
2914
2915    my $match_lend = 0;
2916
2917    foreach my $exon (@exons) {
2918
2919        my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords();
2920
2921        my $m_lend = $match_lend + 1;
2922        my $m_rend = $match_lend + ($rend - $lend + 1);
2923
2924
2925        $gff3_alignment_text .= join("\t", $scaff, $source, "match", $lend, $rend, "100", $orient, '.', # giving everything 100% identity since genome-based
2926                                     "ID=$id;Target=$target $m_lend $m_rend +") . "\n";
2927
2928
2929        $match_lend = $m_rend;
2930
2931
2932    }
2933
2934    return($gff3_alignment_text);
2935}
2936
2937
2938
2939
2940####
2941sub to_transcript_GTF_format {
2942	my ($gene_obj) = @_;
2943
2944	## no worries about protein-coding regions.  Only report transcripts and exons tied to a particular gene.
2945	## used with cufflinks package for computing FPKM values
2946
2947	my $gtf_text = "";
2948
2949	foreach my $gene ($gene_obj, $gene_obj->get_additional_isoforms()) {
2950
2951		my $gene_id = $gene->{TU_feat_name} || "";
2952		my $transcript_id = $gene->{Model_feat_name} || "";
2953		my $asmbl_id = $gene_obj->{asmbl_id};
2954		my ($lend, $rend) = sort {$a<=>$b} $gene_obj->get_transcript_span();
2955		my $orientation = $gene_obj->get_orientation();
2956
2957        my $com_name = $gene_obj->{com_name} || "";
2958        $com_name =~ s/;/_/g;
2959        $com_name =~ s/\"//g;
2960
2961
2962        if ($gene->{gene_type} eq "protein-coding") {
2963            my @exons = $gene->get_exons();
2964
2965            $gtf_text .= join("\t", $asmbl_id, ".", "transcript", $lend, $rend, ".", $orientation, ".",
2966                              "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; name \"$com_name\";") . "\n";
2967
2968            foreach my $exon (@exons) {
2969                my ($lend, $rend) = sort {$a<=>$b} $exon->get_coords();
2970
2971                $gtf_text .= join("\t", $asmbl_id, ".", "exon", $lend, $rend, ".", $orientation, ".",
2972                                  "gene_id \"$gene_id\"; transcript_id \"$transcript_id\";") . "\n";
2973
2974            }
2975
2976        }
2977        else {
2978
2979            ## non-protein-coding features
2980            $gtf_text .= join("\t", $asmbl_id, ".", $gene->{gene_type}, $lend, $rend, ".", $orientation, ".",
2981                                  "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; name \"$com_name\";") . "\n";
2982
2983
2984        }
2985
2986        $gtf_text .= "\n";
2987    }
2988
2989
2990	return($gtf_text);
2991}
2992
2993
2994
2995=over 4
2996
2997=item to_GTF_format()
2998
2999B<Description:> Outputs text corresponding to the representation of the gene in GTF format.
3000
3001B<Parameters:> $genome_seq_ref,  %preferences
3002
3003B<Returns:> string
3004
3005
3006GTF format is described in "Current Protocols in Bioinformatics(2003)" 4.8.1-4.8.19
3007in "Using TWINSCAN to Predict Gene Structures in Genomic DNA Sequences".
3008
3009Each line of the GTF format includes the following tab-delimited fields:
3010
3011[seqname] [source] [feature] [start] [end] [score] [strand] [frame] [attributes]
3012
3013This is further elaborated below:
3014
3015[feature] contains one of the following: start_codon, stop_codon, CDS
3016[attributes] contains 'gene_id' and 'transcript_id' fields.  All features of the same transcript should share the same transcript_id value.  By default, the TU_feat_name and model_feat_name are used as the gene_id and transcript_id, respectively.
3017
3018
3019    Using the %preferences input parameter, the preferred values or gene attributes can be used for seqname, source, gene_id, or transcript_id, each used as a key to the %preferences hash.  Given the value of %preferences is a gene attribute, that attribute value will be used, otherwise, the raw value will be used.
3020
3021For example:  %preferences = ( seqname => 'mySeqname',
3022                               gene_id => 'pub_locus' );
3023
3024Would result in 'mySeqname' used in the [seqname] field, and the $gene_obj->{pub_locus} value
3025
3026Here are the defaults:
3027[seqname] = asmbl_id
3028[source] = annotation
3029gene_id (TU_feat_name)
3030transcript_id (Model_feat_name)
3031
3032** Partial Genes are NOT Supported **  ( undef is returned )
3033** Genes with split start or stop codons are unsupported ** (undef is returned)
3034
3035=back
3036
3037=cut
3038
3039    ;
3040
3041sub to_GTF_format {
3042    my $gene_obj = shift;
3043    my ($genome_seq_ref, %preferences) = @_;
3044
3045    unless (ref $genome_seq_ref) {
3046        confess "Error, need genome seq reference as param";
3047    }
3048
3049    my $is_pseudogene = $gene_obj->is_pseudogene();
3050
3051
3052    my $TU_feat_name = $gene_obj->{TU_feat_name};
3053    my $model_feat_name = $gene_obj->{Model_feat_name};
3054
3055    # rid whitespace in identifiers
3056    $TU_feat_name =~ s/\s+/_/g;
3057    $model_feat_name =~ s/\s+/_/g;
3058
3059    my $seqname = $preferences{seqname} || $gene_obj->{asmbl_id};
3060    my $source = $preferences{source} || $gene_obj->{source} || ".";
3061
3062    my $gene_id;
3063    if (my $token = $preferences{gene_id}) {
3064        $gene_id = $gene_obj->{$token};
3065    } else {
3066        $gene_id = $TU_feat_name;
3067    }
3068
3069    my $transcript_id;
3070    if (my $token = $preferences{model_id}) {
3071        $transcript_id = $gene_obj->{$token};
3072    } else {
3073        $transcript_id = $model_feat_name;
3074    }
3075
3076    my @exons = $gene_obj->get_exons();
3077    my $orientation = $gene_obj->get_orientation();
3078    my @gtf_text;
3079
3080    my $gene_obj_for_gtf = $gene_obj;  #if got stop codon, will need to strip it off.
3081	my $com_name = $gene_obj->{com_name};
3082	$com_name =~ s/\s+$// if $com_name;
3083	$com_name =~ s/[\"\']//g if $com_name;
3084
3085	my $name_txt = ($com_name) ? "Name \"$com_name\";" : "";
3086
3087
3088    ## Gene record
3089    unless ($preferences{'gene_record_already_done'}) {
3090
3091        my ($gene_lend, $gene_rend) = sort {$a<=>$b} $gene_obj->get_gene_span();
3092
3093        push (@gtf_text, [$seqname,
3094                          $source,
3095                          "gene",
3096                          $gene_lend,
3097                          $gene_rend,
3098                          "0",
3099                          $orientation,
3100                          ".",
3101                          "gene_id \"$gene_id\"; $name_txt"]);
3102    }
3103
3104    ## Transcript record
3105    my ($trans_lend, $trans_rend) = sort {$a<=>$b} $gene_obj->get_transcript_span();
3106    push (@gtf_text, [$seqname,
3107                      $source,
3108                      "transcript",
3109                      $trans_lend,
3110                      $trans_rend,
3111                      "0",
3112                      $orientation,
3113                      ".",
3114                      "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]);
3115
3116    unless ($is_pseudogene) {
3117        $gene_obj->set_CDS_phases($genome_seq_ref);
3118
3119
3120        ## check for start and stop codons.
3121        my $cds_seq = uc $gene_obj->create_CDS_sequence($genome_seq_ref);
3122        my @stop_codons = &Nuc_translator::get_stop_codons();
3123
3124        my $first_CDS_segment = $gene_obj->get_first_CDS_segment();
3125        my $first_phase = $first_CDS_segment->get_phase();
3126        my $cds_is_integral_codon_num = (length($cds_seq) % 3 == 0) ? 1 : 0;
3127
3128        ## examine start codon:
3129        my $init_codon = substr($cds_seq, 0, 3);
3130        if ($first_phase == 0 && $init_codon eq 'ATG') { # got start codon.
3131            my @start_coordsets = $gene_obj->get_start_codon_coordinates();
3132            foreach my $start_pair (@start_coordsets) {
3133                my ($start_lend, $start_rend) = sort {$a<=>$b} @$start_pair;
3134                push (@gtf_text, [$seqname,
3135                                  $source,
3136                                  "start_codon",
3137                                  $start_lend,
3138                                  $start_rend,
3139                                  "0",
3140                                  $orientation,
3141                                  "0",
3142                                  "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]);
3143            }
3144        }
3145
3146        my $candidate_stop_codon = uc substr($cds_seq, length($cds_seq) - 3, 3);
3147        my @found_stop = grep { $_ eq $candidate_stop_codon } @stop_codons;
3148
3149        if (@found_stop) {
3150            # got a stop codon.
3151            # check to see that the stop codon is in-frame.
3152            if ((length($cds_seq) - $first_phase) % 3 == 0) { # yes, stop is in frame.
3153
3154                my @stop_codon_coords = $gene_obj->get_stop_codon_coords();
3155                foreach my $stop_pair (@stop_codon_coords) {
3156                    my ($stop_lend, $stop_rend) = sort {$a<=>$b} @$stop_pair;
3157
3158                    push (@gtf_text, [$seqname,
3159                                      $source,
3160                                      "stop_codon",
3161                                      $stop_lend,
3162                                      $stop_rend,
3163                                      "0",
3164                                      $orientation,
3165                                      "0",
3166                                      "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]);
3167                }
3168
3169                $gene_obj_for_gtf = $gene_obj->clone_gene();
3170
3171                $gene_obj_for_gtf->trim_stop_codon();
3172
3173            }
3174        }
3175
3176    }
3177
3178    ## report the exons and CDS regions:
3179    foreach my $exon ($gene_obj_for_gtf->get_exons()) {
3180
3181        my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords();
3182
3183        push (@gtf_text, [$seqname,
3184                          $source,
3185                          "exon",
3186                          $exon_lend,
3187                          $exon_rend,
3188                          "0",
3189                          $orientation,
3190                          ".",
3191                          "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]);
3192
3193
3194
3195        my $cds = ($is_pseudogene) ? $exon : $exon->get_CDS_exon_obj();
3196
3197        if ($cds) {
3198            my $phase = ".";
3199            unless ($is_pseudogene) {
3200                $phase = $cds->get_phase();
3201                if ($phase) {
3202                    $phase = ($phase == 1) ? 2 : 1; # reverse it according to GFF3 vs. GTF representation.
3203                }
3204            }
3205
3206            my ($cds_lend, $cds_rend) = sort {$a<=>$b} $cds->get_coords();
3207
3208            push (@gtf_text, [$seqname,
3209                              $source,
3210                              "CDS",
3211                              $cds_lend,
3212                              $cds_rend,
3213                              "0",
3214                              $orientation,
3215                              "$phase",
3216                              "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"]);
3217        }
3218
3219
3220    }
3221
3222    unless ($is_pseudogene) {
3223
3224        ## Get UTR info:
3225        {
3226            for my $pair ($gene_obj->get_3prime_UTR_coords) {
3227                my ($lend,$rend) = sort {$a<=>$b} @$pair;
3228                push (@gtf_text,   [$seqname,
3229                                    $source,
3230                                    "3UTR",
3231                                    $lend,
3232                                    $rend,
3233                                    "0",
3234                                    $orientation,
3235                                    "0",
3236                                    "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt"] );
3237            }
3238            for my $pair ($gene_obj->get_5prime_UTR_coords) {
3239                my ($lend,$rend) = sort {$a<=>$b} @$pair;
3240                push (@gtf_text,   [$seqname,
3241                                    $source,
3242                                    "5UTR",
3243                                    $lend,
3244                                    $rend,
3245                                    "0",
3246                                    $orientation,
3247                                    "0",
3248                                    "gene_id \"$gene_id\"; transcript_id \"$transcript_id\"; $name_txt" ] );
3249            }
3250        }
3251
3252    }
3253
3254    @gtf_text = sort {$a->[3] <=> $b->[3]} @gtf_text;
3255
3256    if ($orientation eq '-') {
3257        @gtf_text = reverse @gtf_text;
3258    }
3259
3260    my $GTF = "";
3261    foreach my $gtf_row (@gtf_text) {
3262        $GTF .= join ("\t", @$gtf_row) . "\n";
3263    }
3264
3265    foreach my $isoform ($gene_obj->get_additional_isoforms()) {
3266        my %iso_pref = %preferences;
3267        $iso_pref{'gene_record_already_done'} = 1;
3268        $GTF .= "\n" . $isoform->to_GTF_format($genome_seq_ref, %iso_pref);
3269    }
3270
3271    return ($GTF);
3272}
3273
3274
3275
3276
3277
3278####
3279sub get_start_codon_coordinates {
3280    my $gene_obj = shift;
3281
3282    my $orient = $gene_obj->get_orientation();
3283
3284    ## just want the coordinate pairs that define the first three CDS bases.
3285
3286    my @cds_coords;
3287    foreach my $exon ($gene_obj->get_exons()) {
3288        if (my $cds = $exon->get_CDS_exon_obj()) {
3289            my ($cds_end5, $cds_end3) = $cds->get_coords();
3290            push (@cds_coords, [$cds_end5, $cds_end3]);
3291        }
3292    }
3293
3294    my @start_coords;
3295    my $start_len_want = 3;
3296    foreach my $cds_coordpair (@cds_coords) {
3297        my ($cds_end5, $cds_end3) = @$cds_coordpair;
3298        my $cds_seg_len = abs ($cds_end3 - $cds_end5) + 1;
3299
3300        my $extract_len = ($cds_seg_len < $start_len_want) ? $cds_seg_len : $start_len_want;
3301        if ($orient eq '+') {
3302            push (@start_coords, [$cds_end5, $cds_end5 + $extract_len - 1]);
3303        }
3304        else {
3305            push (@start_coords, [$cds_end5, $cds_end5 - $extract_len + 1]);
3306        }
3307        $start_len_want -= $extract_len;
3308
3309        if ($start_len_want <= 0) { last; }
3310    }
3311
3312    if ($start_len_want > 0) {
3313        confess "Error, trouble extracting start codon coordinates from cds coordsets: " . Dumper (\@cds_coords);
3314    }
3315
3316    return (@start_coords);
3317}
3318
3319
3320
3321
3322####
3323sub get_stop_codon_coords {
3324    my $gene_obj = shift;
3325
3326    my $orient = $gene_obj->get_orientation();
3327
3328    ## just want the coordinate pairs that define the last three CDS bases.
3329
3330    my @cds_coords;
3331    foreach my $exon (reverse $gene_obj->get_exons()) {
3332        if (my $cds = $exon->get_CDS_exon_obj()) {
3333            my ($cds_end5, $cds_end3) = $cds->get_coords();
3334            push (@cds_coords, [$cds_end5, $cds_end3]);
3335        }
3336    }
3337
3338    my @stop_coords;
3339    my $stop_len_want = 3;
3340    foreach my $cds_coordpair (@cds_coords) {
3341        my ($cds_end5, $cds_end3) = @$cds_coordpair;
3342        my $cds_seg_len = abs ($cds_end3 - $cds_end5) + 1;
3343
3344        my $extract_len = ($cds_seg_len < $stop_len_want) ? $cds_seg_len : $stop_len_want;
3345        if ($orient eq '+') {
3346            push (@stop_coords, [$cds_end3 - $extract_len + 1, $cds_end3]);
3347        }
3348        else {
3349            push (@stop_coords, [$cds_end3, $cds_end3 + $extract_len - 1]);
3350        }
3351        $stop_len_want -= $extract_len;
3352
3353        if ($stop_len_want <= 0) { last; }
3354    }
3355
3356    if ($stop_len_want > 0) {
3357        confess "Error, trouble extracting stop codon coordinates from cds coordsets: " . Dumper (\@cds_coords);
3358    }
3359
3360
3361    return (@stop_coords);
3362
3363}
3364
3365
3366####
3367sub trim_stop_codon {
3368    my $gene_obj = shift;
3369
3370    ## just trimming the last three bases from the CDS's, changing the current gene object.
3371
3372    my @exons = reverse $gene_obj->get_exons();
3373
3374    my $orient = $gene_obj->get_orientation();
3375
3376    my $stop_len_want = 3;
3377    foreach my $exon (@exons) {
3378        if (my $cds = $exon->get_CDS_exon_obj()) {
3379
3380            my ($cds_end5, $cds_end3) = $cds->get_coords();
3381            my $cds_seg_len = abs ($cds_end3 - $cds_end5) + 1;
3382
3383            my $extract_len = ($cds_seg_len < $stop_len_want) ? $cds_seg_len : $stop_len_want;
3384
3385            if ($cds_seg_len == $extract_len) {
3386                # delete it!
3387                $exon->delete_CDS_exon_obj();
3388            }
3389            else {
3390                ## truncate it by extract_len
3391                if ($orient eq '+') {
3392                    $cds->{end3} -= $extract_len;
3393                }
3394
3395                else {
3396                    $cds->{end3} += $extract_len;
3397                }
3398            }
3399            $stop_len_want -= $extract_len;
3400
3401            if ($stop_len_want <= 0) { last; }
3402        }
3403    }
3404    if ($stop_len_want > 0) {
3405        confess "Error, trouble extracting all stop codon coordinates from cds coordsets. " . $gene_obj->toString();
3406    }
3407
3408    return;
3409
3410}
3411
3412
3413
3414=over 4
3415
3416=item to_GFF3_format()
3417
3418B<Description:> Outputs text corresponding to the representation of the gene in GFF3 format (still under development).
3419
3420B<Parameters:>
3421
3422B<Returns:> string
3423
3424GFF3 defined at:
3425http://song.sourceforge.net/gff3-jan04.shtml
3426
3427(some text lifted from above site provided below for reference purposes)
3428
3429The format consists of 9 columns, separated by tabs or spaces.  The
3430following unescaped characters are allowed within fields:
3431[a-zA-Z0-9.:^*$@!+_?-].  All other characters must must be escaped
3432using the URL escaping conventions.  Unescaped quotation marks,
3433backslashes and other ad-hoc escaping conventions that have been added
3434to the GFF format are explicitly forbidden.  The =, ; and % characters
3435have reserved meanings as described below, and must be escaped when
3436used in other contexts.
3437
3438Undefined fields are replaced with the "." character, as described in
3439the original GFF spec.
3440
3441Column 1: "seqid"
3442
3443The ID of the landmark used to establish the coordinate system for the
3444current feature.  IDs must contain alphanumeric characters.
3445Whitespace, if present, must be escaped using URL escaping rule
3446(e.g. space="%20" or "+").  Sequences must *NOT* begin with an
3447unescaped ">".
3448
3449Column 2: "source"
3450
3451The source of the feature.  This is unchanged from the older GFF specs
3452and is not part of a controlled vocabulary.
3453
3454Column 3: "type"
3455
3456The type of the feature (previously called the "method").  This is
3457constrained to be either: (a) a term from the "lite" sequence
3458ontology, SOFA; or (b) a SOFA accession number.  The latter
3459alternative is distinguished using the syntax SO:000000.
3460
3461Columns 4 & 5: "start" and "end"
3462
3463The start and end of the feature, in 1-based integer coordinates,
3464relative to the landmark given in column 1.  Start is always less than
3465or equal to end.
3466
3467For zero-length features, such as insertion sites, start equals end
3468and the implied site is to the right of the indicated base.  This
3469convention holds regardless of the strandedness of the feature.
3470
3471Column 6: "score"
3472
3473The score of the feature, a floating point number.  As in earlier
3474versions of the format, the semantics of the score are ill-defined.
3475It is strongly recommended that E-values be used for sequence
3476similarity features, and that P-values be used for ab initio gene
3477prediction features.
3478
3479Column 7: "strand"
3480
3481The strand of the feature.  + for positive strand (relative to the
3482landmark), - for minus strand, and . for features that are not
3483stranded.  In addition, ? can be used for features whose strandedness
3484is relevant, but unknown.
3485
3486Column 8: "phase"
3487
3488For features of type "exon", the phase indicates where the feature
3489begins with reference to the reading frame.  The phase is one of the
3490integers 0, 1,or 2, indicating that the first base of the feature
3491corresponds to the first, second or last base of the codon,
3492respectively.  This is NOT to be confused with the frame, but relates
3493to the relative position of the translational start in whatever strand
3494the feature is in.
3495
3496Column 9: "attributes"
3497
3498A list of feature attributes in the format tag=value.  Multiple
3499tag=value pairs are separated by semicolons.  URL escaping rules are
3500used for tags or values containing the following characters: ",=;".
3501Whitespace should be replaced with the "+" character or the %20 URL
3502escape.  This will allow the file to survive text processing programs
3503that convert tabs into spaces.
3504
3505These tags have predefined meanings:
3506
3507    ID	   Indicates the name of the feature.  IDs must be unique
3508	   within the scope of the GFF file.
3509
3510    Name   Display name for the feature.  This is the name to be
3511           displayed to the user.  Unlike IDs, there is no requirement
3512	   that the Name be unique within the file.
3513
3514    Alias  A secondary name for the feature.  It is suggested that
3515	   this tag be used whenever a secondary identifier for the
3516	   feature is needed, such as locus names and
3517	   accession numbers.  Unlike ID, there is no requirement
3518	   that Alias be unique within the file.
3519
3520    Parent Indicates the parent of the feature.  A parent ID can be
3521	   used to group exons into transcripts, transcripts into
3522	   genes, an so forth.  A feature may have multiple parents.
3523
3524    Target Indicates the target of a nucleotide-to-nucleotide or
3525	   protein-to-nucleotide alignment.  The format of the
3526	   value is "target_id+start+end".
3527
3528    Gap    The alignment of the feature to the target if the two are
3529          not colinear (e.g. contain gaps).  The alignment format is
3530	  taken from the CIGAR format described in the
3531	  Exonerate documentation.
3532	  (http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/exonerate
3533           ?cvsroot=Ensembl).  See "THE GAP ATTRIBUTE" for a description
3534	   of this format.
3535
3536    Note   A free text note.
3537
3538    Dbxref A database cross reference.  See the section
3539	   "Ontology Associations and Db Cross References" for
3540	   details on the format.
3541
3542    Ontology_term  A cross reference to an ontology term.  See
3543           the section "Ontology Associations and Db Cross References"
3544	   for details.
3545
3546Multiple attributes of the same type are indicated by separating the
3547values with the comma "," character, as in:
3548
3549       Parent=AF2312,AB2812,abc-3
3550
3551Note that attribute names are case sensitive.  "Parent" is not the
3552same as "parent".
3553
3554All attributes that begin with an uppercase letter are reserved for
3555later use.  Attributes that begin with a lowercase letter can be used
3556freely by applications.
3557
3558
3559
3560=back
3561
3562=cut
3563
3564    ;
3565
3566
3567
3568sub to_GFF3_format {
3569    my ($gene_obj, %preferences) = @_;
3570
3571    my $gene_id = $gene_obj->{TU_feat_name};
3572
3573    my $strand = $gene_obj->get_orientation();
3574
3575    my @noteText;
3576
3577    if ($gene_obj->{is_pseudogene}) {
3578        push (@noteText, "(pseudogene)");
3579    }
3580
3581    ## parse preferences
3582    my $asmbl_id = $preferences{seqid} || $gene_obj->{asmbl_id};
3583    my $source = $preferences{source} || $gene_obj->{source} || ".";
3584
3585    unless (defined $asmbl_id) {
3586        confess "Error, no asmbl_id from gene_obj\n";
3587    }
3588
3589
3590    my ($gene_lend, $gene_rend) = sort {$a<=>$b} $gene_obj->get_gene_span();
3591    my $com_name = $gene_obj->{com_name};
3592	unless ($com_name =~ /\w/) {
3593		$com_name = "";
3594	}
3595
3596	if ($com_name) {
3597		# uri escape it:
3598		$com_name = uri_escape($com_name);
3599	}
3600
3601	my $gene_alias = "";
3602	if (my $pub_locus = $gene_obj->{pub_locus}) {
3603		$gene_alias = "Alias=$pub_locus;";
3604	}
3605
3606    my $feat_type = ($gene_obj->{gene_type} eq "protein-coding") ? "gene" : $gene_obj->{gene_type};
3607
3608
3609    my $gff3_text = "$asmbl_id\t$source\t$feat_type\t$gene_lend\t$gene_rend\t.\t$strand\t.\tID=$gene_id;Name=$com_name;$gene_alias\n";  ## note, non-coding gene features are currently represented by a simple single coordinate pair.
3610
3611    if ($gene_obj->{gene_type} eq "protein-coding")  {
3612
3613        my $gene_obj_ref = $gene_obj;
3614
3615        foreach my $gene_obj ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms() ) {
3616
3617            my $model_id = $gene_obj->{Model_feat_name};
3618            my $model_alias = "";
3619            if (my $model_locus = $gene_obj->{Model_pub_locus}) {
3620				$model_alias = "Alias=$model_locus;";
3621			}
3622
3623			my ($mrna_lend, $mrna_rend) = $gene_obj->get_transcript_span();
3624
3625            $gff3_text .= "$asmbl_id\t$source\tmRNA\t$mrna_lend\t$mrna_rend\t.\t$strand\t.\tID=$model_id;Parent=$gene_id;Name=$com_name;$model_alias\n";
3626
3627            ## mark the first and last CDS entries (for now, an unpleasant hack!)
3628            my @exons = $gene_obj->get_exons();
3629            ## find the first cds
3630            foreach my $exon (@exons) {
3631                if (my $cds = $exon->get_CDS_obj()) {
3632                    $cds->{first_cds} = 1;
3633                    last;
3634                }
3635            }
3636            @exons = reverse @exons;
3637            foreach my $exon (@exons) {
3638                if (my $cds = $exon->get_CDS_obj()) {
3639                    $cds->{last_cds} = 1;
3640                    last;
3641                }
3642            }
3643
3644            my $prime5_partial = $gene_obj->is_5prime_partial();
3645            my $prime3_partial = $gene_obj->is_3prime_partial();
3646
3647
3648            ## annotate 5' utr
3649            if ($gene_obj->has_CDS() && $gene_obj->has_5prime_UTR()) {
3650                my @prime5_utr = $gene_obj->get_5prime_UTR_coords();
3651                if (@prime5_utr) {
3652                    my $utr_count = 0;
3653                    foreach my $coordset (@prime5_utr) {
3654                        my ($lend, $rend) = sort {$a<=>$b} @$coordset;
3655                        $utr_count++;
3656                        my $utr_id = "$model_id.utr5p$utr_count";
3657                        $gff3_text .= "$asmbl_id\t$source\tfive_prime_UTR\t$lend\t$rend\t.\t$strand\t.\tID=$utr_id;Parent=$model_id\n";
3658                    }
3659                }
3660            }
3661
3662
3663			my $exon_counter = 0;
3664            foreach my $exon ($gene_obj->get_exons()) {
3665                $exon_counter++;
3666				my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords();
3667                my $exon_ID_string = "";
3668                if (my $exon_feat_name = $exon->{feat_name}) {
3669                    $exon_ID_string = "$exon_feat_name";
3670                }
3671				else {
3672					$exon_ID_string = "$model_id.exon$exon_counter";
3673				}
3674                $gff3_text .= "$asmbl_id\t$source\texon\t$exon_lend\t$exon_rend\t.\t$strand\t.\tID=${exon_ID_string};Parent=$model_id\n";
3675
3676                if (my $cds_obj = $exon->get_CDS_obj()) {
3677                    my ($cds_lend, $cds_rend) = sort {$a<=>$b} $cds_obj->get_coords();
3678                    my $phase = $cds_obj->{phase};
3679					if (defined($phase)) {
3680						## use GFF3 definition of phase, which is how many bases to trim before encountering first base of start
3681						if ($phase == 2) {
3682							$phase = 1;
3683						}
3684						elsif ($phase == 1) {
3685							$phase = 2;
3686						}
3687						# phase 0 remains 0
3688					}
3689					else {
3690						$phase =  "."; #use phase info if avail
3691                    }
3692
3693
3694					my $cds_ID_string = "cds.$model_id";
3695
3696					# according to the GFF3 spec, CDS segments from the same coding region should have the same identifier.
3697					#if (my $cds_feat_name = $cds_obj->{feat_name}) {
3698					#	$cds_ID_string = "$cds_feat_name";
3699					#}
3700					#else {
3701					#	$cds_ID_string = "$model_id.cds$exon_counter";
3702					#}
3703
3704                    my $partial_text = "";
3705                    if ($prime5_partial && $cds_obj->{first_cds}) {
3706                        $partial_text .= ";5_prime_partial=true";
3707                    }
3708                    if ($prime3_partial && $cds_obj->{last_cds}) {
3709                        $partial_text .= ";3_prime_partial=true";
3710                    }
3711
3712                    $gff3_text .= "$asmbl_id\t$source\tCDS\t$cds_lend\t$cds_rend\t.\t$strand\t$phase\tID=${cds_ID_string};Parent=$model_id$partial_text\n";
3713                }
3714            }
3715
3716            ## annotate 3' utr
3717            if ($gene_obj->has_CDS() && $gene_obj->has_3prime_UTR()) {
3718                my @prime3_utr = $gene_obj->get_3prime_UTR_coords();
3719                if (@prime3_utr) {
3720                    my $utr_count = 0;
3721                    foreach my $coordset (@prime3_utr) {
3722                        my ($lend, $rend) = sort {$a<=>$b} @$coordset;
3723                        $utr_count++;
3724                        my $utr_id = "$model_id.utr3p$utr_count";
3725                        $gff3_text .= "$asmbl_id\t$source\tthree_prime_UTR\t$lend\t$rend\t.\t$strand\t.\tID=$utr_id;Parent=$model_id\n";
3726                    }
3727                }
3728
3729            }
3730        }
3731
3732    }  ## end of protein-coding genes
3733
3734
3735	## strip off any trailing whitespace and semicolons:
3736	my @lines = split (/\n/, $gff3_text);
3737	foreach my $line (@lines) {
3738		$line =~ s/\s+$//;
3739		$line =~ s/;$//;
3740	}
3741
3742	$gff3_text = join ("\n", @lines) . "\n";
3743
3744    return ($gff3_text);
3745
3746}
3747
3748
3749
3750=over 4
3751
3752=item to_BED_format()
3753
3754B<Description:> describes gene in BED format
3755B<Parameters:> (uri_encode => 1|0)
3756B<Returns:> string
3757
3758
3759BED format described here:
3760http://genome.ucsc.edu/FAQ/FAQformat.html#format1
3761
3762	BED format
3763
3764
3765
3766
3767BED format provides a flexible way to define the data lines that are displayed in an annotation track. BED lines have three required fields and nine additional optional fields. The number of fields per line must be consistent throughout any single set of data in an annotation track. The order of the optional fields is binding: lower-numbered fields must always be populated if higher-numbered fields are used.
3768
3769The first three required BED fields are:
3770
37711. chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or scaffold (e.g. scaffold10671).
3772
37732. chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
3774
37753. chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
3776
3777The 9 additional optional BED fields are:
3778
37794. name - Defines the name of the BED line. This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode.
3780
37815. score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). This table shows the Genome Browsers translation of BED score values into shades of gray
3782
37836. strand - Defines the strand - either '+' or '-'.
3784
37857. thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
3786
37878. thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays).
3788
37899. itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser.
3790
379110. blockCount - The number of blocks (exons) in the BED line.
3792
379311. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
3794
379512. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
3796
3797Example:
3798Heres an example of an annotation track that uses a complete BED definition:
3799track name=pairedReads description="Clone Paired Reads" useScore=1
3800chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
3801chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
3802
3803
3804
3805
3806=cut
3807
3808
3809sub to_BED_format {
3810	my $self = shift;
3811    my %params = @_;
3812
3813	my $strand = $self->get_strand();
3814
3815	my ($coding_lend, $coding_rend) = sort {$a<=>$b} $self->get_CDS_span();
3816
3817	my $scaffold = $self->{asmbl_id};
3818
3819	my $gene_id = $self->{TU_feat_name};
3820	my $trans_id = $self->{Model_feat_name};
3821
3822	my $com_name = $self->{com_name} || "";
3823
3824    my $score = $params{score} || 0;
3825
3826	if (my $alias = $self->{pub_locus}) {
3827		$com_name = "Alias=$alias;$com_name";
3828	}
3829
3830
3831    if ($gene_id) {
3832        $com_name = "$gene_id;$com_name";
3833    }
3834
3835	if ($trans_id) {
3836		$com_name = "ID=$trans_id;$com_name";
3837	}
3838	else {
3839        $com_name = "ID=$com_name";
3840    }
3841
3842    if ($params{uri_encode}) {
3843        $com_name = uri_escape($com_name);
3844    }
3845
3846
3847	my @exons = sort {$a->{end5}<=>$b->{end5}} $self->get_exons();
3848
3849    my @exon_coords;
3850	foreach my $exon (@exons) {
3851
3852		my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords();
3853        push (@exon_coords, [$exon_lend, $exon_rend]);
3854    }
3855
3856
3857	my @starts;
3858	my @lengths;
3859
3860    my $gene_lend = $exon_coords[0]->[0];
3861    my $gene_rend = $exon_coords[$#exon_coords]->[1];
3862
3863    foreach my $exon_coordset (@exon_coords) {
3864        my ($exon_lend, $exon_rend) = @$exon_coordset;
3865
3866		my $start = $exon_lend - $gene_lend;
3867		push (@starts, $start);
3868
3869		my $length = $exon_rend - $exon_lend + 1;
3870		push (@lengths, $length);
3871	}
3872
3873
3874	## construct bed output.
3875
3876	$com_name =~ s/ /_/g;
3877
3878	my $bed_line = join("\t", $scaffold,
3879						$gene_lend-1, $gene_rend,
3880						$com_name,
3881						$score,
3882						$strand,
3883						$coding_lend-1, $coding_rend,
3884						"0", # rgb info - use '.' to allow user customization in IGV.   Need 0 for compatibility with UCSC browser.
3885						scalar(@lengths),
3886						join(",", @lengths),
3887						join(",", @starts)
3888						) . "\n";
3889
3890    foreach my $isoform ($self->get_additional_isoforms()) {
3891        $bed_line .= $isoform->to_BED_format(%params);
3892    }
3893
3894	return($bed_line);
3895}
3896
3897
3898
3899# static method, returns gene object.
3900sub BED_line_to_gene_obj {
3901	my ($bed_line) = @_;
3902
3903	if (ref $bed_line) {
3904		confess "Error, static method, just provide bed text line, returns gene_obj";
3905	}
3906
3907
3908	my @x = split(/\t/, $bed_line);
3909
3910	my $scaff = $x[0];
3911	my $gene_lend = $x[1] + 1;
3912	my $gene_rend = $x[2];
3913
3914	my $com_name = $x[3];
3915
3916	my $score = $x[4];
3917	my $orient = $x[5];
3918
3919	if ($orient eq '*') {
3920		$orient = '+';
3921	}
3922
3923
3924	my $coding_lend = $x[6] + 1;
3925	my $coding_rend = $x[7];
3926
3927	my $rgb_color = $x[8];
3928
3929	my $num_exons = $x[9];
3930
3931	my $lengths_text = $x[10];
3932	my $exon_relative_starts_text = $x[11];
3933
3934	my @lengths = split(/,/, $lengths_text);
3935	my @exon_relative_starts = split(/,/, $exon_relative_starts_text);
3936
3937	my @exons;
3938
3939	while (@lengths) {
3940		my $len = shift @lengths;
3941		my $start = shift @exon_relative_starts;
3942
3943		my $exon_lend = $gene_lend + $start;
3944		my $exon_rend = $exon_lend + $len - 1;
3945
3946
3947		print "Len: $len, start=$start   ====>  $exon_lend - $exon_rend\n" if $DEBUG;
3948
3949		push (@exons, [$exon_lend, $exon_rend]);
3950
3951	}
3952
3953
3954	print "Coding: $coding_lend-$coding_rend, Exons: " . Dumper (\@exons) if $DEBUG;
3955
3956	my $gene_obj = new Gene_obj();
3957	$gene_obj->build_gene_obj_exons_n_cds_range(\@exons, $coding_lend, $coding_rend, $orient);
3958
3959	$gene_obj->{com_name} = $com_name;
3960	$gene_obj->{asmbl_id} = $scaff;
3961
3962	$com_name =~ s/\s+/\|/g; # reformat as an identifier with no whitespace
3963
3964	$gene_obj->{TU_feat_name} = "$com_name";
3965	$gene_obj->{Model_feat_name} = "m.$com_name";
3966
3967	return($gene_obj);
3968
3969
3970}
3971
3972
3973
3974
3975
3976
3977## Private, remove leading and trailing whitespace characters:
3978sub trim_leading_trailing_ws {
3979    my ($ref) = @_;
3980    if (ref $ref eq "SCALAR") {
3981        $$ref =~ s/^\s+|\s+$//g;
3982    } elsif (ref $ref eq "ARRAY") {
3983        foreach my $element (@$ref) {
3984            $element =~ s/^\s+|\s+$//g;
3985        }
3986    } else {
3987        my $type = ref $ref;
3988        die "Currently don't support trim_leading_trailing_ws(ref type: $type)\n";
3989    }
3990}
3991
3992
3993
3994=over 4
3995
3996=item to_GTF2_format()
3997
3998B<Description:> provides gene in GTF2 format
3999
4000B<Parameters:> genome_seq_ref, [properties_href]
4001
4002B<Returns:> text
4003
4004
4005properties_href encodes preferences like so
4006
4007    properties_href = {
4008                          seqname => tigr_asmbl_id_1000, # by default, asmbl_id is used as encoded in gene_obj
4009
4010                          source => MyGenePrediction,   # by default, set to "TIGR"
4011
4012                          include_comments => 0,   # turned on by default, indicating partial or pseudogenes with preceding comment lines
4013
4014                      }
4015
4016
4017
4018The GTF2 format is described here:
4019http://genes.cs.wustl.edu/GTF2.html
4020
4021as follows:
4022
4023GTF2 format (Revised Ensembl GTF)
4024Gene transfer format. This borrows from GFF, but has additional structure that warrants a separate definition and format name.
4025NEW! Validating Parser for GTF
4026
4027Structure is as GFF, so the fields are:
4028<seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
4029
4030Here is a simple example with 3 translated exons. Order of rows is not important.
4031
4032AB000381 Twinscan  CDS          380   401   .   +   0  gene_id "001"; transcript_id "001.1";
4033AB000381 Twinscan  CDS          501   650   .   +   2  gene_id "001"; transcript_id "001.1";
4034AB000381 Twinscan  CDS          700   707   .   +   2  gene_id "001"; transcript_id "001.1";
4035AB000381 Twinscan  start_codon  380   382   .   +   0  gene_id "001"; transcript_id "001.1";
4036AB000381 Twinscan  stop_codon   708   710   .   +   0  gene_id "001"; transcript_id "001.1";
4037
4038The whitespace in this example is provided only for readability. In GTF, fields must be separated by a single TAB and no white space.
4039
4040<seqname>
4041The FPC contig ID from the Golden Path.
4042
4043<source>
4044The source column should be a unique label indicating where the annotations came from --- typically the name of either a prediction program or a public database.
4045
4046<feature>
4047The following feature types are required: "CDS", "start_codon", "stop_codon". The feature "exon" is optional, since this project will not evaluate predicted splice sites outside of protein coding regions. All other features will be ignored.
4048
4049CDS represents the coding sequence starting with the first translated codon and proceeding to the last translated codon. Unlike Genbank annotation, the stop codon is not included in the CDS for the terminal exon.
4050
4051<start> <end>
4052Integer start and end coordinates of the feature relative to the beginning of the sequence named in <seqname>.  <start> must be less than or equal to <end>. Sequence numbering starts at 1. Values of <start> and <end> that extend outside the reference sequence are technically acceptable, but they are discouraged for purposes of this project.
4053
4054<score>
4055The score field will not be used for this project, so you can either provide a meaningful float or replace it by a dot.
4056
4057<frame>
40580 indicates that the first whole codon of the reading frame is located at 5'-most base. 1 means that there is one extra base before the first codon and 2 means that there are two extra bases before the first codon. Note that the frame is not the length of the CDS mod 3.
4059
4060Here are the details excised from the GFF spec. Important: Note comment on reverse strand.
4061
4062    '0' indicates that the specified region is in frame, i.e. that its first base corresponds to the first base of a codon. '1' indicates that there is one extra base, i.e. that the second base of the region corresponds to the first base of a codon, and '2' means that the third base of the region is the first base of a codon. If the strand is '-', then the first base of the region is value of <end>, because the corresponding coding region will run from <end> to <start> on the reverse strand.
4063
4064[attributes]
4065All four features have the same two mandatory attributes at the end of the record:
4066
4067    * gene_id value;     A globally unique identifier for the genomic source of the transcript
4068    * transcript_id value;     A globally unique identifier for the predicted transcript.
4069
4070These attributes are designed for handling multiple transcripts from the same genomic region. Any other attributes or comments must appear after these two and will be ignored.
4071
4072Attributes must end in a semicolon which must then be separated from the start of any subsequent attribute by exactly one space character (NOT a tab character).
4073
4074Textual attributes should be surrounded by doublequotes.
4075
4076Here is an example of a gene on the negative strand. Larger coordinates are 5' of smaller coordinates. Thus, the start codon is 3 bp with largest coordinates among all those bp that fall within the CDS regions. Similarly, the stop codon is the 3 bp with coordinates just less than the smallest coordinates within the CDS regions.
4077
4078AB000123    Twinscan     CDS    193817    194022    .    -    2    gene_id "AB000123.1"; transcript_id "AB00123.1.2";
4079AB000123    Twinscan     CDS    199645    199752    .    -    2    gene_id "AB000123.1"; transcript_id "AB00123.1.2";
4080AB000123    Twinscan     CDS    200369    200508    .    -    1    gene_id "AB000123.1"; transcript_id "AB00123.1.2";
4081AB000123    Twinscan     CDS    215991    216028    .    -    0    gene_id "AB000123.1"; transcript_id "AB00123.1.2";
4082AB000123    Twinscan     start_codon   216026    216028    .    -    .    gene_id    "AB000123.1"; transcript_id "AB00123.1.2";
4083AB000123    Twinscan     stop_codon    193814    193816    .    -    .    gene_id    "AB000123.1"; transcript_id "AB00123.1.2";
4084
4085Note the frames of the coding exons. For example:
4086
4087   1. The first CDS (from 216028 to 215991) always has frame zero.
4088   2. Frame of the 1st CDS =0, length =38.  (frame - length) % 3  = 1, the frame of the 2nd CDS.
4089   3. Frame of the 2nd CDS=1, length=140. (frame - length) % 3  = 2, the frame of the 3rd CDS.
4090   4. Frame of the 3rd CDS=2, length=108. (frame - length) % 3  =  2, the frame of the terminal CDS.
4091   5. Alternatively, the frame of terminal CDS can be calculated without the rest of the gene. Length of the terminal CDS=206. length % 3 =2, the frame of the terminal CDS.
4092
4093Here is an example in which the "exon" feature is used. It is a 5 exon gene with 3 translated exons.
4094
4095AB000381 Twinscan  exon         150   200   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4096AB000381 Twinscan  exon         300   401   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4097AB000381 Twinscan  CDS          380   401   .   +   0  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4098AB000381 Twinscan  exon         501   650   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4099AB000381 Twinscan  CDS          501   650   .   +   2  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4100AB000381 Twinscan  exon         700   800   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4101AB000381 Twinscan  CDS          700   707   .   +   2  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4102AB000381 Twinscan  exon         900  1000   .   +   .  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4103AB000381 Twinscan  start_codon  380   382   .   +   0  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4104AB000381 Twinscan  stop_codon   708   710   .   +   0  gene_id "AB000381.000"; transcript_id "AB000381.000.1";
4105
4106
4107
4108
4109
4110=back
4111
4112=cut
4113
4114
4115
4116sub to_GTF2_format () {
4117    my $self = shift;
4118    my $genomic_seq_ref = shift;
4119
4120    my $properties_href = shift;
4121    unless ($properties_href) {
4122        $properties_href = {};
4123    }
4124
4125
4126    ## need to adjust my frame definition so it's consistent with requirements above in spec.
4127    my $frame_convert = sub {
4128        my $phase = shift;
4129
4130        my %frame = ( 0 => 0,
4131                      1 => 2,
4132                      2 => 1 );
4133        return ($frame{$phase});
4134    };
4135
4136
4137    my $gtf2_text = "";
4138
4139    my $gene_obj = $self;
4140
4141    my $asmbl_id = $properties_href->{seqname} || $gene_obj->{asmbl_id} || die "Error, no asmbl_id as gene_obj att";
4142
4143    my $source = $properties_href->{source} || "TIGR";
4144
4145    my $gene_id = $gene_obj->{TU_feat_name};
4146    my $model_id = $gene_obj->{Model_feat_name};
4147    my $strand = $gene_obj->get_orientation();
4148
4149
4150    my $comment_line = "";
4151    if ($gene_obj->is_pseudogene()) {
4152        $comment_line .= "$model_id=pseudogene ";
4153    }
4154
4155    if ( $gene_obj->{gene_type} eq "protein-coding") {
4156
4157        if (! $gene_obj->is_pseudogene()) {
4158
4159            $gene_obj->set_CDS_phases($genomic_seq_ref);
4160            # also resets the 5' and 3' partiality attributes based on the longest orf.
4161
4162
4163            if ($gene_obj->is_5prime_partial()) {
4164                $comment_line .= "$model_id=5'partial ";
4165            }
4166            else {
4167                $gene_obj->validate_start_codon();
4168            }
4169
4170            if ($gene_obj->is_3prime_partial() ) {
4171                $comment_line .= "$model_id=3'partial ";
4172            }
4173            else {
4174                $gene_obj->validate_stop_codon();
4175            }
4176        }
4177
4178        my @stop_codon_objs;
4179        my @start_codons;
4180
4181        if (! $gene_obj->is_pseudogene()) {
4182
4183            if (! $gene_obj->is_3prime_partial())  {
4184                @stop_codon_objs = $gene_obj->_remove_stop_codons();
4185
4186                unless (@stop_codon_objs) {
4187                    confess $gene_obj->toString() . "Error, no stop codon objs retrieved for non 3' partial gene";
4188                }
4189            }
4190            if (! $gene_obj->is_5prime_partial()) {
4191                @start_codons = $self->_extract_start_codons();
4192
4193                unless (@start_codons) {
4194                    confess $gene_obj->toString() . "Error, no start codon extracted for non 5'partial gene.";
4195                }
4196            }
4197
4198        }
4199
4200        foreach my $start_codon (@start_codons) {
4201            my ($start_lend, $start_rend) = sort {$a<=>$b} $start_codon->get_coords();
4202            my $phase = &$frame_convert($start_codon->{phase});
4203            $gtf2_text .= "$asmbl_id\t$source\tstart_codon\t$start_lend\t$start_rend\t.\t$strand\t$phase\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n";
4204        }
4205
4206
4207        foreach my $exon ($gene_obj->get_exons()) {
4208            my ($exon_lend, $exon_rend) = sort {$a<=>$b} $exon->get_coords();
4209            $gtf2_text .= "$asmbl_id\t$source\texon\t$exon_lend\t$exon_rend\t.\t$strand\t.\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n";
4210
4211            if ($gene_obj->is_pseudogene()) { next; } # don't bother trying to report nonsensical CDSs.
4212
4213            if (my $cds_obj = $exon->get_CDS_obj()) {
4214                my ($cds_lend, $cds_rend) = sort {$a<=>$b} $cds_obj->get_coords();
4215                my $phase = $cds_obj->{phase};
4216                unless (defined($phase)) {
4217                    die "Error, no phase defined for cds($cds_lend-$cds_rend) of gene" . $gene_obj->toString();
4218                }
4219                $phase = &$frame_convert($phase);
4220
4221                $gtf2_text .= "$asmbl_id\t$source\tCDS\t$cds_lend\t$cds_rend\t.\t$strand\t$phase\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n";
4222            }
4223        }
4224
4225        foreach my $stop_codon (@stop_codon_objs) {
4226            my ($stop_lend, $stop_rend) = sort {$a<=>$b} $stop_codon->get_coords();
4227            my $phase = &$frame_convert($stop_codon->{phase});
4228            $gtf2_text .= "$asmbl_id\t$source\tstop_codon\t$stop_lend\t$stop_rend\t.\t$strand\t$phase\tgene_id \"$gene_id\"; transcript_id \"$model_id\";\n";
4229        }
4230
4231        foreach my $isoform ($gene_obj->get_additional_isoforms() ) {
4232            $gtf2_text .= $isoform->to_GTF2_format($genomic_seq_ref, $properties_href);
4233        }
4234    }
4235
4236    if ($comment_line) {
4237        # prefix with \# to actually comment it in the file
4238        $comment_line = "#$comment_line\n";
4239    }
4240
4241    my $comment_flag = $properties_href->{include_comments};
4242    if (defined ($comment_flag) && $comment_flag == 0) {
4243        $comment_line = ""; # clear it
4244    }
4245
4246
4247    return ($comment_line . $gtf2_text);
4248}
4249
4250
4251sub _extract_start_codons {
4252    my $self = shift;
4253
4254    ## 5' partiality attribute is trusted here !!!
4255
4256    if ($self->is_5prime_partial()) {
4257        return();
4258    }
4259
4260    my @exons = $self->get_exons();
4261    my $orientation = $self->get_orientation();
4262
4263    my @start_codons;
4264
4265    my $found_cds_flag = 0;
4266
4267    for (my $i = 0; $i <= $#exons; $i++) {
4268        if (my $cds = $exons[$i]->get_CDS_obj()) {
4269            # found first cds
4270            $found_cds_flag = 1;
4271            my ($cds_end5, $cds_end3) = $cds->get_coords();
4272            my $cds_len = $cds->length();
4273            if ($cds_len >= 3) {
4274                ## got start codon in entirety
4275                if ($orientation eq '+') {
4276                    push (@start_codons, CDS_exon_obj->new($cds_end5, $cds_end5+2)->set_phase(0));
4277                    last;
4278                }
4279                else {
4280                    push (@start_codons, CDS_exon_obj->new($cds_end5, $cds_end5-2)->set_phase(0));
4281                    last;
4282                }
4283            }
4284            else {
4285                ## split start codon
4286                push (@start_codons, $cds); # add current cds as start codon part
4287                my $missing_length = 3 - $cds_len;
4288
4289                ## examine next cds exon for part of it:
4290                my $next_cds = $exons[$i+1]->get_CDS_obj();
4291                unless (ref $next_cds) {
4292                    die "Error, no next cds for split start codon" . $self->toString();
4293                }
4294
4295                my ($next_cds_end5, $next_cds_end3) = $next_cds->get_coords();
4296                my $next_cds_len = $next_cds->length();
4297
4298                if ($next_cds_len >= $missing_length) {
4299                    # great, this has everything we need
4300                    if ($orientation eq '+') {
4301                        push (@start_codons,
4302                              CDS_exon_obj->new($next_cds_end5, $next_cds_end5 + $missing_length-1)->set_phase($next_cds->{phase}));
4303                        last;
4304                    }
4305                    else {
4306                        push (@start_codons,
4307                              CDS_exon_obj->new($next_cds_end5, $next_cds_end5 - $missing_length + 1)->set_phase($next_cds->{phase}));
4308                        last;
4309                    }
4310                }
4311                else {
4312                    ## another split start codon portion.  Just add the current cds, and get the first bp from the next cds
4313                    push (@start_codons, $next_cds);
4314
4315                    my $final_cds = $exons[$i+2]->get_CDS_obj();
4316                    unless (ref $final_cds) {
4317                        die "Error getting final cds of three-part split start codon";
4318                    }
4319                    unless ($final_cds->{phase} == 2) {
4320                        die "Error, final cds of three-part stop codon is not in phase 2 ";
4321                    }
4322                    my ($final_cds_end5, $final_cds_end3) = $final_cds->get_coords();
4323                    push (@start_codons,
4324                          CDS_exon_obj->new($final_cds_end5, $final_cds_end5)->set_phase(2));
4325                    last;
4326                }
4327            } # end of split start codon
4328
4329        } # end of found cds
4330    } # end of foreach exon
4331
4332    unless ($found_cds_flag) {
4333        die "Error, no cds exon found in search of start codon";
4334    }
4335
4336    unless (@start_codons) {
4337        die "Error, no start codons found";
4338    }
4339    ## ensure start codons sum to 3
4340    my $sum_len = 0;
4341    foreach my $start_codon (@start_codons) {
4342        $sum_len += $start_codon->length();
4343    }
4344    unless ($sum_len == 3) {
4345        print "Error, sum len of start codons != 3 ( = $sum_len, instead) " . $self->toString() . "starts:\n";
4346        my $i=0;
4347        foreach my $start (@start_codons) {
4348            $i++;
4349            print "start($i): " . $start->toString();
4350        }
4351        die;
4352    }
4353
4354    return (@start_codons);
4355
4356}
4357
4358
4359
4360sub _remove_stop_codons {
4361    my $self = shift;
4362
4363    ## 3' partiality attribute is trusted here !!!
4364
4365    if ($self->is_3prime_partial()) {
4366        return ();
4367    }
4368
4369    my $orientation = $self->get_orientation();
4370    my @exons = reverse $self->get_exons(); # examining exons in reverse order, starting from stop codon direction.
4371
4372    my @stop_codons;
4373
4374    my $found_cds_flag = 0;
4375
4376    ## find first exon
4377    for (my $i=0; $i <= $#exons; $i++) {
4378        if (my $cds = $exons[$i]->get_CDS_obj()) {
4379
4380            $found_cds_flag = 1;
4381
4382            my ($cds_end5, $cds_end3) = $cds->get_coords();
4383
4384            my $cds_length = $cds->length();
4385            if ($cds_length > 3) {
4386                ## cds exon encodes more than just the stop codon
4387                if ($orientation eq '+') {
4388                    $cds->{end3} -= 3;
4389                    push (@stop_codons, CDS_exon_obj->new($cds_end3 - 2, $cds_end3)->set_phase(0));
4390                }
4391                else {
4392                    $cds->{end3} += 3;
4393                    push (@stop_codons, CDS_exon_obj->new($cds_end3 + 2, $cds_end3)->set_phase(0));
4394                }
4395                last;
4396
4397            }
4398            elsif ($cds_length == 3) {
4399                ## Just a stop codon exon.  We can remove it.
4400                push (@stop_codons, $cds);
4401                $exons[$i]->{CDS_exon_obj} = 0; # nullified
4402                last;
4403            }
4404
4405            else {
4406                ## cds exon encodes a split stop codon
4407                push (@stop_codons, $cds); # just add the last portion of stop codon
4408                $exons[$i]->{CDS_exon_obj} = 0; # nullified
4409
4410                ## check next portion of cds exon to see if it contains the rest of the stop
4411                my $next_exon = $exons[$i+1];
4412                unless (ref $next_exon) {
4413                    die "Error, incomplete stop codon and not enough exons! ";
4414                }
4415                my $missing_stop_length = 3 - $cds_length;
4416                my $next_cds_obj = $next_exon->get_CDS_obj();
4417                unless (ref $next_cds_obj) {
4418                    die "Error, next cds obj is missing!";
4419                }
4420
4421                my $next_cds_length = $next_cds_obj->length();
4422                my ($cds_end5, $cds_end3) = $next_cds_obj->get_coords();
4423                if ($next_cds_length <= $missing_stop_length) {
4424                    ## encodes only the second part of the stop codon
4425                    # add and nullify
4426                    push (@stop_codons, $next_cds_obj);
4427                    $next_exon->{CDS_exon_obj} = 0;
4428
4429                    ## get the very last part of the stop
4430                    $missing_stop_length -= $next_cds_length;
4431                    if ($missing_stop_length > 0) {
4432                        ## must be still missing the first bp of the stop codon
4433                        if ($missing_stop_length != 1) {
4434                            die "Error, too much of the stop codon is left (missing_length = $missing_stop_length).  Should only be 1 ";
4435                        }
4436                        my $next_exon = $exons[$i+2];
4437                        unless (ref $next_exon) {
4438                            die "Error, second next exon is unavail ";
4439                        }
4440                        my $next_cds_obj = $next_exon->get_CDS_obj();
4441                        unless (ref $next_cds_obj) {
4442                            die "Error, second next cds obj is unavail";
4443                        }
4444                        my $cds_length = $next_cds_obj->length();
4445                        my ($cds_end5, $cds_end3) = $next_cds_obj->get_coords();
4446                        if ($cds_length > 1) {
4447                            if ($orientation eq '+') {
4448                                $next_cds_obj->{end3}-=1;
4449                                push (@stop_codons, CDS_exon_obj->new($cds_end3, $cds_end3)->set_phase(0));
4450                            }
4451                            else {
4452                                $next_cds_obj->{end3}+=1;
4453                                push (@stop_codons, CDS_exon_obj->new($cds_end3, $cds_end3)->set_phase(0));
4454                            }
4455                        }
4456                    }
4457                }
4458                else {
4459                    # split stop codon
4460                    #missing length of cds exon is present in the second portion of the stop
4461                    if ($orientation eq '+') {
4462                        $next_cds_obj->{end3} -= $missing_stop_length;
4463                        push (@stop_codons, CDS_exon_obj->new($cds_end3 - $missing_stop_length + 1, $cds_end3)->set_phase(0));
4464                    }
4465                    else {
4466                        $next_cds_obj->{end3} += $missing_stop_length;
4467                        push (@stop_codons, CDS_exon_obj->new($cds_end3 + $missing_stop_length -1, $cds_end3)->set_phase(0));
4468                    }
4469                }
4470            } # end of split stop codon
4471
4472            last;
4473
4474        } # end of found cds obj
4475
4476
4477    } # end of foreach exon
4478
4479    unless ($found_cds_flag) {
4480        die "Error, no cds exon was found. ";
4481    }
4482
4483
4484    unless (@stop_codons) {
4485        die "Error, no stop codons extracted from non-partial gene.";
4486    }
4487
4488    @stop_codons = reverse @stop_codons; # reorder according to gene direction
4489
4490    ## make sure sum (stop_codons) length == 3
4491    my $sum_len = 0;
4492    foreach my $stop_codon (@stop_codons) {
4493        $sum_len += $stop_codon->length();
4494    }
4495    if ($sum_len != 3) {
4496        print "Error, stop codons sum length != 3 ( = $sum_len, instead) " . $self->toString();
4497        my $i=0;
4498        foreach my $stop_codon (@stop_codons) {
4499            print "stop($i): " . $stop_codon->toString();
4500        }
4501
4502        die;
4503    }
4504
4505    return (@stop_codons);
4506
4507}
4508
4509
4510
4511sub has_CDS {
4512    my $self = shift;
4513
4514    foreach my $exon ($self->get_exons()) {
4515        if (ref ($exon->get_CDS_obj())) {
4516            return (1);
4517        }
4518    }
4519
4520    return (0); # no cds entry found
4521}
4522
4523
4524####
4525sub set_CDS_phases_from_init_phase {
4526    my ($self, $init_phase) = @_;
4527
4528    my @exons = $self->get_exons();
4529
4530    my $curr_cds_len = $init_phase;
4531
4532    foreach my $exon (@exons) {
4533        if (my $cds = $exon->get_CDS_obj()) {
4534            $cds->set_phase($curr_cds_len % 3);
4535            my $cds_len = $cds->length();
4536            $curr_cds_len += $cds_len;
4537        }
4538    }
4539
4540    return;
4541}
4542
4543
4544
4545sub set_CDS_phases {
4546    my ($self, $genomic_seq_ref) = @_;
4547
4548
4549    my $start_pos = 1;
4550    if ($self->has_CDS() && ! $self->is_pseudogene()) {
4551
4552        $self->create_all_sequence_types($genomic_seq_ref);
4553
4554        my $cds_sequence = $self->get_CDS_sequence();
4555        my $protein_seq = $self->get_protein_sequence();
4556
4557        ## first, clear the partial attributes:
4558        $self->set_5prime_partial(0);
4559        $self->set_3prime_partial(0);
4560
4561
4562        if ($protein_seq !~ /^M/) {
4563            # lacks start codon
4564            $self->set_5prime_partial(1);
4565        }
4566        if ($protein_seq !~ /\*$/) {
4567            # lacks stop codon
4568            $self->set_3prime_partial(1);
4569        }
4570
4571        $start_pos = $self->_get_cds_start_pos($cds_sequence);
4572
4573
4574        my $first_phase = $start_pos - 1;
4575
4576        my @exons = $self->get_exons();
4577        my @cds_objs;
4578        foreach my $exon (@exons) {
4579            my $cds = $exon->get_CDS_obj();
4580            if (ref $cds) {
4581                push (@cds_objs, $cds);
4582            }
4583        }
4584
4585        my $cds_obj = shift @cds_objs;
4586        $cds_obj->{phase} = $first_phase;
4587        my $cds_length = abs ($cds_obj->{end3} - $cds_obj->{end5}) + 1;
4588        $cds_length -= $first_phase;
4589
4590        while (@cds_objs) {
4591            my $next_cds_obj = shift @cds_objs;
4592            $next_cds_obj->{phase} = $cds_length % 3;
4593            $cds_length += abs ($next_cds_obj->{end3} - $next_cds_obj->{end5}) + 1;
4594        }
4595    }
4596
4597    foreach my $isoform ($self->get_additional_isoforms()) {
4598        $isoform->set_CDS_phases($genomic_seq_ref);
4599    }
4600
4601    return;
4602
4603}
4604
4605
4606sub get_first_CDS_segment {
4607    my $gene_obj = shift;
4608    my @exons = $gene_obj->get_exons();
4609
4610    foreach my $exon (@exons) {
4611        if (my $cds = $exon->get_CDS_exon_obj()) {
4612            return ($cds);
4613        }
4614    }
4615
4616    return undef;
4617}
4618
4619sub _get_cds_start_pos {
4620    my ($self, $cds_sequence) = @_;
4621    my $cds_length = length($cds_sequence);
4622    # if cds is set of triplets, assume translate at codon pos 1.
4623    my $codon_start;
4624
4625    ## must determine where translation starts:
4626    my $new_orfFinder = new Longest_orf();
4627    $new_orfFinder->allow_partials();
4628    $new_orfFinder->forward_strand_only();
4629
4630	my $longest_orf = $new_orfFinder->get_longest_orf($cds_sequence);
4631
4632	unless (ref $longest_orf) {
4633		die "No longest ORF found in sequence";
4634	}
4635
4636	## examine the first three ORFs, prefer long orf with stop codon.
4637    my $orfPos = $longest_orf->{start}; #init to first, longest orf.
4638
4639	unless (defined $orfPos) {
4640		die "Error, orfPos not defined! " . Dumper ($longest_orf);
4641	}
4642
4643	my $bestOrfPos;
4644    my @allOrfs = $new_orfFinder->orfs();
4645
4646    for my $orfIndex (0..2) {
4647        my $orf = $allOrfs[$orfIndex];
4648        if ($orf) {
4649            my $start = $orf->{start};
4650            my $length = $orf->{length};
4651            my $protein = $orf->{protein};
4652            if ($length > $cds_length - 3 && $start <= 3 && $protein =~ /\*$/) {
4653                unless ($bestOrfPos) {
4654                    $bestOrfPos = $start;
4655                }
4656            }
4657        }
4658    }
4659
4660    if ($bestOrfPos && $bestOrfPos != $orfPos) {
4661        $orfPos = $bestOrfPos;
4662    }
4663
4664    if ($orfPos >3) {
4665        confess "Error, longest ORF is found at position $orfPos, and should be between 1 and 3.  What's wrong with your gene?" . $self->toString();
4666    }
4667    $codon_start = $orfPos;
4668
4669    return ($codon_start);
4670}
4671
4672
4673=over 4
4674
4675=item dispose()
4676
4677B<Description:> Sets all attributes = 0, hopefully to faciliate targeting for garbage collection. (experimental method)
4678
4679B<Parameters:> none
4680
4681B<Returns:> none
4682
4683=back
4684
4685=cut
4686
4687sub dispose {
4688    my $self = shift;
4689    foreach my $att (keys %$self) {
4690	$self->{$att} = 0;
4691    }
4692}
4693
4694
4695
4696sub DESTROY {
4697    my $self = shift;
4698
4699    warn "DESTROYING gene_obj: " . $self->{TU_feat_name} . "," . $self->{Model_feat_name} . "\n" if $main::DEBUG;
4700
4701}
4702
4703
4704sub validate_start_codon {
4705    ## requires that you have the CDS sequence already set
4706    my $self = shift;
4707
4708    my $cds_sequence = $self->get_CDS_sequence() or confess "Error, cannot get CDS sequence.  It must be built prior to calling this method";
4709    ## currently, only trust Met start codons.
4710    my $start_codon = uc substr($cds_sequence, 0, 3);
4711    if ($start_codon ne "ATG") {
4712        die $self->toString() . "Error, start codon is not M (codon $start_codon instead)!";
4713        # call within an eval block to catch exception
4714    }
4715}
4716
4717
4718sub validate_stop_codon {
4719    ## requires that you have the CDS sequence already set
4720    my $self = shift;
4721
4722    my $cds_sequence = $self->get_CDS_sequence() or confess "Error, cannot get CDS sequence.  It must be built prior to calling this method";
4723
4724    my @stop_codons = &Nuc_translator::get_stop_codons();
4725
4726    my $curr_stop_codon = substr($cds_sequence, length($cds_sequence)-3, 3);
4727
4728    my $found_stop_codon_flag = 0;
4729    foreach my $stop (@stop_codons) {
4730        if ($stop eq $curr_stop_codon) {
4731            $found_stop_codon_flag = 1;
4732            last;
4733        }
4734    }
4735
4736    unless ($found_stop_codon_flag) {
4737        die $self->toString() . "Error, stop codon $curr_stop_codon is not an acceptable stop codon: [@stop_codons]\n";
4738    }
4739
4740}
4741
4742
4743
4744
4745######################################################################################################################################
4746######################################################################################################################################
4747
4748
4749=head1 NAME
4750
4751package mRNA_exon_obj
4752
4753=cut
4754
4755=head1 DESCRIPTION
4756
4757    The mRNA_exon_obj represents an individual spliced mRNA exon of a gene.  The coordinates of the exon can be manipulated, and the mRNA_exon_obj can contain a single CDS_exon_obj.  A mRNA_exon_obj lacking a CDS_exon_obj component is an untranslated (UTR) exon.
4758
4759    A mature Gene_obj is expected to have at least one mRNA_exon_obj component.
4760
4761=cut
4762
4763
4764package mRNA_exon_obj;
4765
4766use strict;
4767use warnings;
4768use Storable qw (store retrieve freeze thaw dclone);
4769
4770=over 4
4771
4772=item new()
4773
4774B<Description:> Instantiates an mRNA_exon_obj
4775
4776B<Parameters:> <(end5, end3)>
4777
4778The end5 and end3 coordinates can be optionally passed into the constructor to set these attributes.  Alternatively, the set_coords() method can be used to set these values.
4779
4780B<Returns:> $mRNA_exon_obj
4781
4782=back
4783
4784=cut
4785
4786
4787    ;
4788
4789sub new {
4790    shift;
4791    my $self = { end5 => 0,   # stores end5 of mRNA exon
4792                 end3 => 0,   # stores end3 of mRNA exon
4793                 CDS_exon_obj => 0,   # stores object reference to CDS_obj
4794                 feat_name => 0,    # stores TIGR temp id
4795                 strand => undef,   #   +|-
4796                 };
4797
4798    # end5 and end3 can be included as parameters in constructor.
4799    if (@_) {
4800        my ($end5, $end3) = @_;
4801        if (defined($end5) && defined($end3)) {
4802            $self->{end5} = $end5;
4803            $self->{end3} = $end3;
4804        }
4805    }
4806
4807    bless ($self);
4808    return ($self);
4809}
4810
4811
4812
4813=over 4
4814
4815=item get_CDS_obj()
4816
4817B<Description:> Retrieves the CDS_exon_obj component of this mRNA_exon_obj
4818
4819B<Parameters:> none
4820
4821B<Returns:> $cds_exon_obj
4822
4823If no CDS_exon_obj is attached, returns 0
4824
4825=back
4826
4827=cut
4828
4829    ;
4830
4831sub get_CDS_obj {
4832    my $self = shift;
4833    return ($self->{CDS_exon_obj});
4834}
4835
4836
4837## alias
4838sub get_CDS_exon_obj {
4839    my $self = shift;
4840    return ($self->get_CDS_obj());
4841}
4842
4843
4844=over 4
4845
4846=item get_mRNA_exon_end5_end3()
4847
4848B<Description:> Retrieves the end5, end3 coordinates of the exon
4849
4850**Method Deprecated**, use get_coords()
4851
4852B<Parameters:> none
4853
4854B<Returns:> (end5, end3)
4855
4856=back
4857
4858=cut
4859
4860
4861sub get_mRNA_exon_end5_end3 {
4862    my $self = shift;
4863    return ($self->{end5}, $self->{end3});
4864}
4865
4866
4867
4868=over 4
4869
4870=item set_CDS_exon_obj()
4871
4872B<Description:> Sets the CDS_exon_obj of the mRNA_exon_obj
4873
4874B<Parameters:> $cds_exon_obj
4875
4876B<Returns:> none
4877
4878=back
4879
4880=cut
4881
4882    ;
4883sub set_CDS_exon_obj {
4884    my $self = shift;
4885    my $ref = shift;
4886    if (ref($ref)) {
4887        $self->{CDS_exon_obj} = $ref;
4888    }
4889}
4890
4891
4892
4893####
4894sub delete_CDS_exon_obj {
4895    my $self = shift;
4896    $self->{CDS_exon_obj} = undef;
4897    return;
4898}
4899
4900
4901=over 4
4902
4903=item add_CDS_exon_obj()
4904
4905B<Description:> Instantiates and adds a new CDS_exon_obj to the mRNA_exon_obj given the CDS coordinates.
4906
4907B<Parameters:> (end5, end3)
4908
4909B<Returns:> none
4910
4911=back
4912
4913=cut
4914
4915
4916sub add_CDS_exon_obj {
4917    my $self = shift;
4918    my ($end5, $end3) = @_;
4919    my $cds_obj = CDS_exon_obj->new ($end5, $end3);
4920    $self->set_CDS_exon_obj($cds_obj);
4921}
4922
4923
4924=over 4
4925
4926=item set_feat_name()
4927
4928B<Description:> Sets the feat_name attribute of the mRNA_exon_obj
4929
4930B<Parameters:> $feat_name
4931
4932B<Returns:> none
4933
4934=back
4935
4936=cut
4937
4938
4939
4940sub set_feat_name {
4941    my $self = shift;
4942    my $feat_name = shift;
4943    $self->{feat_name} = $feat_name;
4944}
4945
4946
4947=over 4
4948
4949=item clone_exon()
4950
4951B<Description:> Creates a deep clone of this mRNA_exon_obj, using dclone() of Storable.pm
4952
4953B<Parameters:> none
4954
4955B<Returns:> $mRNA_exon_obj
4956
4957=back
4958
4959=cut
4960
4961
4962
4963sub clone_exon {
4964    my $self = shift;
4965
4966    my $clone_exon = dclone($self);
4967
4968    return ($clone_exon);
4969}
4970
4971
4972
4973=over 4
4974
4975=item get_CDS_end5_end3 ()
4976
4977B<Description:> Retrieves end5, end3 of the CDS_exon_obj component of this mRNA_exon_obj
4978
4979B<Parameters:> none
4980
4981B<Returns:> (end5, end3)
4982
4983An empty array is returned if no CDS_exon_obj is attached.
4984
4985=back
4986
4987=cut
4988
4989
4990sub get_CDS_end5_end3 {
4991    my $self = shift;
4992    my $cds_obj = $self->get_CDS_obj();
4993    if ($cds_obj) {
4994        return ($cds_obj->get_CDS_end5_end3());
4995    } else {
4996        return ( () );
4997    }
4998}
4999
5000
5001=over 4
5002
5003=item get_coords()
5004
5005B<Description:> Retrieves the end5, end3 coordinates of this mRNA_exon_obj
5006
5007B<Parameters:> none
5008
5009B<Returns:> (end5, end3)
5010
5011=back
5012
5013=cut
5014
5015
5016sub get_coords {
5017    my $self = shift;
5018    return ($self->get_mRNA_exon_end5_end3());
5019}
5020
5021
5022=over 4
5023
5024=item set_coords()
5025
5026B<Description:> Sets the end5, end3 coordinates of the mRNA_exon_obj
5027
5028B<Parameters:> (end5, end3)
5029
5030B<Returns:> none
5031
5032=back
5033
5034=cut
5035
5036
5037## simpler coord setting (end5, end3)
5038sub set_coords {
5039    my $self = shift;
5040    my $end5 = shift;
5041    my $end3 = shift;
5042    $self->{end5} = $end5;
5043    $self->{end3} = $end3;
5044}
5045
5046
5047=over 4
5048
5049=item get_strand()
5050
5051B<Description:> Retrieves the orientation of the mRNA_exon_obj based on gene models transcribed orientation.
5052
5053B<Parameters:> none
5054
5055B<Returns:> +|-|undef
5056
5057If end5 == end3, strand orientation cannot be inferred based on coordinates alone, so undef is returned.
5058
5059=back
5060
5061=cut
5062
5063
5064    ;
5065
5066sub get_orientation {
5067    # determine positive or reverse orientation
5068    my $self = shift;
5069    return ($self->{strand});
5070}
5071
5072
5073sub get_strand { ## preferred
5074	my $self = shift;
5075	return($self->get_orientation());
5076}
5077
5078
5079####
5080sub merge_exon {
5081    my $self = shift;
5082    my $other_exon = shift;
5083
5084    my $cds = $self->get_CDS_exon_obj();
5085
5086    my $other_cds = $other_exon->get_CDS_exon_obj();
5087
5088    if ($other_cds) {
5089        if ($cds) {
5090            $cds->merge_CDS($other_cds);
5091        }
5092        else {
5093            # current exon lacks cds. Set this one to it.
5094            $self->set_CDS_exon_obj($other_cds);
5095        }
5096    }
5097
5098
5099    ## merge the exons.
5100    my @coords = sort {$a<=>$b} ($self->get_coords(), $other_exon->get_coords());
5101    my $lend = shift @coords;
5102    my $rend = pop @coords;
5103
5104    my ($new_end5, $new_end3) = ($self->get_orientation() eq '+') ? ($lend, $rend) : ($rend, $lend);
5105
5106    $self->set_coords($new_end5, $new_end3);
5107
5108    return;
5109}
5110
5111
5112
5113
5114
5115
5116=over 4
5117
5118=item toString()
5119
5120B<Description:> Provides a textual description of the mRNA_exon_obj
5121
5122B<Parameters:> none
5123
5124B<Returns:> $text
5125
5126=back
5127
5128=cut
5129
5130    ;
5131
5132
5133sub toString {
5134    my $self = shift;
5135    my @coords = $self->get_mRNA_exon_end5_end3();
5136    my $feat_name = $self->{feat_name};
5137    my $text = "";
5138    if ($feat_name) {
5139        $text .= "feat_name: $feat_name\t";
5140    }
5141    $text .= "end5 " . $coords[0] . "\tend3 " . $coords[1] . "\n";
5142    return ($text);
5143}
5144
5145
5146sub length {
5147    my $self = shift;
5148
5149    my $len = abs ($self->{end5} - $self->{end3}) + 1;
5150
5151    return($len);
5152}
5153
5154
5155
5156
5157
5158##########################################################################################################################
5159##########################################################################################################################
5160
5161
5162
5163=head1 NAME
5164
5165package CDS_exon_obj
5166
5167=cut
5168
5169
5170=head1 DESCRIPTION
5171
5172    The CDS_exon_obj represents the protein-coding portion of an mRNA_exon_obj.
5173
5174=cut
5175
5176
5177
5178package CDS_exon_obj;
5179
5180use strict;
5181use warnings;
5182use Storable qw (store retrieve freeze thaw dclone);
5183use Carp;
5184
5185
5186=over 4
5187
5188=item new()
5189
5190B<Description:>  Cosntructor for the CDS_exon_obj
5191
5192B<Parameters:> <(end5, end3)>
5193
5194The (end5, end3) parameter is optional.  Alternatively, the set_coords() method can be used to set these values.
5195
5196B<Returns:> $cds_exon_obj
5197
5198=back
5199
5200=cut
5201
5202    ;
5203
5204sub new {
5205    shift;
5206    my $self = { end5 => 0,   #stores end5 of cds exon
5207                 end3 => 0,    #stores end3 of cds exon
5208                 phase => undef, #must set if to output in gff3 format.
5209                 feat_name => 0, #tigr's temp id
5210                 strand => undef,   # +|-
5211             };
5212
5213
5214    # end5 and end3 are allowed constructor parameters
5215    if (@_) {
5216        my ($end5, $end3) = @_;
5217        if (defined ($end5) && defined ($end3)) {
5218            $self->{end5} = $end5;
5219            $self->{end3} = $end3;
5220        }
5221    }
5222    bless ($self);
5223    return ($self);
5224}
5225
5226
5227
5228=over 4
5229
5230=item set_feat_name()
5231
5232B<Description:> Sets the feat_name attribute value of the CDS_exon_obj
5233
5234B<Parameters:> $feat_name
5235
5236B<Returns:> none
5237
5238=back
5239
5240=cut
5241
5242
5243sub set_feat_name {
5244    my $self = shift;
5245    my $feat_name = shift;
5246    $self->{feat_name} = $feat_name;
5247}
5248
5249
5250=over 4
5251
5252=item get_CDS_end5_end3()
5253
5254B<Description:> Retrieves the end5, end3 coordinates of the CDS_exon_obj
5255
5256** Method deprecated **, use get_coords()
5257
5258
5259B<Parameters:> none
5260
5261B<Returns:> (end5, end3)
5262
5263=back
5264
5265=cut
5266
5267
5268sub get_CDS_end5_end3 {
5269    my $self = shift;
5270    return ($self->{end5}, $self->{end3});
5271}
5272
5273
5274
5275=over 4
5276
5277=item set_coords()
5278
5279B<Description:> Sets the (end5, end3) values of the CDS_exon_obj
5280
5281B<Parameters:> (end5, end3)
5282
5283B<Returns:> none
5284
5285=back
5286
5287=cut
5288
5289
5290
5291sub set_coords {
5292    my $self = shift;
5293    my $end5 = shift;
5294    my $end3 = shift;
5295    $self->{end5} = $end5;
5296    $self->{end3} = $end3;
5297}
5298
5299=over 4
5300
5301=item get_coords()
5302
5303B<Description:> Retrieves the (end5, end3) coordinates of the CDS_exon_obj
5304
5305B<Parameters:> none
5306
5307B<Returns:> (end5, end3)
5308
5309
5310The get_coords() method behaves similarly among Gene_obj, mRNA_exon_obj, and CDS_exon_obj, and is generally preferred to other existing methods for extracting these coordinate values.  Other methods persist for backwards compatibility with older applications, but have been largely deprecated.
5311
5312
5313=back
5314
5315=cut
5316
5317
5318
5319sub get_coords {
5320    my $self = shift;
5321    return ($self->get_CDS_end5_end3());
5322}
5323
5324
5325=over 4
5326
5327=item get_orientation()
5328
5329B<Description:> Retrieves the orientation of the CDS_exon_obj based on gene models orientation.
5330
5331B<Parameters:> none
5332
5333B<Returns:> +|-|undef
5334
5335undef returned if end5 == end3
5336
5337=back
5338
5339=cut
5340
5341    ;
5342
5343sub get_orientation {
5344    # determine positive or reverse orientation
5345    my $self = shift;
5346    return ($self->{strand});
5347}
5348
5349
5350sub get_strand { ## preferred
5351	my $self = shift;
5352	return($self->get_orientation());
5353}
5354
5355
5356=over 4
5357
5358=item toString()
5359
5360B<Description:> Retrieves a textual description of the CDS_exon_obj
5361
5362B<Parameters:> none
5363
5364B<Returns:> $text
5365
5366=back
5367
5368=cut
5369
5370
5371
5372=over 4
5373
5374=item clone_cds()
5375
5376B<Description:> Creates a deep clone of this CDS_exon_obj, using dclone() of Storable.pm
5377
5378B<Parameters:> none
5379
5380B<Returns:> $mRNA_exon_obj
5381
5382=back
5383
5384=cut
5385
5386
5387
5388sub clone_cds {
5389    my $self = shift;
5390
5391    my $clone_cds = dclone($self);
5392
5393    return ($clone_cds);
5394}
5395
5396
5397=over 4
5398
5399=item length()
5400
5401B<Description:> length of this cds segment
5402
5403B<Parameters:> none
5404
5405B<Returns:> int
5406
5407=back
5408
5409=cut
5410
5411
5412sub length {
5413    my $self = shift;
5414    my $length = abs ($self->{end3} - $self->{end5}) + 1;
5415    return ($length);
5416}
5417
5418
5419
5420=over 4
5421
5422=item  set_phase()
5423
5424B<Description:> set phase of the CDS incident bp
5425
5426B<Parameters:> [012]
5427
5428B<Returns:> self
5429
5430
5431phase 0 = first bp of codon
5432phase 1 = second bp of codon
5433phase 2 = third bp of codon
5434
5435
5436=back
5437
5438=cut
5439
5440
5441sub set_phase {
5442    my $self = shift;
5443    my $phase = shift;
5444    $self->{phase} = $phase;
5445    return($self);
5446}
5447
5448=over 4
5449
5450=item  get_phase()
5451
5452B<Description:> gets phase of the CDS incident bp
5453
5454B<Parameters:> none
5455
5456B<Returns:> [012] or undef if not set
5457
5458
5459phase 0 = first bp of codon
5460phase 1 = second bp of codon
5461phase 2 = third bp of codon
5462
5463
5464=back
5465
5466=cut
5467
5468
5469
5470
5471sub get_phase {
5472    my $self = shift;
5473    my $phase = $self->{phase};
5474    return($phase);
5475}
5476
5477
5478
5479####
5480sub merge_CDS {
5481    my $self = shift;
5482    my $other_cds = shift;
5483
5484    my $orientation = $self->get_orientation();
5485    unless ($orientation) {
5486        confess "Error, self CDS lacks orientation\n";
5487    }
5488
5489    my @coords = sort {$a<=>$b} ($self->get_coords(), $other_cds->get_coords());
5490    my $lend = shift @coords;
5491    my $rend = pop @coords;
5492
5493    unless ($lend && $rend) {
5494        confess "Error, trying to merge CDSs but coordinates are not available: \n"
5495            . "self: " . $self->toString()
5496            . "\n"
5497            . "other: " . $other_cds->toString() . "\n";
5498    }
5499
5500    my ($end5, $end3) = ($orientation eq '+') ? ($lend, $rend) : ($rend, $lend);
5501
5502    $self->set_coords($end5, $end3);
5503}
5504
5505sub toString {
5506    my $self = shift;
5507    my @coords = $self->get_CDS_end5_end3();
5508    my $feat_name = $self->{feat_name};
5509    my $text = "";
5510    if ($feat_name) {
5511        $text .= "feat_name: $feat_name\t";
5512    }
5513    $text .= "end5 " . $coords[0] . "\tend3 " . $coords[1] . "\n";
5514    return ($text);
5515}
5516
5517
55181;
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533