1package GTF_utils2; # removes DB_File and Gene_obj_indexer requirement.
2
3use strict;
4use warnings;
5use Gene_obj;
6use GTF;
7use Carp;
8use Data::Dumper;
9use Overlap_piler;
10
11####
12sub index_GTF_gene_objs {
13  my ($gtf_filename, $gene_obj_indexer) = @_;  ## gene_obj_indexer can be a simple  hashref {}
14
15  unless ($gtf_filename && $gene_obj_indexer) {
16	  confess "Error, need gtf_filename and gene_obj_indexer as perams";
17  }
18
19  return index_GTF_gene_objs_from_GTF($gtf_filename,$gene_obj_indexer);
20}
21
22sub index_GTF_gene_objs_from_GTF {
23    my ($gtf_filename, $gene_obj_indexer) = @_;
24
25    unless (ref $gene_obj_indexer eq "HASH") {
26        confess "Error, \$gene_obj_indexer must be a hash ref";
27    }
28
29
30    ##
31    #print STDERR "\n-caching genes.\n";
32    my %seqname_map;
33
34    my $gene_objs = GTF_to_gene_objs($gtf_filename);
35
36    my %seen;
37
38    for my $gene_obj (@$gene_objs) {
39
40        my $gene_id = $gene_obj->{TU_feat_name};
41
42
43        if ($seen{$gene_id}) {
44            confess "Error, already processed gene: $gene_id\n"
45                . " here: " . $gene_obj->toString() . "\n"
46                . " and earlier: " . $seen{$gene_id}->toString();
47
48        }
49
50        $seen{$gene_id} = $gene_obj;
51
52
53        my $seqname = $gene_obj->{asmbl_id};
54
55        $gene_obj_indexer->{$gene_id} = $gene_obj;
56
57        # add to gene list for asmbl_id
58        my $gene_list = $seqname_map{$seqname};
59        unless (ref $gene_list) {
60            $gene_list = $seqname_map{$seqname} = [];
61        }
62        push (@$gene_list, $gene_id);
63    }
64    return (\%seqname_map);
65}
66
67sub GTF_to_gene_objs {
68    my ($gtf_filename) = @_;
69
70    my %gene_transcript_data;
71    my %noncoding_features;
72
73    my %gene_id_to_source;
74    my %gene_id_to_name;
75
76    my %gene_id_to_seq_name;
77    my %gene_id_to_gene_name;
78
79
80    my %coding_genes;
81
82
83    open (my $fh, $gtf_filename) or die "Error, cannot open $gtf_filename";
84    while (<$fh>) {
85        unless (/\w/) { next; }
86        if (/^\#/) { next; } # comment line.
87
88        chomp;
89        my ($seqname, $source, $type, $lend, $rend, $score, $strand, $gtf_phase, $annot) = split (/\t/);
90
91        my ($end5, $end3) = ($strand eq '+') ? ($lend, $rend) : ($rend, $lend);
92
93        $annot =~ /gene_id \"([^\"]+)\"/  or confess "Error, cannot get gene_id from $annot of line\n$_";
94        my $gene_id = $1;
95
96        if (my $sn = $gene_id_to_seq_name{$gene_id}) {
97            if ($sn ne $seqname) {
98                $gene_id = "$seqname" . "|" . "$gene_id"; # make unique per scaffold.
99            }
100        }
101        $gene_id_to_seq_name{$gene_id} = $seqname;
102
103        $gene_id_to_source{$gene_id} = $source;
104
105
106        if ($annot =~ /name \"([^\"]+)\"/) {
107            my $name = $1;
108            $gene_id_to_name{$gene_id} = $name;
109        }
110
111        my $gene_name = "";
112        if ($annot =~ /gene_name \"([^\"]+)\"/) {
113            $gene_name = $1;
114            $gene_id_to_gene_name{$gene_id} = $gene_name;
115        }
116
117
118		# print "gene_id: $gene_id, transcrpt_id: $transcript_id, $type\n";
119
120        if ($type eq 'transcript' || $type eq 'gene') { next; } # capture by exon coordinates
121
122        my $transcript_id;
123        if ($annot =~ /transcript_id \"([^\"]+)\"/) {
124            $transcript_id = $1;
125        }
126        else {
127            print STDERR "Skipping line: $_, no transcript_id value provided\n";
128            next;
129        }
130
131        if ($type eq 'CDS' || $type eq 'stop_codon' || $type eq 'start_codon') {
132            push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{CDS}}, [$end5, $end3] );
133            push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{mRNA}}, [$end5, $end3] );
134
135            $coding_genes{$gene_id}++;
136        }
137        elsif ($type eq "exon" || $type =~ /UTR/) {
138            push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{mRNA}}, [$end5, $end3] );
139        }
140        elsif ($type =~ /Selenocysteine/) {
141            # no op
142        }
143        else {
144            ## assuming noncoding feature
145            push (@{$noncoding_features{$seqname}->{$type}->{$gene_id}->{$transcript_id}}, [$end5, $end3] );
146        }
147
148    }
149    close $fh;
150
151
152    ## create gene objects.
153
154    my @top_gene_objs;
155
156    my %seen;
157    foreach my $seqname (keys %gene_transcript_data) {
158
159
160        {
161            ##################################
162            ## Process protein-coding genes:
163
164            my $genes_href = $gene_transcript_data{$seqname};
165
166            foreach my $gene_id (keys %$genes_href) {
167
168                if ($seen{$gene_id}) {
169                    print STDERR ("Error, already saw $gene_id,$seqname as $gene_id,$seen{$gene_id}\nSkipping.");
170                    next;
171                }
172                $seen{$gene_id} = $seqname;
173
174                my $transcripts_href = $genes_href->{$gene_id};
175
176                my $source = $gene_id_to_source{$gene_id};
177
178                my @gene_objs;
179
180                foreach my $transcript_id (keys %$transcripts_href) {
181
182                    my $coord_types_href = $transcripts_href->{$transcript_id};
183
184                    my $CDS_coords_aref = $coord_types_href->{CDS};
185                    my $mRNA_coords_aref = $coord_types_href->{mRNA};
186
187
188                    #print STDERR "Before, CDS: " . Dumper($CDS_coords_aref);
189                    #print STDERR "Before, exons: " . Dumper($mRNA_coords_aref);
190
191
192                    my $CDS_coords_href = &_join_overlapping_coords($CDS_coords_aref);
193                    my $mRNA_coords_href = &_join_overlapping_coords($mRNA_coords_aref);
194
195                    #print STDERR "CDS: " . Dumper($CDS_coords_href);
196                    #print STDERR "mRNA: " . Dumper ($mRNA_coords_href);
197
198
199                    my $gene_obj = new Gene_obj();
200                    $gene_obj->populate_gene_object($CDS_coords_href, $mRNA_coords_href);
201
202                    $gene_obj->{TU_feat_name} = $gene_id;
203                    $gene_obj->{Model_feat_name} = $transcript_id;
204                    if (my $name = $gene_id_to_name{$gene_id}) {
205                        $gene_obj->{com_name} = $name;
206                    }
207                    else {
208                        $gene_obj->{com_name} = $transcript_id;
209                    }
210                    if (my $gene_name = $gene_id_to_gene_name{$gene_id}) {
211                        $gene_obj->{gene_name} = $gene_name;
212                    }
213                    $gene_obj->{asmbl_id} = $seqname;
214                    $gene_obj->{source} = $source;
215
216                    $gene_obj->join_adjacent_exons();
217
218                    push (@gene_objs, $gene_obj);
219                }
220
221
222                ## want single gene that includes all alt splice variants here
223                if(scalar(@gene_objs)) {
224                    my $template_gene_obj = shift @gene_objs;
225                    foreach my $other_gene_obj (@gene_objs) {
226                        $template_gene_obj->add_isoform($other_gene_obj);
227                    }
228                    push (@top_gene_objs, $template_gene_obj);
229
230                    # print $template_gene_obj->toString();
231
232
233                }
234            }
235
236        }
237
238
239        {
240            ################################
241            ## Process noncoding features ##
242            ################################
243
244            my $ncgene_types_href = $noncoding_features{$seqname};
245
246            if (ref $ncgene_types_href) {
247
248                foreach my $nc_type (keys %$ncgene_types_href) {
249
250                    my $gene_ids_href = $ncgene_types_href->{$nc_type};
251                    foreach my $gene_id (keys %$gene_ids_href) {
252
253                        if (exists $coding_genes{$gene_id}) {
254                            print STDERR "Warning: Skipping $gene_id ($nc_type) as this gene is already included as a coding gene.\n";
255                            next;
256                        }
257
258                        my $trans_ids_href = $gene_ids_href->{$gene_id};
259                        foreach my $trans_id (keys %$trans_ids_href) {
260
261                            my @coordsets = @{$trans_ids_href->{$trans_id}};
262                            my %coords;
263                            foreach my $coordset (@coordsets) {
264                                my ($end5, $end3) = @$coordset;
265                                $coords{$end5} = $end3;
266                            }
267
268                            my $gene_obj = new Gene_obj();
269
270                            $gene_obj->populate_gene_object({}, \%coords);
271                            $gene_obj->{asmbl_id} = $seqname;
272                            $gene_obj->{TU_feat_name} = $gene_id;
273                            $gene_obj->{Model_feat_name} = $trans_id;
274                            $gene_obj->{gene_type} = $nc_type;
275                            if (my $name = $gene_id_to_name{$gene_id}) {
276                                $gene_obj->{com_name} = $name;
277                            }
278                            else {
279                                $gene_obj->{com_name} = $trans_id;
280                            }
281
282
283                            #print STDERR $gene_obj->toString();
284
285
286                            push (@top_gene_objs, $gene_obj);
287                        }
288                    }
289                }
290            }
291        }
292    }
293    return (\@top_gene_objs);
294}
295
296
297
298####
299sub _join_overlapping_coords {
300    my $coords_aref = shift;
301
302    unless (ref $coords_aref) {
303        return ({});
304    }
305
306    my $orient;
307
308    my @coords;
309
310    my $inferred_orient;
311
312    foreach my $coordset (@$coords_aref) {
313        my ($end5, $end3) = @$coordset;
314
315        my $orient;
316        if ($end5 < $end3) {
317            $orient = '+';
318        }
319        elsif ($end5 > $end3) {
320            $orient = '-';
321
322        }
323
324        if ( (! defined $inferred_orient) && defined $orient) {
325            $inferred_orient = $orient;
326        }
327        elsif ( defined($orient) && $orient ne $inferred_orient) {
328            die "Error, conflicting orientation info: " . Dumper($coords_aref);
329        }
330
331        my ($lend, $rend) = sort {$a<=>$b} ($end5, $end3);
332
333        push (@coords, [$lend, $rend]);
334    }
335
336    #print STDERR "coords: " . Dumper(@coords);
337
338    my @piles = &Overlap_piler::simple_coordsets_collapser(@coords);
339
340    #print STDERR "piles: " . Dumper(@piles);
341
342    @piles = sort {$a->[0] <=> $b->[0]} @piles;
343
344    ## join adjacent piles
345    my @joined_piles = shift @piles;
346    foreach my $pile (@piles) {
347        my ($pile_lend, $pile_rend) = @$pile;
348        if ($pile_lend == $joined_piles[$#joined_piles]->[1] + 1) {
349            # adjacent
350            $joined_piles[$#joined_piles]->[1] = $pile_rend;
351        }
352        else {
353            push (@joined_piles, $pile);
354        }
355    }
356
357
358    my %new_coords;
359    foreach my $pile (@joined_piles) {
360        my ($lend, $rend) = @$pile;
361        my ($end5, $end3) = ($inferred_orient eq '+') ? ($lend, $rend) : ($rend, $lend);
362        $new_coords{$end5} = $end3;
363    }
364
365
366    return(\%new_coords);
367}
368
369
370
3711; #EOM
372