1package GTF_utils2; # removes DB_File and Gene_obj_indexer requirement. 2 3use strict; 4use warnings; 5use Gene_obj; 6use GTF; 7use Carp; 8use Data::Dumper; 9use Overlap_piler; 10 11#### 12sub index_GTF_gene_objs { 13 my ($gtf_filename, $gene_obj_indexer) = @_; ## gene_obj_indexer can be a simple hashref {} 14 15 unless ($gtf_filename && $gene_obj_indexer) { 16 confess "Error, need gtf_filename and gene_obj_indexer as perams"; 17 } 18 19 return index_GTF_gene_objs_from_GTF($gtf_filename,$gene_obj_indexer); 20} 21 22sub index_GTF_gene_objs_from_GTF { 23 my ($gtf_filename, $gene_obj_indexer) = @_; 24 25 unless (ref $gene_obj_indexer eq "HASH") { 26 confess "Error, \$gene_obj_indexer must be a hash ref"; 27 } 28 29 30 ## 31 #print STDERR "\n-caching genes.\n"; 32 my %seqname_map; 33 34 my $gene_objs = GTF_to_gene_objs($gtf_filename); 35 36 my %seen; 37 38 for my $gene_obj (@$gene_objs) { 39 40 my $gene_id = $gene_obj->{TU_feat_name}; 41 42 43 if ($seen{$gene_id}) { 44 confess "Error, already processed gene: $gene_id\n" 45 . " here: " . $gene_obj->toString() . "\n" 46 . " and earlier: " . $seen{$gene_id}->toString(); 47 48 } 49 50 $seen{$gene_id} = $gene_obj; 51 52 53 my $seqname = $gene_obj->{asmbl_id}; 54 55 $gene_obj_indexer->{$gene_id} = $gene_obj; 56 57 # add to gene list for asmbl_id 58 my $gene_list = $seqname_map{$seqname}; 59 unless (ref $gene_list) { 60 $gene_list = $seqname_map{$seqname} = []; 61 } 62 push (@$gene_list, $gene_id); 63 } 64 return (\%seqname_map); 65} 66 67sub GTF_to_gene_objs { 68 my ($gtf_filename) = @_; 69 70 my %gene_transcript_data; 71 my %noncoding_features; 72 73 my %gene_id_to_source; 74 my %gene_id_to_name; 75 76 my %gene_id_to_seq_name; 77 my %gene_id_to_gene_name; 78 79 80 my %coding_genes; 81 82 83 open (my $fh, $gtf_filename) or die "Error, cannot open $gtf_filename"; 84 while (<$fh>) { 85 unless (/\w/) { next; } 86 if (/^\#/) { next; } # comment line. 87 88 chomp; 89 my ($seqname, $source, $type, $lend, $rend, $score, $strand, $gtf_phase, $annot) = split (/\t/); 90 91 my ($end5, $end3) = ($strand eq '+') ? ($lend, $rend) : ($rend, $lend); 92 93 $annot =~ /gene_id \"([^\"]+)\"/ or confess "Error, cannot get gene_id from $annot of line\n$_"; 94 my $gene_id = $1; 95 96 if (my $sn = $gene_id_to_seq_name{$gene_id}) { 97 if ($sn ne $seqname) { 98 $gene_id = "$seqname" . "|" . "$gene_id"; # make unique per scaffold. 99 } 100 } 101 $gene_id_to_seq_name{$gene_id} = $seqname; 102 103 $gene_id_to_source{$gene_id} = $source; 104 105 106 if ($annot =~ /name \"([^\"]+)\"/) { 107 my $name = $1; 108 $gene_id_to_name{$gene_id} = $name; 109 } 110 111 my $gene_name = ""; 112 if ($annot =~ /gene_name \"([^\"]+)\"/) { 113 $gene_name = $1; 114 $gene_id_to_gene_name{$gene_id} = $gene_name; 115 } 116 117 118 # print "gene_id: $gene_id, transcrpt_id: $transcript_id, $type\n"; 119 120 if ($type eq 'transcript' || $type eq 'gene') { next; } # capture by exon coordinates 121 122 my $transcript_id; 123 if ($annot =~ /transcript_id \"([^\"]+)\"/) { 124 $transcript_id = $1; 125 } 126 else { 127 print STDERR "Skipping line: $_, no transcript_id value provided\n"; 128 next; 129 } 130 131 if ($type eq 'CDS' || $type eq 'stop_codon' || $type eq 'start_codon') { 132 push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{CDS}}, [$end5, $end3] ); 133 push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{mRNA}}, [$end5, $end3] ); 134 135 $coding_genes{$gene_id}++; 136 } 137 elsif ($type eq "exon" || $type =~ /UTR/) { 138 push (@{$gene_transcript_data{$seqname}->{$gene_id}->{$transcript_id}->{mRNA}}, [$end5, $end3] ); 139 } 140 elsif ($type =~ /Selenocysteine/) { 141 # no op 142 } 143 else { 144 ## assuming noncoding feature 145 push (@{$noncoding_features{$seqname}->{$type}->{$gene_id}->{$transcript_id}}, [$end5, $end3] ); 146 } 147 148 } 149 close $fh; 150 151 152 ## create gene objects. 153 154 my @top_gene_objs; 155 156 my %seen; 157 foreach my $seqname (keys %gene_transcript_data) { 158 159 160 { 161 ################################## 162 ## Process protein-coding genes: 163 164 my $genes_href = $gene_transcript_data{$seqname}; 165 166 foreach my $gene_id (keys %$genes_href) { 167 168 if ($seen{$gene_id}) { 169 print STDERR ("Error, already saw $gene_id,$seqname as $gene_id,$seen{$gene_id}\nSkipping."); 170 next; 171 } 172 $seen{$gene_id} = $seqname; 173 174 my $transcripts_href = $genes_href->{$gene_id}; 175 176 my $source = $gene_id_to_source{$gene_id}; 177 178 my @gene_objs; 179 180 foreach my $transcript_id (keys %$transcripts_href) { 181 182 my $coord_types_href = $transcripts_href->{$transcript_id}; 183 184 my $CDS_coords_aref = $coord_types_href->{CDS}; 185 my $mRNA_coords_aref = $coord_types_href->{mRNA}; 186 187 188 #print STDERR "Before, CDS: " . Dumper($CDS_coords_aref); 189 #print STDERR "Before, exons: " . Dumper($mRNA_coords_aref); 190 191 192 my $CDS_coords_href = &_join_overlapping_coords($CDS_coords_aref); 193 my $mRNA_coords_href = &_join_overlapping_coords($mRNA_coords_aref); 194 195 #print STDERR "CDS: " . Dumper($CDS_coords_href); 196 #print STDERR "mRNA: " . Dumper ($mRNA_coords_href); 197 198 199 my $gene_obj = new Gene_obj(); 200 $gene_obj->populate_gene_object($CDS_coords_href, $mRNA_coords_href); 201 202 $gene_obj->{TU_feat_name} = $gene_id; 203 $gene_obj->{Model_feat_name} = $transcript_id; 204 if (my $name = $gene_id_to_name{$gene_id}) { 205 $gene_obj->{com_name} = $name; 206 } 207 else { 208 $gene_obj->{com_name} = $transcript_id; 209 } 210 if (my $gene_name = $gene_id_to_gene_name{$gene_id}) { 211 $gene_obj->{gene_name} = $gene_name; 212 } 213 $gene_obj->{asmbl_id} = $seqname; 214 $gene_obj->{source} = $source; 215 216 $gene_obj->join_adjacent_exons(); 217 218 push (@gene_objs, $gene_obj); 219 } 220 221 222 ## want single gene that includes all alt splice variants here 223 if(scalar(@gene_objs)) { 224 my $template_gene_obj = shift @gene_objs; 225 foreach my $other_gene_obj (@gene_objs) { 226 $template_gene_obj->add_isoform($other_gene_obj); 227 } 228 push (@top_gene_objs, $template_gene_obj); 229 230 # print $template_gene_obj->toString(); 231 232 233 } 234 } 235 236 } 237 238 239 { 240 ################################ 241 ## Process noncoding features ## 242 ################################ 243 244 my $ncgene_types_href = $noncoding_features{$seqname}; 245 246 if (ref $ncgene_types_href) { 247 248 foreach my $nc_type (keys %$ncgene_types_href) { 249 250 my $gene_ids_href = $ncgene_types_href->{$nc_type}; 251 foreach my $gene_id (keys %$gene_ids_href) { 252 253 if (exists $coding_genes{$gene_id}) { 254 print STDERR "Warning: Skipping $gene_id ($nc_type) as this gene is already included as a coding gene.\n"; 255 next; 256 } 257 258 my $trans_ids_href = $gene_ids_href->{$gene_id}; 259 foreach my $trans_id (keys %$trans_ids_href) { 260 261 my @coordsets = @{$trans_ids_href->{$trans_id}}; 262 my %coords; 263 foreach my $coordset (@coordsets) { 264 my ($end5, $end3) = @$coordset; 265 $coords{$end5} = $end3; 266 } 267 268 my $gene_obj = new Gene_obj(); 269 270 $gene_obj->populate_gene_object({}, \%coords); 271 $gene_obj->{asmbl_id} = $seqname; 272 $gene_obj->{TU_feat_name} = $gene_id; 273 $gene_obj->{Model_feat_name} = $trans_id; 274 $gene_obj->{gene_type} = $nc_type; 275 if (my $name = $gene_id_to_name{$gene_id}) { 276 $gene_obj->{com_name} = $name; 277 } 278 else { 279 $gene_obj->{com_name} = $trans_id; 280 } 281 282 283 #print STDERR $gene_obj->toString(); 284 285 286 push (@top_gene_objs, $gene_obj); 287 } 288 } 289 } 290 } 291 } 292 } 293 return (\@top_gene_objs); 294} 295 296 297 298#### 299sub _join_overlapping_coords { 300 my $coords_aref = shift; 301 302 unless (ref $coords_aref) { 303 return ({}); 304 } 305 306 my $orient; 307 308 my @coords; 309 310 my $inferred_orient; 311 312 foreach my $coordset (@$coords_aref) { 313 my ($end5, $end3) = @$coordset; 314 315 my $orient; 316 if ($end5 < $end3) { 317 $orient = '+'; 318 } 319 elsif ($end5 > $end3) { 320 $orient = '-'; 321 322 } 323 324 if ( (! defined $inferred_orient) && defined $orient) { 325 $inferred_orient = $orient; 326 } 327 elsif ( defined($orient) && $orient ne $inferred_orient) { 328 die "Error, conflicting orientation info: " . Dumper($coords_aref); 329 } 330 331 my ($lend, $rend) = sort {$a<=>$b} ($end5, $end3); 332 333 push (@coords, [$lend, $rend]); 334 } 335 336 #print STDERR "coords: " . Dumper(@coords); 337 338 my @piles = &Overlap_piler::simple_coordsets_collapser(@coords); 339 340 #print STDERR "piles: " . Dumper(@piles); 341 342 @piles = sort {$a->[0] <=> $b->[0]} @piles; 343 344 ## join adjacent piles 345 my @joined_piles = shift @piles; 346 foreach my $pile (@piles) { 347 my ($pile_lend, $pile_rend) = @$pile; 348 if ($pile_lend == $joined_piles[$#joined_piles]->[1] + 1) { 349 # adjacent 350 $joined_piles[$#joined_piles]->[1] = $pile_rend; 351 } 352 else { 353 push (@joined_piles, $pile); 354 } 355 } 356 357 358 my %new_coords; 359 foreach my $pile (@joined_piles) { 360 my ($lend, $rend) = @$pile; 361 my ($end5, $end3) = ($inferred_orient eq '+') ? ($lend, $rend) : ($rend, $lend); 362 $new_coords{$end5} = $end3; 363 } 364 365 366 return(\%new_coords); 367} 368 369 370 3711; #EOM 372