1#!/usr/local/bin/python3.8 2# 3# This is script was contributed by Philip Ashton (https://github.com/flashton2003) 4# with the intention to convert from the many GFF flavours to the Ensembl GFF3 format 5# supported by bcftools csq. Please expand as necessary and send a pull request. 6# 7# See also 8# https://github.com/samtools/bcftools/issues/530#issuecomment-268278248 9# https://github.com/samtools/bcftools/issues/1078#issuecomment-527484831 10# https://github.com/samtools/bcftools/issues/1208#issuecomment-620642373 11# 12import sys 13import gffutils 14 15class PseudoFeature(): 16 def __init__(self, chrom, start, stop, strand): 17 self.chrom = chrom 18 self.start = start 19 self.stop = stop 20 self.strand = strand 21 22 23class FeatureGroup(): 24 def __init__(self, gene_id): 25 ## see here https://github.com/samtools/bcftools/issues/530#issuecomment-614410603 26 self.id = gene_id 27 ## the below should be lists of instances of Feature class. 28 self.gene = None # one feature group should have a single gene 29 self.transcript = None 30 ## assuming here that ncRNA is more like transcript than exon 31 ## i.e. there is only one ncRNA feature per feature group. 32 self.ncRNA = None 33 self.exons = [] 34 self.CDSs = [] 35 36 def add_gene(self, feature): 37 self.gene = feature 38 39 def add_transcript(self, feature): 40 self.transcript = feature # one feature group should have a single gene 41 42 def add_exon(self, feature): 43 self.exons.append(feature) 44 45 def add_cds(self, feature): 46 self.CDSs.append(feature) 47 48 def add_ncRNA(self, feature): 49 self.ncRNA = feature 50 51 52def get_args(): 53 if len(sys.argv) != 3: 54 print('Usage: gff2gff.py <gff_inhandle> </path/where/you/want/to/save/gffutils_db>') 55 sys.exit() 56 else: 57 return sys.argv[1], sys.argv[2] 58 59def group_features(db): 60 feature_groups = {} 61 for feature in db.all_features(): 62 if feature.featuretype not in ('repeat_region', 'regulatory', 'stem_loop', 'gene_component_region', 'repeat_region'): 63 if feature.featuretype == 'gene': 64 ## for any particular feature group, the gene doesn't necassarily come 65 ## before the CDS in the gff, so need to have the if/else logic 66 gene_id = feature.id.split('.')[0] 67 if gene_id not in feature_groups: 68 feature_groups[gene_id] = FeatureGroup(gene_id) 69 feature_groups[gene_id].add_gene(feature) 70 else: 71 feature_groups[gene_id].add_gene(feature) 72 ## in this gff, the transcripts are referred to as mRNA 73 ## not all genes have mRNA records 74 elif feature.featuretype == 'mRNA': 75 gene_id = feature.id.split('.')[0] 76 # print(feature.id) 77 # print(gene_id) 78 feature_groups[gene_id].add_transcript(feature) 79 elif feature.featuretype == 'exon': 80 gene_id = feature.attributes['locus_tag'][0] 81 if gene_id not in feature_groups: 82 feature_groups[gene_id] = FeatureGroup(gene_id) 83 feature_groups[gene_id].add_exon(feature) 84 else: 85 feature_groups[gene_id].add_exon(feature) 86 87 elif feature.featuretype == 'CDS': 88 gene_id = feature.attributes['locus_tag'][0] 89 if gene_id not in feature_groups: 90 feature_groups[gene_id] = FeatureGroup(gene_id) 91 feature_groups[gene_id].add_cds(feature) 92 else: 93 feature_groups[gene_id].add_cds(feature) 94 95 elif feature.featuretype == 'ncRNA': 96 gene_id = feature.attributes['locus_tag'][0] 97 # print(gene_id) 98 if gene_id not in feature_groups: 99 feature_groups[gene_id] = FeatureGroup(gene_id) 100 feature_groups[gene_id].add_ncRNA(feature) 101 else: 102 feature_groups[gene_id].add_ncRNA(feature) 103 return feature_groups 104 105def make_transcript_pseudofeature(CDSs): 106 ## takes list of CDSs, list len will often be 1, but will sometimes be 2 107 start = min([x.start for x in CDSs]) 108 stop = max([x.stop for x in CDSs]) 109 chrom = CDSs[0].chrom 110 ## check that both on the same strand 111 assert len(set([x.strand for x in CDSs])) == 1 112 strand = CDSs[0].strand 113 pf = PseudoFeature(chrom, start, stop, strand) 114 return pf 115 116 117def check_feature_groups(feature_groups): 118 ## checking that feature group has a gene, at least one CDS, and a transcript 119 ## if doesn't have a transcript (as many don't), then make a pseudo feature corresponding to a transcript 120 ## only checking that have transcript for everything 121 for gene_id in feature_groups: 122 ## don't want to analyse ncRNAs here, but need to include them up 123 ## to this point so that we know that that gene is an ncRNA gene 124 if feature_groups[gene_id].ncRNA != None: 125 continue 126 assert feature_groups[gene_id].gene != None 127 assert len(feature_groups[gene_id].CDSs) > 0 128 if feature_groups[gene_id].transcript == None: 129 ## using pseudofeature because the gffutils feature says it's not really intended 130 ## for direct instantiation by users 131 feature_groups[gene_id].transcript = make_transcript_pseudofeature(feature_groups[gene_id].CDSs) 132 133def print_features(feature_groups): 134 for gene_id in feature_groups: 135 feature_group = feature_groups[gene_id] 136 if feature_group.ncRNA != None: 137 continue 138 print('###') 139 name = feature_group.gene.attributes['Name'][0] 140 gene_attributes = f'ID=gene:{gene_id};Name={name};biotype=protein_coding;gene_id:{gene_id}' 141 print(feature_group.gene.chrom, 'EMBL', 'gene', feature_group.gene.start, feature_group.gene.stop, '.', feature_group.gene.strand, '.', gene_attributes, sep = '\t') 142 143 transcript_attributes = f'ID=transcript:{gene_id};Parent=gene:{gene_id};Name={name};biotype=protein_coding;transcript_id={gene_id}' 144 print(feature_group.transcript.chrom, 'EMBL', 'transcript', feature_group.transcript.start, feature_group.transcript.stop, '.', feature_group.transcript.strand, '.', transcript_attributes, sep = '\t') 145 146 cds_attributes = f'Parent=transcript:{gene_id};Name={name}' 147 148 for c in feature_group.CDSs: 149 print(c.chrom, 'EMBL', 'CDS', c.start, c.stop, '.', c.strand, '0', cds_attributes, sep = '\t') 150 151def main(): 152 ''' 153 Ensembl gff should have one gene and one transcript per "feature group" 154 Then, can have multiple CDS/exons 155 read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially) 156 multiple CDS/exon 157 Exons aren't printed, as not needed bt bcftools csq 158 ''' 159 gff_handle, db_handle = get_args() 160 fn = gffutils.example_filename(gff_handle) 161 db = gffutils.create_db(fn, dbfn=db_handle, force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True) 162 feature_groups = group_features(db) 163 check_feature_groups(feature_groups) 164 print_features(feature_groups) 165 166 167 168## gff input is generated from a genbank file by bp_genbank2gff3.pl 169 170if __name__ == '__main__': 171 main() 172 173