1#!/usr/local/bin/python3.8
2#
3# This is script was contributed by Philip Ashton (https://github.com/flashton2003)
4# with the intention to convert from the many GFF flavours to the Ensembl GFF3 format
5# supported by bcftools csq. Please expand as necessary and send a pull request.
6#
7# See also
8#   https://github.com/samtools/bcftools/issues/530#issuecomment-268278248
9#   https://github.com/samtools/bcftools/issues/1078#issuecomment-527484831
10#   https://github.com/samtools/bcftools/issues/1208#issuecomment-620642373
11#
12import sys
13import gffutils
14
15class PseudoFeature():
16    def __init__(self, chrom, start, stop, strand):
17        self.chrom = chrom
18        self.start = start
19        self.stop = stop
20        self.strand = strand
21
22
23class FeatureGroup():
24    def __init__(self, gene_id):
25        ## see here https://github.com/samtools/bcftools/issues/530#issuecomment-614410603
26        self.id = gene_id
27        ## the below should be lists of instances of Feature class.
28        self.gene = None # one feature group should have a single gene
29        self.transcript = None
30        ## assuming here that ncRNA is more like transcript than exon
31        ## i.e. there is only one ncRNA feature per feature group.
32        self.ncRNA = None
33        self.exons = []
34        self.CDSs = []
35
36    def add_gene(self, feature):
37        self.gene = feature
38
39    def add_transcript(self, feature):
40        self.transcript = feature # one feature group should have a single gene
41
42    def add_exon(self, feature):
43        self.exons.append(feature)
44
45    def add_cds(self, feature):
46        self.CDSs.append(feature)
47
48    def add_ncRNA(self, feature):
49        self.ncRNA = feature
50
51
52def get_args():
53    if len(sys.argv) != 3:
54        print('Usage: gff2gff.py <gff_inhandle> </path/where/you/want/to/save/gffutils_db>')
55        sys.exit()
56    else:
57        return sys.argv[1], sys.argv[2]
58
59def group_features(db):
60    feature_groups = {}
61    for feature in db.all_features():
62        if feature.featuretype not in ('repeat_region', 'regulatory', 'stem_loop', 'gene_component_region', 'repeat_region'):
63            if feature.featuretype == 'gene':
64                ## for any particular feature group, the gene doesn't necassarily come
65                ## before the CDS in the gff, so need to have the if/else logic
66                gene_id = feature.id.split('.')[0]
67                if gene_id not in feature_groups:
68                    feature_groups[gene_id] = FeatureGroup(gene_id)
69                    feature_groups[gene_id].add_gene(feature)
70                else:
71                    feature_groups[gene_id].add_gene(feature)
72            ## in this gff, the transcripts are referred to as mRNA
73            ## not all genes have mRNA records
74            elif feature.featuretype == 'mRNA':
75                gene_id = feature.id.split('.')[0]
76                # print(feature.id)
77                # print(gene_id)
78                feature_groups[gene_id].add_transcript(feature)
79            elif feature.featuretype == 'exon':
80                gene_id = feature.attributes['locus_tag'][0]
81                if gene_id not in feature_groups:
82                    feature_groups[gene_id] = FeatureGroup(gene_id)
83                    feature_groups[gene_id].add_exon(feature)
84                else:
85                    feature_groups[gene_id].add_exon(feature)
86
87            elif feature.featuretype == 'CDS':
88                gene_id = feature.attributes['locus_tag'][0]
89                if gene_id not in feature_groups:
90                    feature_groups[gene_id] = FeatureGroup(gene_id)
91                    feature_groups[gene_id].add_cds(feature)
92                else:
93                    feature_groups[gene_id].add_cds(feature)
94
95            elif feature.featuretype == 'ncRNA':
96                gene_id = feature.attributes['locus_tag'][0]
97                # print(gene_id)
98                if gene_id not in feature_groups:
99                    feature_groups[gene_id] = FeatureGroup(gene_id)
100                    feature_groups[gene_id].add_ncRNA(feature)
101                else:
102                    feature_groups[gene_id].add_ncRNA(feature)
103    return feature_groups
104
105def make_transcript_pseudofeature(CDSs):
106    ## takes list of CDSs, list len will often be 1, but will sometimes be 2
107    start = min([x.start for x in CDSs])
108    stop = max([x.stop for x in CDSs])
109    chrom = CDSs[0].chrom
110    ## check that both on the same strand
111    assert len(set([x.strand for x in CDSs])) == 1
112    strand = CDSs[0].strand
113    pf = PseudoFeature(chrom, start, stop, strand)
114    return pf
115
116
117def check_feature_groups(feature_groups):
118    ## checking that feature group has a gene, at least one CDS, and a transcript
119    ## if doesn't have a transcript (as many don't), then make a pseudo feature corresponding to a transcript
120    ## only checking that have transcript for everything
121    for gene_id in feature_groups:
122        ## don't want to analyse ncRNAs here, but need to include them up
123        ## to this point so that we know that that gene is an ncRNA gene
124        if feature_groups[gene_id].ncRNA != None:
125            continue
126        assert feature_groups[gene_id].gene != None
127        assert len(feature_groups[gene_id].CDSs) > 0
128        if feature_groups[gene_id].transcript == None:
129            ## using pseudofeature because the gffutils feature says it's not really intended
130            ## for direct instantiation by users
131            feature_groups[gene_id].transcript = make_transcript_pseudofeature(feature_groups[gene_id].CDSs)
132
133def print_features(feature_groups):
134    for gene_id in feature_groups:
135        feature_group = feature_groups[gene_id]
136        if feature_group.ncRNA != None:
137            continue
138        print('###')
139        name = feature_group.gene.attributes['Name'][0]
140        gene_attributes = f'ID=gene:{gene_id};Name={name};biotype=protein_coding;gene_id:{gene_id}'
141        print(feature_group.gene.chrom, 'EMBL', 'gene', feature_group.gene.start, feature_group.gene.stop, '.', feature_group.gene.strand, '.', gene_attributes, sep = '\t')
142
143        transcript_attributes = f'ID=transcript:{gene_id};Parent=gene:{gene_id};Name={name};biotype=protein_coding;transcript_id={gene_id}'
144        print(feature_group.transcript.chrom, 'EMBL', 'transcript', feature_group.transcript.start, feature_group.transcript.stop, '.', feature_group.transcript.strand, '.', transcript_attributes, sep = '\t')
145
146        cds_attributes =  f'Parent=transcript:{gene_id};Name={name}'
147
148        for c in feature_group.CDSs:
149            print(c.chrom, 'EMBL', 'CDS', c.start, c.stop, '.', c.strand, '0', cds_attributes, sep = '\t')
150
151def main():
152    '''
153    Ensembl gff should have one gene and one transcript per "feature group"
154    Then, can have multiple CDS/exons
155    read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially)
156    multiple CDS/exon
157    Exons aren't printed, as not needed bt bcftools csq
158    '''
159    gff_handle, db_handle = get_args()
160    fn = gffutils.example_filename(gff_handle)
161    db = gffutils.create_db(fn, dbfn=db_handle, force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True)
162    feature_groups = group_features(db)
163    check_feature_groups(feature_groups)
164    print_features(feature_groups)
165
166
167
168## gff input is generated from a genbank file by bp_genbank2gff3.pl
169
170if __name__ == '__main__':
171    main()
172
173