1# cython: embedsignature=True
2#
3# Code to read, write and edit VCF files
4#
5# VCF lines are encoded as a dictionary with these keys (note: all lowercase):
6# 'chrom':  string
7# 'pos':    integer
8# 'id':     string
9# 'ref':    string
10# 'alt':    list of strings
11# 'qual':   integer
12# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
13# 'info':   dictionary of values (see below)
14# 'format': list of keys (strings)
15# sample keys: dictionary of values (see below)
16#
17# The sample keys are accessible through vcf.getsamples()
18#
19# A dictionary of values contains value keys (defined in ##INFO or
20# ##FORMAT lines) which map to a list, containing integers, floats,
21# strings, or characters.  Missing values are replaced by a particular
22# value, often -1 or .
23#
24# Genotypes are not stored as a string, but as a list of 1 or 3
25# elements (for haploid and diploid samples), the first (and last) the
26# integer representing an allele, and the second the separation
27# character.  Note that there is just one genotype per sample, but for
28# consistency the single element is stored in a list.
29#
30# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as
31# (key, value) pairs and are accessible through getheader()
32#
33# The VCF class can be instantiated with a 'regions' variable
34# consisting of tuples (chrom,start,end) encoding 0-based half-open
35# segments.  Only variants with a position inside the segment will be
36# parsed.  A regions parser is available under parse_regions.
37#
38# When instantiated, a reference can be passed to the VCF class.  This
39# may be any class that supports a fetch(chrom, start, end) method.
40#
41# NOTE: the position that is returned to Python is 0-based, NOT
42# 1-based as in the VCF file.
43# NOTE: There is also preliminary VCF functionality in the VariantFile class.
44#
45# TODO:
46#  only v4.0 writing is complete; alleles are not converted to v3.3 format
47#
48
49from collections import namedtuple, defaultdict
50from operator import itemgetter
51import sys, re, copy, bisect
52
53from libc.stdlib cimport atoi
54from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
55from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
56
57cimport pysam.libctabix as libctabix
58cimport pysam.libctabixproxies as libctabixproxies
59
60from pysam.libcutils cimport force_str
61
62import pysam
63
64gtsRegEx = re.compile("[|/\\\\]")
65alleleRegEx = re.compile('^[ACGTN]+$')
66
67# Utility function.  Uses 0-based coordinates
68def get_sequence(chrom, start, end, fa):
69    # obtain sequence from .fa file, without truncation
70    if end<=start: return ""
71    if not fa: return "N"*(end-start)
72    if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper()
73    sequence = fa.fetch(chrom, start, end).upper()
74    if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence))
75    return sequence
76
77# Utility function.  Parses a region string
78def parse_regions( string ):
79    result = []
80    for r in string.split(','):
81        elts = r.split(':')
82        chrom, start, end = elts[0], 0, 3000000000
83        if len(elts)==1: pass
84        elif len(elts)==2:
85            if len(elts[1])>0:
86                ielts = elts[1].split('-')
87                if len(ielts) != 2: ValueError("Don't understand region string '%s'" % r)
88                try:    start, end = int(ielts[0])-1, int(ielts[1])
89                except: raise ValueError("Don't understand region string '%s'" % r)
90        else:
91            raise ValueError("Don't understand region string '%s'" % r)
92        result.append( (chrom,start,end) )
93    return result
94
95
96FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue')
97
98###########################################################################################################
99#
100# New class
101#
102###########################################################################################################
103
104cdef class VCFRecord(libctabixproxies.TupleProxy):
105    '''vcf record.
106
107    initialized from data and vcf meta
108    '''
109
110    cdef vcf
111    cdef char * contig
112    cdef uint32_t pos
113
114    def __init__(self, vcf):
115        self.vcf = vcf
116        self.encoding = vcf.encoding
117
118        # if len(data) != len(self.vcf._samples):
119        #     self.vcf.error(str(data),
120        #                self.BAD_NUMBER_OF_COLUMNS,
121        #                "expected %s for %s samples (%s), got %s" % \
122        #                    (len(self.vcf._samples),
123        #                     len(self.vcf._samples),
124        #                     self.vcf._samples,
125        #                     len(data)))
126
127    def __cinit__(self, vcf):
128        # start indexed access at genotypes
129        self.offset = 9
130
131        self.vcf = vcf
132        self.encoding = vcf.encoding
133
134    def error(self, line, error, opt=None):
135        '''raise error.'''
136        # pass to vcf file for error handling
137        return self.vcf.error(line, error, opt)
138
139    cdef update(self, char * buffer, size_t nbytes):
140        '''update internal data.
141
142        nbytes does not include the terminal '\0'.
143        '''
144        libctabixproxies.TupleProxy.update(self, buffer, nbytes)
145
146        self.contig = self.fields[0]
147        # vcf counts from 1 - correct here
148        self.pos = atoi(self.fields[1]) - 1
149
150    def __len__(self):
151        return max(0, self.nfields - 9)
152
153    property contig:
154        def __get__(self): return self.contig
155
156    property pos:
157        def __get__(self): return self.pos
158
159    property id:
160        def __get__(self): return self.fields[2]
161
162    property ref:
163        def __get__(self):
164            return self.fields[3]
165
166    property alt:
167        def __get__(self):
168            # convert v3.3 to v4.0 alleles below
169            alt = self.fields[4]
170            if alt == ".": alt = []
171            else: alt = alt.upper().split(',')
172            return alt
173
174    property qual:
175        def __get__(self):
176            qual = self.fields[5]
177            if qual == b".": qual = -1
178            else:
179                try:    qual = float(qual)
180                except: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL)
181            return qual
182
183    property filter:
184        def __get__(self):
185            f = self.fields[6]
186            # postpone checking that filters exist.  Encode missing filter or no filtering as empty list
187            if f == b"." or f == b"PASS" or f == b"0": return []
188            else: return f.split(';')
189
190    property info:
191        def __get__(self):
192            col = self.fields[7]
193            # dictionary of keys, and list of values
194            info = {}
195            if col != b".":
196                for blurp in col.split(';'):
197                    elts = blurp.split('=')
198                    if len(elts) == 1: v = None
199                    elif len(elts) == 2: v = elts[1]
200                    else: self.vcf.error(str(self),self.ERROR_INFO_STRING)
201                    info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self.vcf))
202            return info
203
204    property format:
205         def __get__(self):
206             return self.fields[8].split(':')
207
208    property samples:
209        def __get__(self):
210            return self.vcf._samples
211
212    def __getitem__(self, key):
213
214        # parse sample columns
215        values = self.fields[self.vcf._sample2column[key]].split(':')
216        alt = self.alt
217        format = self.format
218
219        if len(values) > len(format):
220            self.vcf.error(str(self.line),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\
221                           (len(values),key,len(format)))
222
223        result = {}
224        for idx in range(len(format)):
225            expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
226            if idx < len(values): value = values[idx]
227            else:
228                if expected == -1: value = "."
229                else: value = ",".join(["."]*expected)
230
231            result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
232            if expected != -1 and len(result[format[idx]]) != expected:
233                self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS,
234                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
235                if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
236                result[format[idx]] = result[format[idx]][:expected]
237
238        return result
239
240
241cdef class asVCFRecord(libctabix.Parser):
242    '''converts a :term:`tabix row` into a VCF record.'''
243    cdef vcffile
244    def __init__(self, vcffile):
245        self.vcffile = vcffile
246
247    cdef parse(self, char * buffer, int len):
248        cdef VCFRecord r
249        r = VCFRecord(self.vcffile)
250        r.copy(buffer, len)
251        return r
252
253class VCF(object):
254
255    # types
256    NT_UNKNOWN = 0
257    NT_NUMBER = 1
258    NT_ALLELES = 2
259    NT_NR_ALLELES = 3
260    NT_GENOTYPES = 4
261    NT_PHASED_GENOTYPES = 5
262
263    _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
264                1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
265                2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
266                3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
267                4:"POS_NOT_NUMERICAL:Position column is not numerical",
268                5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
269                6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
270                7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
271                8:"POS_NOT_POSITIVE:Position field must be >0",
272                9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
273               10:"ERROR_INFO_STRING:Error while parsing info field",
274               11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
275               12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
276               13:"ERROR_FORMAT_NOT_CHAR:Eexpected character in formatted field; got string",
277               14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
278               15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
279               16:"BAD_NUMBER_OF_VALUES:Found too many of values in sample column (%s)",
280               17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
281               18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
282               19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
283               20:"MISSING_REF:Reference allele missing",
284               21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
285               22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not deliminted by angular brackets",
286               23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
287               24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
288               25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
289               26:"WRONG_REF:Wrong reference %s",
290               27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
291               28:"BAD_CHR_TAG:Error calculating chr tag for %s",
292               29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
293               30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
294               31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
295               32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
296               33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
297                }
298
299    # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
300    _header = []
301
302    # version number; 33=v3.3; 40=v4.0
303    _version = 40
304
305    # info, filter and format data
306    _info = {}
307    _filter = {}
308    _format = {}
309
310    # header; and required columns
311    _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
312    _samples = []
313
314    # control behaviour
315    _ignored_errors = set([11,31])   # ERROR_UNKNOWN_KEY, ERROR_ZERO_FOR_NON_FLAG_FIELD
316    _warn_errors = set([])
317    _leftalign = False
318
319    # reference sequence
320    _reference = None
321
322    # regions to include; None includes everything
323    _regions = None
324
325    # statefull stuff
326    _lineno = -1
327    _line = None
328    _lines = None
329
330    def __init__(self, _copy=None, reference=None, regions=None,
331                 lines=None, leftalign=False):
332        # make error identifiers accessible by name
333        for id in self._errors.keys():
334            self.__dict__[self._errors[id].split(':')[0]] = id
335        if _copy != None:
336            self._leftalign = _copy._leftalign
337            self._header = _copy._header[:]
338            self._version = _copy._version
339            self._info = copy.deepcopy(_copy._info)
340            self._filter = copy.deepcopy(_copy._filter)
341            self._format = copy.deepcopy(_copy._format)
342            self._samples = _copy._samples[:]
343            self._sample2column = copy.deepcopy(_copy._sample2column)
344            self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
345            self._warn_errors = copy.deepcopy(_copy._warn_errors)
346            self._reference = _copy._reference
347            self._regions = _copy._regions
348        if reference: self._reference = reference
349        if regions: self._regions = regions
350        if leftalign: self._leftalign = leftalign
351        self._lines = lines
352        self.encoding = "ascii"
353        self.tabixfile = None
354
355    def error(self,line,error,opt=None):
356        if error in self._ignored_errors: return
357        errorlabel, errorstring = self._errors[error].split(':')
358        if opt: errorstring = errorstring % opt
359        errwarn = ["Error","Warning"][error in self._warn_errors]
360        errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)
361        if error in self._warn_errors: return
362        raise ValueError(errorstring)
363
364    def parse_format(self,line,format,filter=False):
365        if self._version == 40:
366            if not format.startswith('<'):
367                self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
368                format = "<"+format
369            if not format.endswith('>'):
370                self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
371                format += ">"
372            format = format[1:-1]
373        data = {'id':None,'number':None,'type':None,'descr':None}
374        idx = 0
375        while len(format.strip())>0:
376            elts = format.strip().split(',')
377            first, rest = elts[0], ','.join(elts[1:])
378            if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')):
379                if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
380                if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
381                first = ["ID=","Number=","Type=","Description="][idx] + first
382            if first.startswith('ID='):            data['id'] = first.split('=')[1]
383            elif first.startswith('Number='):      data['number'] = first.split('=')[1]
384            elif first.startswith('Type='):        data['type'] = first.split('=')[1]
385            elif first.startswith('Description='):
386                elts = format.split('"')
387                if len(elts)<3:
388                    self.error(line,self.FORMAT_MISSING_QUOTES)
389                    elts = first.split('=') + [rest]
390                data['descr'] = elts[1]
391                rest = '"'.join(elts[2:])
392                if rest.startswith(','): rest = rest[1:]
393            else:
394                self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
395            format = rest
396            idx += 1
397            if filter and idx==1: idx=3  # skip number and type fields for FILTER format strings
398        if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
399        if 'descr' not in data:
400            # missing description
401            self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
402            data['descr'] = ""
403        if not data['type'] and not data['number']:
404            # fine, ##filter format
405            return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.')
406        if not data['type'] in ["Integer","Float","Character","String","Flag"]:
407            self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
408        # I would like a missing-value field, but it isn't there
409        if data['type'] in ['Integer','Float']: data['missing'] = None    # Do NOT use arbitrary int/float as missing value
410        else:                                   data['missing'] = '.'
411        if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
412        try:
413            n = int(data['number'])
414            t = self.NT_NUMBER
415        except ValueError:
416            n = -1
417            if data['number'] == '.':                   t = self.NT_UNKNOWN
418            elif data['number'] == '#alleles':          t = self.NT_ALLELES
419            elif data['number'] == '#nonref_alleles':   t = self.NT_NR_ALLELES
420            elif data['number'] == '#genotypes':        t = self.NT_GENOTYPES
421            elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
422            elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
423            # abbreviations added in VCF version v4.1
424            elif data['number'] == 'A': t = self.NT_ALLELES
425            elif data['number'] == 'G': t = self.NT_GENOTYPES
426            else:
427                self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
428        # if number is 0 - type must be Flag
429        if n == 0 and data['type'] != 'Flag':
430            self.error( line, self.ZERO_FOR_NON_FLAG_FIELD)
431            # force type 'Flag' if no number
432            data['type'] = 'Flag'
433
434        return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing'])
435
436    def format_format( self, fmt, filter=False ):
437        values = [('ID',fmt.id)]
438        if fmt.number != None and not filter:
439            if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
440            elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
441            elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
442            elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
443            elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
444            elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
445            else:
446                raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
447            values.append( ('Number',nmb) )
448            values.append( ('Type', fmt.type) )
449        values.append( ('Description', '"' + fmt.description + '"') )
450        if self._version == 33:
451            format = ",".join([v for k,v in values])
452        else:
453            format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">"
454        return format
455
456    def get_expected(self, format, formatdict, alt):
457        fmt = formatdict[format]
458        if fmt.numbertype == self.NT_UNKNOWN: return -1
459        if fmt.numbertype == self.NT_NUMBER: return fmt.number
460        if fmt.numbertype == self.NT_ALLELES: return len(alt)+1
461        if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
462        if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2
463        if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1)
464        return 0
465
466
467    def _add_definition(self, formatdict, key, data, line ):
468        if key in formatdict: return
469        self.error(line,self.ERROR_UNKNOWN_KEY,key)
470        if data == None:
471            formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".")
472            return
473        if data == []: data = [""]             # unsure what type -- say string
474        if type(data[0]) == type(0.0):
475            formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None)
476            return
477        if type(data[0]) == type(0):
478            formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None)
479            return
480        formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".")
481
482
483    # todo: trim trailing missing values
484    def format_formatdata( self, data, format, key=True, value=True, separator=":" ):
485        output, sdata = [], []
486        if type(data) == type([]): # for FORMAT field, make data with dummy values
487            d = {}
488            for k in data: d[k] = []
489            data = d
490        # convert missing values; and silently add definitions if required
491        for k in data:
492            self._add_definition( format, k, data[k], "(output)" )
493            for idx,v in enumerate(data[k]):
494                if v == format[k].missingvalue: data[k][idx] = "."
495        # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
496        for k in data:
497            if k != 'GT': sdata.append( (k,data[k]) )
498        sdata.sort()
499        if 'GT' in data:
500            sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata
501        for k,v in sdata:
502            if v == []: v = None
503            if key and value:
504                if v != None: output.append( k+"="+','.join(map(str,v)) )
505                else: output.append( k )
506            elif key: output.append(k)
507            elif value:
508                if v != None: output.append( ','.join(map(str,v)) )
509                else: output.append( "." )                    # should not happen
510        # snip off trailing missing data
511        while len(output) > 1:
512            last = output[-1].replace(',','').replace('.','')
513            if len(last)>0: break
514            output = output[:-1]
515        return separator.join(output)
516
517
518    def enter_default_format(self):
519        for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'),
520                  FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1),
521                  FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'),
522                  FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
523                  FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
524                  FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1),
525                  FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'),
526                  FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'),
527                  FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'),
528                  FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1),    # unknown number, since may be haploid
529                  FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'),
530                  FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1),
531                  FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allel counts',-1),
532                  FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1),
533                  ]:
534            if f.id not in self._format:
535                self._format[f.id] = f
536
537    def parse_header(self, line):
538
539        assert line.startswith('##')
540        elts = line[2:].split('=')
541        key = elts[0].strip()
542        value = '='.join(elts[1:]).strip()
543        if key == "fileformat":
544            if value == "VCFv3.3":
545                self._version = 33
546            elif value == "VCFv4.0":
547                self._version = 40
548            elif value == "VCFv4.1":
549                # AH - for testing
550                self._version = 40
551            elif value == "VCFv4.2":
552                # AH - for testing
553                self._version = 40
554            else:
555                self.error(line,self.UNKNOWN_FORMAT_STRING)
556        elif key == "INFO":
557            f = self.parse_format(line, value)
558            self._info[ f.id ] = f
559        elif key == "FILTER":
560            f = self.parse_format(line, value, filter=True)
561            self._filter[ f.id ] = f
562        elif key == "FORMAT":
563            f = self.parse_format(line, value)
564            self._format[ f.id ] = f
565        else:
566            # keep other keys in the header field
567            self._header.append( (key,value) )
568
569
570    def write_header( self, stream ):
571        stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
572        for key,value in self._header: stream.write("##%s=%s\n" % (key,value))
573        for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]:
574            for f in var.itervalues(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER"))))
575
576
577    def parse_heading( self, line ):
578        assert line.startswith('#')
579        assert not line.startswith('##')
580        headings = line[1:].split('\t')
581        # test for 8, as FORMAT field might be missing
582        if len(headings)==1 and len(line[1:].split()) >= 8:
583            self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS)
584            headings = line[1:].split()
585
586        for i,s in enumerate(self._required):
587
588            if len(headings)<=i or headings[i] != s:
589
590                if len(headings) <= i:
591                    err = "(%sth entry not found)" % (i+1)
592                else:
593                    err = "(found %s, expected %s)" % (headings[i],s)
594
595                #self.error(line,self.BADLY_FORMATTED_HEADING,err)
596                # allow FORMAT column to be absent
597                if len(headings) == 8:
598                    headings.append("FORMAT")
599                else:
600                    self.error(line,self.BADLY_FORMATTED_HEADING,err)
601
602        self._samples = headings[9:]
603        self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] )
604
605    def write_heading( self, stream ):
606        stream.write("#" + "\t".join(self._required + self._samples) + "\n")
607
608    def convertGT(self, GTstring):
609        if GTstring == ".": return ["."]
610        try:
611            gts = gtsRegEx.split(GTstring)
612            if len(gts) == 1: return [int(gts[0])]
613            if len(gts) != 2: raise ValueError()
614            if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]]
615            return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])]
616        except ValueError:
617            self.error(self._line,self.BAD_GENOTYPE,GTstring)
618            return [".","|","."]
619
620    def convertGTback(self, GTdata):
621        return ''.join(map(str,GTdata))
622
623    def parse_formatdata( self, key, value, formatdict, line ):
624        # To do: check that the right number of values is present
625        f = formatdict.get(key,None)
626        if f == None:
627            self._add_definition(formatdict, key, value, line )
628            f = formatdict[key]
629        if f.type == "Flag":
630            if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE)
631            return []
632        values = value.split(',')
633        # deal with trailing data in some early VCF files
634        if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1:
635            self.error(line,self.ERROR_TRAILING_DATA,values[-1])
636            values[-1] = values[-1].split(';')[0]
637        if f.type == "Integer":
638            for idx,v in enumerate(values):
639                try:
640                    if v == ".": values[idx] = f.missingvalue
641                    else:        values[idx] = int(v)
642                except:
643                    self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values)))
644                    return [0] * len(values)
645            return values
646        elif f.type == "String":
647            self._line = line
648            if f.id == "GT": values = list(map( self.convertGT, values ))
649            return values
650        elif f.type == "Character":
651            for v in values:
652                if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR)
653            return values
654        elif f.type == "Float":
655            for idx,v in enumerate(values):
656                if v == ".": values[idx] = f.missingvalue
657            try: return list(map(float,values))
658            except:
659                self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values)))
660                return [0.0] * len(values)
661        else:
662            # can't happen
663            self.error(line,self.ERROR_INFO_STRING)
664
665    def inregion(self, chrom, pos):
666        if not self._regions: return True
667        for r in self._regions:
668            if r[0] == chrom and r[1] <= pos < r[2]: return True
669        return False
670
671    def parse_data( self, line, lineparse=False ):
672        cols = line.split('\t')
673        if len(cols) != len(self._samples)+9:
674            # gracefully deal with absent FORMAT column
675            # and those missing samples
676            if len(cols) == 8:
677                cols.append("")
678            else:
679                self.error(line,
680                           self.BAD_NUMBER_OF_COLUMNS,
681                           "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols)))
682
683        chrom = cols[0]
684
685        # get 0-based position
686        try:    pos = int(cols[1])-1
687        except: self.error(line,self.POS_NOT_NUMERICAL)
688        if pos < 0: self.error(line,self.POS_NOT_POSITIVE)
689
690        # implement filtering
691        if not self.inregion(chrom,pos): return None
692
693        # end of first-pass parse for sortedVCF
694        if lineparse: return chrom, pos, line
695
696        id = cols[2]
697
698        ref = cols[3].upper()
699        if ref == ".":
700            self.error(line,self.MISSING_REF)
701            if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
702            else:                   ref = ""
703        else:
704            for c in ref:
705                if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF)
706            if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference)
707
708        # make sure reference is sane
709        if self._reference:
710            left = max(0,pos-100)
711            faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
712            faref = faref_leftflank[pos-left:]
713            if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
714            ref = faref
715
716        # convert v3.3 to v4.0 alleles below
717        if cols[4] == ".": alt = []
718        else: alt = cols[4].upper().split(',')
719
720        if cols[5] == ".": qual = -1
721        else:
722            try:    qual = float(cols[5])
723            except: self.error(line,self.QUAL_NOT_NUMERICAL)
724
725        # postpone checking that filters exist.  Encode missing filter or no filtering as empty list
726        if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
727        else: filter = cols[6].split(';')
728
729        # dictionary of keys, and list of values
730        info = {}
731        if cols[7] != ".":
732            for blurp in cols[7].split(';'):
733                elts = blurp.split('=')
734                if len(elts) == 1: v = None
735                elif len(elts) == 2: v = elts[1]
736                else: self.error(line,self.ERROR_INFO_STRING)
737                info[elts[0]] = self.parse_formatdata(elts[0],
738                                                      v,
739                                                      self._info,
740                                                      line)
741
742        # Gracefully deal with absent FORMAT column
743        if cols[8] == "": format = []
744        else: format = cols[8].split(':')
745
746        # check: all filters are defined
747        for f in filter:
748            if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f)
749
750        # check: format fields are defined
751        if self._format:
752            for f in format:
753                if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f)
754
755        # convert v3.3 alleles
756        if self._version == 33:
757            if len(ref) != 1: self.error(line,self.V33_BAD_REF)
758            newalts = []
759            have_deletions = False
760            for a in alt:
761                if len(a) == 1: a = a + ref[1:]                       # SNP; add trailing reference
762                elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:]  # insertion just beyond pos; add first and trailing reference
763                elif a.startswith('D'): # allow D<seq> and D<num>
764                    have_deletions = True
765                    try:
766                        l = int(a[1:])          # throws ValueError if sequence
767                        if len(ref) < l:        # add to reference if necessary
768                            addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
769                            ref += addns
770                            for i,na in enumerate(newalts): newalts[i] = na+addns
771                        a = ref[l:]             # new deletion, deleting pos...pos+l
772                    except ValueError:
773                        s = a[1:]
774                        if len(ref) < len(s):   # add Ns to reference if necessary
775                            addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
776                            if not s.endswith(addns) and addns != 'N'*len(addns):
777                                self.error(line,self.V33_UNMATCHED_DELETION,
778                                           "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
779                            ref += addns
780                            for i,na in enumerate(newalts): newalts[i] = na+addns
781                        a = ref[len(s):]        # new deletion, deleting from pos
782                else:
783                    self.error(line,self.V33_BAD_ALLELE)
784                newalts.append(a)
785            alt = newalts
786            # deletion alleles exist, add dummy 1st reference allele, and account for leading base
787            if have_deletions:
788                if pos == 0:
789                    # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
790                    addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
791                    ref += addn
792                    alt = [allele+addn for allele in alt]
793                else:
794                    addn = get_sequence(chrom,pos-1,pos,self._reference)
795                    ref = addn + ref
796                    alt = [addn + allele for allele in alt]
797                    pos -= 1
798        else:
799            # format v4.0 -- just check for nucleotides
800            for allele in alt:
801                if not alleleRegEx.match(allele):
802                    self.error(line,self.V40_BAD_ALLELE,allele)
803
804        # check for leading nucleotide in indel calls
805        for allele in alt:
806            if len(allele) != len(ref):
807                if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE)
808                if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
809                    self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE)
810
811        # trim trailing bases in alleles
812        # AH: not certain why trimming this needs to be added
813        #     disabled now for unit testing
814        # if alt:
815        #     for i in range(1,min(len(ref),min(map(len,alt)))):
816        #         if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
817        #             break
818        #         ref, alt = ref[:-1], [allele[:-1] for allele in alt]
819
820        # left-align alleles, if a reference is available
821        if self._leftalign and self._reference:
822            while left < pos:
823                movable = True
824                for allele in alt:
825                    if len(allele) > len(ref):
826                        longest, shortest = allele, ref
827                    else:
828                        longest, shortest = ref, allele
829                    if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
830                        movable = False
831                    if longest[-1].upper() != longest[len(shortest)-1].upper():
832                        movable = False
833                if not movable:
834                    break
835                ref = ref[:-1]
836                alt = [allele[:-1] for allele in alt]
837                if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
838                    ref = faref_leftflank[pos-left-1] + ref
839                    alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
840                    pos -= 1
841
842        # parse sample columns
843        samples = []
844        for sample in cols[9:]:
845            dict = {}
846            values = sample.split(':')
847            if len(values) > len(format):
848                self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format)))
849            for idx in range(len(format)):
850                expected = self.get_expected(format[idx], self._format, alt)
851                if idx < len(values): value = values[idx]
852                else:
853                    if expected == -1: value = "."
854                    else: value = ",".join(["."]*expected)
855
856                dict[format[idx]] = self.parse_formatdata(format[idx],
857                                                          value,
858                                                          self._format,
859                                                          line)
860                if expected != -1 and len(dict[format[idx]]) != expected:
861                    self.error(line,self.BAD_NUMBER_OF_PARAMETERS,
862                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]]))
863                    if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]]))
864                    dict[format[idx]] = dict[format[idx]][:expected]
865            samples.append( dict )
866
867        # done
868        d = {'chrom':chrom,
869             'pos':pos,      # return 0-based position
870             'id':id,
871             'ref':ref,
872             'alt':alt,
873             'qual':qual,
874             'filter':filter,
875             'info':info,
876             'format':format}
877        for key,value in zip(self._samples,samples):
878            d[key] = value
879
880        return d
881
882
883    def write_data(self, stream, data):
884        required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples
885        for k in required:
886            if k not in data: raise ValueError("Required key %s not found in data" % str(k))
887        if data['alt'] == []: alt = "."
888        else: alt = ",".join(data['alt'])
889        if data['filter'] == None: filter = "."
890        elif data['filter'] == []:
891            if self._version == 33: filter = "0"
892            else: filter = "PASS"
893        else: filter = ';'.join(data['filter'])
894        if data['qual'] == -1: qual = "."
895        else: qual = str(data['qual'])
896
897        output = [data['chrom'],
898                  str(data['pos']+1),   # change to 1-based position
899                  data['id'],
900                  data['ref'],
901                  alt,
902                  qual,
903                  filter,
904                  self.format_formatdata(
905                      data['info'], self._info, separator=";"),
906                  self.format_formatdata(
907                      data['format'], self._format, value=False)]
908
909        for s in self._samples:
910            output.append(self.format_formatdata(
911                data[s], self._format, key=False))
912
913        stream.write( "\t".join(output) + "\n" )
914
915    def _parse_header(self, stream):
916        self._lineno = 0
917        for line in stream:
918            line = force_str(line, self.encoding)
919            self._lineno += 1
920            if line.startswith('##'):
921                self.parse_header(line.strip())
922            elif line.startswith('#'):
923                self.parse_heading(line.strip())
924                self.enter_default_format()
925            else:
926                break
927        return line
928
929    def _parse(self, line, stream):
930        # deal with files with header only
931        if line.startswith("##"): return
932        if len(line.strip()) > 0:
933            d = self.parse_data( line.strip() )
934            if d: yield d
935        for line in stream:
936            self._lineno += 1
937            if self._lines and self._lineno > self._lines: raise StopIteration
938            d = self.parse_data( line.strip() )
939            if d: yield d
940
941    ######################################################################################################
942    #
943    # API follows
944    #
945    ######################################################################################################
946
947    def getsamples(self):
948        """ List of samples in VCF file """
949        return self._samples
950
951    def setsamples(self,samples):
952        """ List of samples in VCF file """
953        self._samples = samples
954
955    def getheader(self):
956        """ List of header key-value pairs (strings) """
957        return self._header
958
959    def setheader(self,header):
960        """ List of header key-value pairs (strings) """
961        self._header = header
962
963    def getinfo(self):
964        """ Dictionary of ##INFO tags, as VCF.FORMAT values """
965        return self._info
966
967    def setinfo(self,info):
968        """ Dictionary of ##INFO tags, as VCF.FORMAT values """
969        self._info = info
970
971    def getformat(self):
972        """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
973        return self._format
974
975    def setformat(self,format):
976        """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
977        self._format = format
978
979    def getfilter(self):
980        """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
981        return self._filter
982
983    def setfilter(self,filter):
984        """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
985        self._filter = filter
986
987    def setversion(self, version):
988        if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
989        self._version = version
990
991    def setregions(self, regions):
992        self._regions = regions
993
994    def setreference(self, ref):
995        """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """
996        self._reference = ref
997
998    def ignoreerror(self, errorstring):
999        try:             self._ignored_errors.add(self.__dict__[errorstring])
1000        except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
1001
1002    def warnerror(self, errorstring):
1003        try:             self._warn_errors.add(self.__dict__[errorstring])
1004        except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
1005
1006    def parse(self, stream):
1007        """ Parse a stream of VCF-formatted lines.  Initializes class instance and return generator """
1008        last_line = self._parse_header(stream)
1009        # now return a generator that does the actual work.  In this way the pre-processing is done
1010        # before the first piece of data is yielded
1011        return self._parse(last_line, stream)
1012
1013    def write(self, stream, datagenerator):
1014        """ Writes a VCF file to a stream, using a data generator (or list) """
1015        self.write_header(stream)
1016        self.write_heading(stream)
1017        for data in datagenerator: self.write_data(stream,data)
1018
1019    def writeheader(self, stream):
1020        """ Writes a VCF header """
1021        self.write_header(stream)
1022        self.write_heading(stream)
1023
1024    def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
1025        """ Utility function: compares two calls for equality """
1026        # a variant should always be assigned to a unique position, one base before
1027        # the leftmost position of the alignment gap.  If this rule is implemented
1028        # correctly, the two positions must be equal for the calls to be identical.
1029        if pos1 != pos2: return False
1030        # from both calls, trim rightmost bases when identical.  Do this safely, i.e.
1031        # only when the reference bases are not Ns
1032        while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
1033            ref1 = ref1[:-1]
1034            alt1 = alt1[:-1]
1035        while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
1036            ref2 = ref2[:-1]
1037            alt2 = alt2[:-1]
1038        # now, the alternative alleles must be identical
1039        return alt1 == alt2
1040
1041###########################################################################################################
1042###########################################################################################################
1043## API functions added by Andreas
1044###########################################################################################################
1045
1046    def connect(self, filename, encoding="ascii"):
1047        '''connect to tabix file.'''
1048        self.encoding=encoding
1049        self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
1050        self._parse_header(self.tabixfile.header)
1051
1052    def __del__(self):
1053        self.close()
1054        self.tabixfile = None
1055
1056    def close(self):
1057        if self.tabixfile:
1058            self.tabixfile.close()
1059            self.tabixfile = None
1060
1061    def fetch(self,
1062              reference=None,
1063              start=None,
1064              end=None,
1065              region=None ):
1066        """ Parse a stream of VCF-formatted lines.
1067        Initializes class instance and return generator """
1068        return self.tabixfile.fetch(
1069            reference,
1070            start,
1071            end,
1072            region,
1073            parser = asVCFRecord(self))
1074
1075    def validate(self, record):
1076        '''validate vcf record.
1077
1078        returns a validated record.
1079        '''
1080
1081        raise NotImplementedError("needs to be checked")
1082
1083        chrom, pos = record.chrom, record.pos
1084
1085        # check reference
1086        ref = record.ref
1087        if ref == ".":
1088            self.error(str(record),self.MISSING_REF)
1089            if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
1090            else:                   ref = ""
1091        else:
1092            for c in ref:
1093                if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF)
1094                if "N" in ref: ref = get_sequence(chrom,
1095                                                  pos,
1096                                                  pos+len(ref),
1097                                                  self._reference)
1098
1099        # make sure reference is sane
1100        if self._reference:
1101            left = max(0,self.pos-100)
1102            faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference)
1103            faref = faref_leftflank[pos-left:]
1104            if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
1105            ref = faref
1106
1107        # check: format fields are defined
1108        for f in record.format:
1109            if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f)
1110
1111        # check: all filters are defined
1112        for f in record.filter:
1113            if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f)
1114
1115        # convert v3.3 alleles
1116        if self._version == 33:
1117            if len(ref) != 1: self.error(str(record),self.V33_BAD_REF)
1118            newalts = []
1119            have_deletions = False
1120            for a in alt:
1121                if len(a) == 1: a = a + ref[1:]                       # SNP; add trailing reference
1122                elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:]  # insertion just beyond pos; add first and trailing reference
1123                elif a.startswith('D'): # allow D<seq> and D<num>
1124                    have_deletions = True
1125                    try:
1126                        l = int(a[1:])          # throws ValueError if sequence
1127                        if len(ref) < l:        # add to reference if necessary
1128                            addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
1129                            ref += addns
1130                            for i,na in enumerate(newalts): newalts[i] = na+addns
1131                        a = ref[l:]             # new deletion, deleting pos...pos+l
1132                    except ValueError:
1133                        s = a[1:]
1134                        if len(ref) < len(s):   # add Ns to reference if necessary
1135                            addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
1136                            if not s.endswith(addns) and addns != 'N'*len(addns):
1137                                self.error(str(record),self.V33_UNMATCHED_DELETION,
1138                                           "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
1139                            ref += addns
1140                            for i,na in enumerate(newalts): newalts[i] = na+addns
1141                        a = ref[len(s):]        # new deletion, deleting from pos
1142                else:
1143                    self.error(str(record),self.V33_BAD_ALLELE)
1144                newalts.append(a)
1145            alt = newalts
1146            # deletion alleles exist, add dummy 1st reference allele, and account for leading base
1147            if have_deletions:
1148                if pos == 0:
1149                    # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
1150                    addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
1151                    ref += addn
1152                    alt = [allele+addn for allele in alt]
1153                else:
1154                    addn = get_sequence(chrom,pos-1,pos,self._reference)
1155                    ref = addn + ref
1156                    alt = [addn + allele for allele in alt]
1157                    pos -= 1
1158        else:
1159            # format v4.0 -- just check for nucleotides
1160            for allele in alt:
1161                if not alleleRegEx.match(allele):
1162                    self.error(str(record),self.V40_BAD_ALLELE,allele)
1163
1164
1165        # check for leading nucleotide in indel calls
1166        for allele in alt:
1167            if len(allele) != len(ref):
1168                if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE)
1169                if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
1170                    self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE)
1171
1172        # trim trailing bases in alleles
1173        # AH: not certain why trimming this needs to be added
1174        #     disabled now for unit testing
1175        # for i in range(1,min(len(ref),min(map(len,alt)))):
1176        #     if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
1177        #         break
1178        #     ref, alt = ref[:-1], [allele[:-1] for allele in alt]
1179
1180        # left-align alleles, if a reference is available
1181        if self._leftalign and self._reference:
1182            while left < pos:
1183                movable = True
1184                for allele in alt:
1185                    if len(allele) > len(ref):
1186                        longest, shortest = allele, ref
1187                    else:
1188                        longest, shortest = ref, allele
1189                    if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
1190                        movable = False
1191                    if longest[-1].upper() != longest[len(shortest)-1].upper():
1192                        movable = False
1193                if not movable:
1194                    break
1195                ref = ref[:-1]
1196                alt = [allele[:-1] for allele in alt]
1197                if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
1198                    ref = faref_leftflank[pos-left-1] + ref
1199                    alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
1200                    pos -= 1
1201
1202__all__ = [
1203    "VCF", "VCFRecord", ]
1204