1# cython: embedsignature=True 2# 3# Code to read, write and edit VCF files 4# 5# VCF lines are encoded as a dictionary with these keys (note: all lowercase): 6# 'chrom': string 7# 'pos': integer 8# 'id': string 9# 'ref': string 10# 'alt': list of strings 11# 'qual': integer 12# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"] 13# 'info': dictionary of values (see below) 14# 'format': list of keys (strings) 15# sample keys: dictionary of values (see below) 16# 17# The sample keys are accessible through vcf.getsamples() 18# 19# A dictionary of values contains value keys (defined in ##INFO or 20# ##FORMAT lines) which map to a list, containing integers, floats, 21# strings, or characters. Missing values are replaced by a particular 22# value, often -1 or . 23# 24# Genotypes are not stored as a string, but as a list of 1 or 3 25# elements (for haploid and diploid samples), the first (and last) the 26# integer representing an allele, and the second the separation 27# character. Note that there is just one genotype per sample, but for 28# consistency the single element is stored in a list. 29# 30# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as 31# (key, value) pairs and are accessible through getheader() 32# 33# The VCF class can be instantiated with a 'regions' variable 34# consisting of tuples (chrom,start,end) encoding 0-based half-open 35# segments. Only variants with a position inside the segment will be 36# parsed. A regions parser is available under parse_regions. 37# 38# When instantiated, a reference can be passed to the VCF class. This 39# may be any class that supports a fetch(chrom, start, end) method. 40# 41# NOTE: the position that is returned to Python is 0-based, NOT 42# 1-based as in the VCF file. 43# NOTE: There is also preliminary VCF functionality in the VariantFile class. 44# 45# TODO: 46# only v4.0 writing is complete; alleles are not converted to v3.3 format 47# 48 49from collections import namedtuple, defaultdict 50from operator import itemgetter 51import sys, re, copy, bisect 52 53from libc.stdlib cimport atoi 54from libc.stdint cimport int8_t, int16_t, int32_t, int64_t 55from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t 56 57cimport pysam.libctabix as libctabix 58cimport pysam.libctabixproxies as libctabixproxies 59 60from pysam.libcutils cimport force_str 61 62import pysam 63 64gtsRegEx = re.compile("[|/\\\\]") 65alleleRegEx = re.compile('^[ACGTN]+$') 66 67# Utility function. Uses 0-based coordinates 68def get_sequence(chrom, start, end, fa): 69 # obtain sequence from .fa file, without truncation 70 if end<=start: return "" 71 if not fa: return "N"*(end-start) 72 if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper() 73 sequence = fa.fetch(chrom, start, end).upper() 74 if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence)) 75 return sequence 76 77# Utility function. Parses a region string 78def parse_regions( string ): 79 result = [] 80 for r in string.split(','): 81 elts = r.split(':') 82 chrom, start, end = elts[0], 0, 3000000000 83 if len(elts)==1: pass 84 elif len(elts)==2: 85 if len(elts[1])>0: 86 ielts = elts[1].split('-') 87 if len(ielts) != 2: ValueError("Don't understand region string '%s'" % r) 88 try: start, end = int(ielts[0])-1, int(ielts[1]) 89 except: raise ValueError("Don't understand region string '%s'" % r) 90 else: 91 raise ValueError("Don't understand region string '%s'" % r) 92 result.append( (chrom,start,end) ) 93 return result 94 95 96FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue') 97 98########################################################################################################### 99# 100# New class 101# 102########################################################################################################### 103 104cdef class VCFRecord(libctabixproxies.TupleProxy): 105 '''vcf record. 106 107 initialized from data and vcf meta 108 ''' 109 110 cdef vcf 111 cdef char * contig 112 cdef uint32_t pos 113 114 def __init__(self, vcf): 115 self.vcf = vcf 116 self.encoding = vcf.encoding 117 118 # if len(data) != len(self.vcf._samples): 119 # self.vcf.error(str(data), 120 # self.BAD_NUMBER_OF_COLUMNS, 121 # "expected %s for %s samples (%s), got %s" % \ 122 # (len(self.vcf._samples), 123 # len(self.vcf._samples), 124 # self.vcf._samples, 125 # len(data))) 126 127 def __cinit__(self, vcf): 128 # start indexed access at genotypes 129 self.offset = 9 130 131 self.vcf = vcf 132 self.encoding = vcf.encoding 133 134 def error(self, line, error, opt=None): 135 '''raise error.''' 136 # pass to vcf file for error handling 137 return self.vcf.error(line, error, opt) 138 139 cdef update(self, char * buffer, size_t nbytes): 140 '''update internal data. 141 142 nbytes does not include the terminal '\0'. 143 ''' 144 libctabixproxies.TupleProxy.update(self, buffer, nbytes) 145 146 self.contig = self.fields[0] 147 # vcf counts from 1 - correct here 148 self.pos = atoi(self.fields[1]) - 1 149 150 def __len__(self): 151 return max(0, self.nfields - 9) 152 153 property contig: 154 def __get__(self): return self.contig 155 156 property pos: 157 def __get__(self): return self.pos 158 159 property id: 160 def __get__(self): return self.fields[2] 161 162 property ref: 163 def __get__(self): 164 return self.fields[3] 165 166 property alt: 167 def __get__(self): 168 # convert v3.3 to v4.0 alleles below 169 alt = self.fields[4] 170 if alt == ".": alt = [] 171 else: alt = alt.upper().split(',') 172 return alt 173 174 property qual: 175 def __get__(self): 176 qual = self.fields[5] 177 if qual == b".": qual = -1 178 else: 179 try: qual = float(qual) 180 except: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL) 181 return qual 182 183 property filter: 184 def __get__(self): 185 f = self.fields[6] 186 # postpone checking that filters exist. Encode missing filter or no filtering as empty list 187 if f == b"." or f == b"PASS" or f == b"0": return [] 188 else: return f.split(';') 189 190 property info: 191 def __get__(self): 192 col = self.fields[7] 193 # dictionary of keys, and list of values 194 info = {} 195 if col != b".": 196 for blurp in col.split(';'): 197 elts = blurp.split('=') 198 if len(elts) == 1: v = None 199 elif len(elts) == 2: v = elts[1] 200 else: self.vcf.error(str(self),self.ERROR_INFO_STRING) 201 info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self.vcf)) 202 return info 203 204 property format: 205 def __get__(self): 206 return self.fields[8].split(':') 207 208 property samples: 209 def __get__(self): 210 return self.vcf._samples 211 212 def __getitem__(self, key): 213 214 # parse sample columns 215 values = self.fields[self.vcf._sample2column[key]].split(':') 216 alt = self.alt 217 format = self.format 218 219 if len(values) > len(format): 220 self.vcf.error(str(self.line),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\ 221 (len(values),key,len(format))) 222 223 result = {} 224 for idx in range(len(format)): 225 expected = self.vcf.get_expected(format[idx], self.vcf._format, alt) 226 if idx < len(values): value = values[idx] 227 else: 228 if expected == -1: value = "." 229 else: value = ",".join(["."]*expected) 230 231 result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data)) 232 if expected != -1 and len(result[format[idx]]) != expected: 233 self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS, 234 "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]])) 235 if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]])) 236 result[format[idx]] = result[format[idx]][:expected] 237 238 return result 239 240 241cdef class asVCFRecord(libctabix.Parser): 242 '''converts a :term:`tabix row` into a VCF record.''' 243 cdef vcffile 244 def __init__(self, vcffile): 245 self.vcffile = vcffile 246 247 cdef parse(self, char * buffer, int len): 248 cdef VCFRecord r 249 r = VCFRecord(self.vcffile) 250 r.copy(buffer, len) 251 return r 252 253class VCF(object): 254 255 # types 256 NT_UNKNOWN = 0 257 NT_NUMBER = 1 258 NT_ALLELES = 2 259 NT_NR_ALLELES = 3 260 NT_GENOTYPES = 4 261 NT_PHASED_GENOTYPES = 5 262 263 _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier", 264 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string", 265 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s", 266 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)", 267 4:"POS_NOT_NUMERICAL:Position column is not numerical", 268 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field", 269 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF", 270 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF", 271 8:"POS_NOT_POSITIVE:Position field must be >0", 272 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'", 273 10:"ERROR_INFO_STRING:Error while parsing info field", 274 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)", 275 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s", 276 13:"ERROR_FORMAT_NOT_CHAR:Eexpected character in formatted field; got string", 277 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header", 278 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header", 279 16:"BAD_NUMBER_OF_VALUES:Found too many of values in sample column (%s)", 280 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)", 281 18:"BAD_GENOTYPE:Cannot parse genotype (%s)", 282 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)", 283 20:"MISSING_REF:Reference allele missing", 284 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)", 285 22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not deliminted by angular brackets", 286 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes", 287 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields", 288 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs", 289 26:"WRONG_REF:Wrong reference %s", 290 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data", 291 28:"BAD_CHR_TAG:Error calculating chr tag for %s", 292 29:"ZERO_LENGTH_ALLELE:Found zero-length allele", 293 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base", 294 31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'", 295 32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s", 296 33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value", 297 } 298 299 # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields 300 _header = [] 301 302 # version number; 33=v3.3; 40=v4.0 303 _version = 40 304 305 # info, filter and format data 306 _info = {} 307 _filter = {} 308 _format = {} 309 310 # header; and required columns 311 _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] 312 _samples = [] 313 314 # control behaviour 315 _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ERROR_ZERO_FOR_NON_FLAG_FIELD 316 _warn_errors = set([]) 317 _leftalign = False 318 319 # reference sequence 320 _reference = None 321 322 # regions to include; None includes everything 323 _regions = None 324 325 # statefull stuff 326 _lineno = -1 327 _line = None 328 _lines = None 329 330 def __init__(self, _copy=None, reference=None, regions=None, 331 lines=None, leftalign=False): 332 # make error identifiers accessible by name 333 for id in self._errors.keys(): 334 self.__dict__[self._errors[id].split(':')[0]] = id 335 if _copy != None: 336 self._leftalign = _copy._leftalign 337 self._header = _copy._header[:] 338 self._version = _copy._version 339 self._info = copy.deepcopy(_copy._info) 340 self._filter = copy.deepcopy(_copy._filter) 341 self._format = copy.deepcopy(_copy._format) 342 self._samples = _copy._samples[:] 343 self._sample2column = copy.deepcopy(_copy._sample2column) 344 self._ignored_errors = copy.deepcopy(_copy._ignored_errors) 345 self._warn_errors = copy.deepcopy(_copy._warn_errors) 346 self._reference = _copy._reference 347 self._regions = _copy._regions 348 if reference: self._reference = reference 349 if regions: self._regions = regions 350 if leftalign: self._leftalign = leftalign 351 self._lines = lines 352 self.encoding = "ascii" 353 self.tabixfile = None 354 355 def error(self,line,error,opt=None): 356 if error in self._ignored_errors: return 357 errorlabel, errorstring = self._errors[error].split(':') 358 if opt: errorstring = errorstring % opt 359 errwarn = ["Error","Warning"][error in self._warn_errors] 360 errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring) 361 if error in self._warn_errors: return 362 raise ValueError(errorstring) 363 364 def parse_format(self,line,format,filter=False): 365 if self._version == 40: 366 if not format.startswith('<'): 367 self.error(line,self.V40_MISSING_ANGLE_BRACKETS) 368 format = "<"+format 369 if not format.endswith('>'): 370 self.error(line,self.V40_MISSING_ANGLE_BRACKETS) 371 format += ">" 372 format = format[1:-1] 373 data = {'id':None,'number':None,'type':None,'descr':None} 374 idx = 0 375 while len(format.strip())>0: 376 elts = format.strip().split(',') 377 first, rest = elts[0], ','.join(elts[1:]) 378 if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')): 379 if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS) 380 if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) 381 first = ["ID=","Number=","Type=","Description="][idx] + first 382 if first.startswith('ID='): data['id'] = first.split('=')[1] 383 elif first.startswith('Number='): data['number'] = first.split('=')[1] 384 elif first.startswith('Type='): data['type'] = first.split('=')[1] 385 elif first.startswith('Description='): 386 elts = format.split('"') 387 if len(elts)<3: 388 self.error(line,self.FORMAT_MISSING_QUOTES) 389 elts = first.split('=') + [rest] 390 data['descr'] = elts[1] 391 rest = '"'.join(elts[2:]) 392 if rest.startswith(','): rest = rest[1:] 393 else: 394 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) 395 format = rest 396 idx += 1 397 if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings 398 if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) 399 if 'descr' not in data: 400 # missing description 401 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) 402 data['descr'] = "" 403 if not data['type'] and not data['number']: 404 # fine, ##filter format 405 return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.') 406 if not data['type'] in ["Integer","Float","Character","String","Flag"]: 407 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) 408 # I would like a missing-value field, but it isn't there 409 if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value 410 else: data['missing'] = '.' 411 if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) 412 try: 413 n = int(data['number']) 414 t = self.NT_NUMBER 415 except ValueError: 416 n = -1 417 if data['number'] == '.': t = self.NT_UNKNOWN 418 elif data['number'] == '#alleles': t = self.NT_ALLELES 419 elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES 420 elif data['number'] == '#genotypes': t = self.NT_GENOTYPES 421 elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES 422 elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES 423 # abbreviations added in VCF version v4.1 424 elif data['number'] == 'A': t = self.NT_ALLELES 425 elif data['number'] == 'G': t = self.NT_GENOTYPES 426 else: 427 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING) 428 # if number is 0 - type must be Flag 429 if n == 0 and data['type'] != 'Flag': 430 self.error( line, self.ZERO_FOR_NON_FLAG_FIELD) 431 # force type 'Flag' if no number 432 data['type'] = 'Flag' 433 434 return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing']) 435 436 def format_format( self, fmt, filter=False ): 437 values = [('ID',fmt.id)] 438 if fmt.number != None and not filter: 439 if fmt.numbertype == self.NT_UNKNOWN: nmb = "." 440 elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number) 441 elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles" 442 elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles" 443 elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes" 444 elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes" 445 else: 446 raise ValueError("Unknown number type encountered: %s" % fmt.numbertype) 447 values.append( ('Number',nmb) ) 448 values.append( ('Type', fmt.type) ) 449 values.append( ('Description', '"' + fmt.description + '"') ) 450 if self._version == 33: 451 format = ",".join([v for k,v in values]) 452 else: 453 format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">" 454 return format 455 456 def get_expected(self, format, formatdict, alt): 457 fmt = formatdict[format] 458 if fmt.numbertype == self.NT_UNKNOWN: return -1 459 if fmt.numbertype == self.NT_NUMBER: return fmt.number 460 if fmt.numbertype == self.NT_ALLELES: return len(alt)+1 461 if fmt.numbertype == self.NT_NR_ALLELES: return len(alt) 462 if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2 463 if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1) 464 return 0 465 466 467 def _add_definition(self, formatdict, key, data, line ): 468 if key in formatdict: return 469 self.error(line,self.ERROR_UNKNOWN_KEY,key) 470 if data == None: 471 formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".") 472 return 473 if data == []: data = [""] # unsure what type -- say string 474 if type(data[0]) == type(0.0): 475 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None) 476 return 477 if type(data[0]) == type(0): 478 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None) 479 return 480 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".") 481 482 483 # todo: trim trailing missing values 484 def format_formatdata( self, data, format, key=True, value=True, separator=":" ): 485 output, sdata = [], [] 486 if type(data) == type([]): # for FORMAT field, make data with dummy values 487 d = {} 488 for k in data: d[k] = [] 489 data = d 490 # convert missing values; and silently add definitions if required 491 for k in data: 492 self._add_definition( format, k, data[k], "(output)" ) 493 for idx,v in enumerate(data[k]): 494 if v == format[k].missingvalue: data[k][idx] = "." 495 # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string 496 for k in data: 497 if k != 'GT': sdata.append( (k,data[k]) ) 498 sdata.sort() 499 if 'GT' in data: 500 sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata 501 for k,v in sdata: 502 if v == []: v = None 503 if key and value: 504 if v != None: output.append( k+"="+','.join(map(str,v)) ) 505 else: output.append( k ) 506 elif key: output.append(k) 507 elif value: 508 if v != None: output.append( ','.join(map(str,v)) ) 509 else: output.append( "." ) # should not happen 510 # snip off trailing missing data 511 while len(output) > 1: 512 last = output[-1].replace(',','').replace('.','') 513 if len(last)>0: break 514 output = output[:-1] 515 return separator.join(output) 516 517 518 def enter_default_format(self): 519 for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'), 520 FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1), 521 FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'), 522 FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'), 523 FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'), 524 FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1), 525 FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'), 526 FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'), 527 FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'), 528 FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid 529 FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'), 530 FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1), 531 FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allel counts',-1), 532 FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1), 533 ]: 534 if f.id not in self._format: 535 self._format[f.id] = f 536 537 def parse_header(self, line): 538 539 assert line.startswith('##') 540 elts = line[2:].split('=') 541 key = elts[0].strip() 542 value = '='.join(elts[1:]).strip() 543 if key == "fileformat": 544 if value == "VCFv3.3": 545 self._version = 33 546 elif value == "VCFv4.0": 547 self._version = 40 548 elif value == "VCFv4.1": 549 # AH - for testing 550 self._version = 40 551 elif value == "VCFv4.2": 552 # AH - for testing 553 self._version = 40 554 else: 555 self.error(line,self.UNKNOWN_FORMAT_STRING) 556 elif key == "INFO": 557 f = self.parse_format(line, value) 558 self._info[ f.id ] = f 559 elif key == "FILTER": 560 f = self.parse_format(line, value, filter=True) 561 self._filter[ f.id ] = f 562 elif key == "FORMAT": 563 f = self.parse_format(line, value) 564 self._format[ f.id ] = f 565 else: 566 # keep other keys in the header field 567 self._header.append( (key,value) ) 568 569 570 def write_header( self, stream ): 571 stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10)) 572 for key,value in self._header: stream.write("##%s=%s\n" % (key,value)) 573 for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]: 574 for f in var.itervalues(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER")))) 575 576 577 def parse_heading( self, line ): 578 assert line.startswith('#') 579 assert not line.startswith('##') 580 headings = line[1:].split('\t') 581 # test for 8, as FORMAT field might be missing 582 if len(headings)==1 and len(line[1:].split()) >= 8: 583 self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS) 584 headings = line[1:].split() 585 586 for i,s in enumerate(self._required): 587 588 if len(headings)<=i or headings[i] != s: 589 590 if len(headings) <= i: 591 err = "(%sth entry not found)" % (i+1) 592 else: 593 err = "(found %s, expected %s)" % (headings[i],s) 594 595 #self.error(line,self.BADLY_FORMATTED_HEADING,err) 596 # allow FORMAT column to be absent 597 if len(headings) == 8: 598 headings.append("FORMAT") 599 else: 600 self.error(line,self.BADLY_FORMATTED_HEADING,err) 601 602 self._samples = headings[9:] 603 self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] ) 604 605 def write_heading( self, stream ): 606 stream.write("#" + "\t".join(self._required + self._samples) + "\n") 607 608 def convertGT(self, GTstring): 609 if GTstring == ".": return ["."] 610 try: 611 gts = gtsRegEx.split(GTstring) 612 if len(gts) == 1: return [int(gts[0])] 613 if len(gts) != 2: raise ValueError() 614 if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]] 615 return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])] 616 except ValueError: 617 self.error(self._line,self.BAD_GENOTYPE,GTstring) 618 return [".","|","."] 619 620 def convertGTback(self, GTdata): 621 return ''.join(map(str,GTdata)) 622 623 def parse_formatdata( self, key, value, formatdict, line ): 624 # To do: check that the right number of values is present 625 f = formatdict.get(key,None) 626 if f == None: 627 self._add_definition(formatdict, key, value, line ) 628 f = formatdict[key] 629 if f.type == "Flag": 630 if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE) 631 return [] 632 values = value.split(',') 633 # deal with trailing data in some early VCF files 634 if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1: 635 self.error(line,self.ERROR_TRAILING_DATA,values[-1]) 636 values[-1] = values[-1].split(';')[0] 637 if f.type == "Integer": 638 for idx,v in enumerate(values): 639 try: 640 if v == ".": values[idx] = f.missingvalue 641 else: values[idx] = int(v) 642 except: 643 self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values))) 644 return [0] * len(values) 645 return values 646 elif f.type == "String": 647 self._line = line 648 if f.id == "GT": values = list(map( self.convertGT, values )) 649 return values 650 elif f.type == "Character": 651 for v in values: 652 if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR) 653 return values 654 elif f.type == "Float": 655 for idx,v in enumerate(values): 656 if v == ".": values[idx] = f.missingvalue 657 try: return list(map(float,values)) 658 except: 659 self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values))) 660 return [0.0] * len(values) 661 else: 662 # can't happen 663 self.error(line,self.ERROR_INFO_STRING) 664 665 def inregion(self, chrom, pos): 666 if not self._regions: return True 667 for r in self._regions: 668 if r[0] == chrom and r[1] <= pos < r[2]: return True 669 return False 670 671 def parse_data( self, line, lineparse=False ): 672 cols = line.split('\t') 673 if len(cols) != len(self._samples)+9: 674 # gracefully deal with absent FORMAT column 675 # and those missing samples 676 if len(cols) == 8: 677 cols.append("") 678 else: 679 self.error(line, 680 self.BAD_NUMBER_OF_COLUMNS, 681 "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols))) 682 683 chrom = cols[0] 684 685 # get 0-based position 686 try: pos = int(cols[1])-1 687 except: self.error(line,self.POS_NOT_NUMERICAL) 688 if pos < 0: self.error(line,self.POS_NOT_POSITIVE) 689 690 # implement filtering 691 if not self.inregion(chrom,pos): return None 692 693 # end of first-pass parse for sortedVCF 694 if lineparse: return chrom, pos, line 695 696 id = cols[2] 697 698 ref = cols[3].upper() 699 if ref == ".": 700 self.error(line,self.MISSING_REF) 701 if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) 702 else: ref = "" 703 else: 704 for c in ref: 705 if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF) 706 if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference) 707 708 # make sure reference is sane 709 if self._reference: 710 left = max(0,pos-100) 711 faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference) 712 faref = faref_leftflank[pos-left:] 713 if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref)) 714 ref = faref 715 716 # convert v3.3 to v4.0 alleles below 717 if cols[4] == ".": alt = [] 718 else: alt = cols[4].upper().split(',') 719 720 if cols[5] == ".": qual = -1 721 else: 722 try: qual = float(cols[5]) 723 except: self.error(line,self.QUAL_NOT_NUMERICAL) 724 725 # postpone checking that filters exist. Encode missing filter or no filtering as empty list 726 if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = [] 727 else: filter = cols[6].split(';') 728 729 # dictionary of keys, and list of values 730 info = {} 731 if cols[7] != ".": 732 for blurp in cols[7].split(';'): 733 elts = blurp.split('=') 734 if len(elts) == 1: v = None 735 elif len(elts) == 2: v = elts[1] 736 else: self.error(line,self.ERROR_INFO_STRING) 737 info[elts[0]] = self.parse_formatdata(elts[0], 738 v, 739 self._info, 740 line) 741 742 # Gracefully deal with absent FORMAT column 743 if cols[8] == "": format = [] 744 else: format = cols[8].split(':') 745 746 # check: all filters are defined 747 for f in filter: 748 if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f) 749 750 # check: format fields are defined 751 if self._format: 752 for f in format: 753 if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f) 754 755 # convert v3.3 alleles 756 if self._version == 33: 757 if len(ref) != 1: self.error(line,self.V33_BAD_REF) 758 newalts = [] 759 have_deletions = False 760 for a in alt: 761 if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference 762 elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference 763 elif a.startswith('D'): # allow D<seq> and D<num> 764 have_deletions = True 765 try: 766 l = int(a[1:]) # throws ValueError if sequence 767 if len(ref) < l: # add to reference if necessary 768 addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) 769 ref += addns 770 for i,na in enumerate(newalts): newalts[i] = na+addns 771 a = ref[l:] # new deletion, deleting pos...pos+l 772 except ValueError: 773 s = a[1:] 774 if len(ref) < len(s): # add Ns to reference if necessary 775 addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) 776 if not s.endswith(addns) and addns != 'N'*len(addns): 777 self.error(line,self.V33_UNMATCHED_DELETION, 778 "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) 779 ref += addns 780 for i,na in enumerate(newalts): newalts[i] = na+addns 781 a = ref[len(s):] # new deletion, deleting from pos 782 else: 783 self.error(line,self.V33_BAD_ALLELE) 784 newalts.append(a) 785 alt = newalts 786 # deletion alleles exist, add dummy 1st reference allele, and account for leading base 787 if have_deletions: 788 if pos == 0: 789 # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1 790 addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) 791 ref += addn 792 alt = [allele+addn for allele in alt] 793 else: 794 addn = get_sequence(chrom,pos-1,pos,self._reference) 795 ref = addn + ref 796 alt = [addn + allele for allele in alt] 797 pos -= 1 798 else: 799 # format v4.0 -- just check for nucleotides 800 for allele in alt: 801 if not alleleRegEx.match(allele): 802 self.error(line,self.V40_BAD_ALLELE,allele) 803 804 # check for leading nucleotide in indel calls 805 for allele in alt: 806 if len(allele) != len(ref): 807 if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE) 808 if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): 809 self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE) 810 811 # trim trailing bases in alleles 812 # AH: not certain why trimming this needs to be added 813 # disabled now for unit testing 814 # if alt: 815 # for i in range(1,min(len(ref),min(map(len,alt)))): 816 # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): 817 # break 818 # ref, alt = ref[:-1], [allele[:-1] for allele in alt] 819 820 # left-align alleles, if a reference is available 821 if self._leftalign and self._reference: 822 while left < pos: 823 movable = True 824 for allele in alt: 825 if len(allele) > len(ref): 826 longest, shortest = allele, ref 827 else: 828 longest, shortest = ref, allele 829 if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): 830 movable = False 831 if longest[-1].upper() != longest[len(shortest)-1].upper(): 832 movable = False 833 if not movable: 834 break 835 ref = ref[:-1] 836 alt = [allele[:-1] for allele in alt] 837 if min([len(allele) for allele in alt]) == 0 or len(ref) == 0: 838 ref = faref_leftflank[pos-left-1] + ref 839 alt = [faref_leftflank[pos-left-1] + allele for allele in alt] 840 pos -= 1 841 842 # parse sample columns 843 samples = [] 844 for sample in cols[9:]: 845 dict = {} 846 values = sample.split(':') 847 if len(values) > len(format): 848 self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format))) 849 for idx in range(len(format)): 850 expected = self.get_expected(format[idx], self._format, alt) 851 if idx < len(values): value = values[idx] 852 else: 853 if expected == -1: value = "." 854 else: value = ",".join(["."]*expected) 855 856 dict[format[idx]] = self.parse_formatdata(format[idx], 857 value, 858 self._format, 859 line) 860 if expected != -1 and len(dict[format[idx]]) != expected: 861 self.error(line,self.BAD_NUMBER_OF_PARAMETERS, 862 "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]])) 863 if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]])) 864 dict[format[idx]] = dict[format[idx]][:expected] 865 samples.append( dict ) 866 867 # done 868 d = {'chrom':chrom, 869 'pos':pos, # return 0-based position 870 'id':id, 871 'ref':ref, 872 'alt':alt, 873 'qual':qual, 874 'filter':filter, 875 'info':info, 876 'format':format} 877 for key,value in zip(self._samples,samples): 878 d[key] = value 879 880 return d 881 882 883 def write_data(self, stream, data): 884 required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples 885 for k in required: 886 if k not in data: raise ValueError("Required key %s not found in data" % str(k)) 887 if data['alt'] == []: alt = "." 888 else: alt = ",".join(data['alt']) 889 if data['filter'] == None: filter = "." 890 elif data['filter'] == []: 891 if self._version == 33: filter = "0" 892 else: filter = "PASS" 893 else: filter = ';'.join(data['filter']) 894 if data['qual'] == -1: qual = "." 895 else: qual = str(data['qual']) 896 897 output = [data['chrom'], 898 str(data['pos']+1), # change to 1-based position 899 data['id'], 900 data['ref'], 901 alt, 902 qual, 903 filter, 904 self.format_formatdata( 905 data['info'], self._info, separator=";"), 906 self.format_formatdata( 907 data['format'], self._format, value=False)] 908 909 for s in self._samples: 910 output.append(self.format_formatdata( 911 data[s], self._format, key=False)) 912 913 stream.write( "\t".join(output) + "\n" ) 914 915 def _parse_header(self, stream): 916 self._lineno = 0 917 for line in stream: 918 line = force_str(line, self.encoding) 919 self._lineno += 1 920 if line.startswith('##'): 921 self.parse_header(line.strip()) 922 elif line.startswith('#'): 923 self.parse_heading(line.strip()) 924 self.enter_default_format() 925 else: 926 break 927 return line 928 929 def _parse(self, line, stream): 930 # deal with files with header only 931 if line.startswith("##"): return 932 if len(line.strip()) > 0: 933 d = self.parse_data( line.strip() ) 934 if d: yield d 935 for line in stream: 936 self._lineno += 1 937 if self._lines and self._lineno > self._lines: raise StopIteration 938 d = self.parse_data( line.strip() ) 939 if d: yield d 940 941 ###################################################################################################### 942 # 943 # API follows 944 # 945 ###################################################################################################### 946 947 def getsamples(self): 948 """ List of samples in VCF file """ 949 return self._samples 950 951 def setsamples(self,samples): 952 """ List of samples in VCF file """ 953 self._samples = samples 954 955 def getheader(self): 956 """ List of header key-value pairs (strings) """ 957 return self._header 958 959 def setheader(self,header): 960 """ List of header key-value pairs (strings) """ 961 self._header = header 962 963 def getinfo(self): 964 """ Dictionary of ##INFO tags, as VCF.FORMAT values """ 965 return self._info 966 967 def setinfo(self,info): 968 """ Dictionary of ##INFO tags, as VCF.FORMAT values """ 969 self._info = info 970 971 def getformat(self): 972 """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ 973 return self._format 974 975 def setformat(self,format): 976 """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """ 977 self._format = format 978 979 def getfilter(self): 980 """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ 981 return self._filter 982 983 def setfilter(self,filter): 984 """ Dictionary of ##FILTER tags, as VCF.FORMAT values """ 985 self._filter = filter 986 987 def setversion(self, version): 988 if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files") 989 self._version = version 990 991 def setregions(self, regions): 992 self._regions = regions 993 994 def setreference(self, ref): 995 """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """ 996 self._reference = ref 997 998 def ignoreerror(self, errorstring): 999 try: self._ignored_errors.add(self.__dict__[errorstring]) 1000 except KeyError: raise ValueError("Invalid error string: %s" % errorstring) 1001 1002 def warnerror(self, errorstring): 1003 try: self._warn_errors.add(self.__dict__[errorstring]) 1004 except KeyError: raise ValueError("Invalid error string: %s" % errorstring) 1005 1006 def parse(self, stream): 1007 """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """ 1008 last_line = self._parse_header(stream) 1009 # now return a generator that does the actual work. In this way the pre-processing is done 1010 # before the first piece of data is yielded 1011 return self._parse(last_line, stream) 1012 1013 def write(self, stream, datagenerator): 1014 """ Writes a VCF file to a stream, using a data generator (or list) """ 1015 self.write_header(stream) 1016 self.write_heading(stream) 1017 for data in datagenerator: self.write_data(stream,data) 1018 1019 def writeheader(self, stream): 1020 """ Writes a VCF header """ 1021 self.write_header(stream) 1022 self.write_heading(stream) 1023 1024 def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2): 1025 """ Utility function: compares two calls for equality """ 1026 # a variant should always be assigned to a unique position, one base before 1027 # the leftmost position of the alignment gap. If this rule is implemented 1028 # correctly, the two positions must be equal for the calls to be identical. 1029 if pos1 != pos2: return False 1030 # from both calls, trim rightmost bases when identical. Do this safely, i.e. 1031 # only when the reference bases are not Ns 1032 while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]: 1033 ref1 = ref1[:-1] 1034 alt1 = alt1[:-1] 1035 while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]: 1036 ref2 = ref2[:-1] 1037 alt2 = alt2[:-1] 1038 # now, the alternative alleles must be identical 1039 return alt1 == alt2 1040 1041########################################################################################################### 1042########################################################################################################### 1043## API functions added by Andreas 1044########################################################################################################### 1045 1046 def connect(self, filename, encoding="ascii"): 1047 '''connect to tabix file.''' 1048 self.encoding=encoding 1049 self.tabixfile = pysam.Tabixfile(filename, encoding=encoding) 1050 self._parse_header(self.tabixfile.header) 1051 1052 def __del__(self): 1053 self.close() 1054 self.tabixfile = None 1055 1056 def close(self): 1057 if self.tabixfile: 1058 self.tabixfile.close() 1059 self.tabixfile = None 1060 1061 def fetch(self, 1062 reference=None, 1063 start=None, 1064 end=None, 1065 region=None ): 1066 """ Parse a stream of VCF-formatted lines. 1067 Initializes class instance and return generator """ 1068 return self.tabixfile.fetch( 1069 reference, 1070 start, 1071 end, 1072 region, 1073 parser = asVCFRecord(self)) 1074 1075 def validate(self, record): 1076 '''validate vcf record. 1077 1078 returns a validated record. 1079 ''' 1080 1081 raise NotImplementedError("needs to be checked") 1082 1083 chrom, pos = record.chrom, record.pos 1084 1085 # check reference 1086 ref = record.ref 1087 if ref == ".": 1088 self.error(str(record),self.MISSING_REF) 1089 if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference) 1090 else: ref = "" 1091 else: 1092 for c in ref: 1093 if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF) 1094 if "N" in ref: ref = get_sequence(chrom, 1095 pos, 1096 pos+len(ref), 1097 self._reference) 1098 1099 # make sure reference is sane 1100 if self._reference: 1101 left = max(0,self.pos-100) 1102 faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference) 1103 faref = faref_leftflank[pos-left:] 1104 if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref)) 1105 ref = faref 1106 1107 # check: format fields are defined 1108 for f in record.format: 1109 if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f) 1110 1111 # check: all filters are defined 1112 for f in record.filter: 1113 if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f) 1114 1115 # convert v3.3 alleles 1116 if self._version == 33: 1117 if len(ref) != 1: self.error(str(record),self.V33_BAD_REF) 1118 newalts = [] 1119 have_deletions = False 1120 for a in alt: 1121 if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference 1122 elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference 1123 elif a.startswith('D'): # allow D<seq> and D<num> 1124 have_deletions = True 1125 try: 1126 l = int(a[1:]) # throws ValueError if sequence 1127 if len(ref) < l: # add to reference if necessary 1128 addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference) 1129 ref += addns 1130 for i,na in enumerate(newalts): newalts[i] = na+addns 1131 a = ref[l:] # new deletion, deleting pos...pos+l 1132 except ValueError: 1133 s = a[1:] 1134 if len(ref) < len(s): # add Ns to reference if necessary 1135 addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference) 1136 if not s.endswith(addns) and addns != 'N'*len(addns): 1137 self.error(str(record),self.V33_UNMATCHED_DELETION, 1138 "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference))) 1139 ref += addns 1140 for i,na in enumerate(newalts): newalts[i] = na+addns 1141 a = ref[len(s):] # new deletion, deleting from pos 1142 else: 1143 self.error(str(record),self.V33_BAD_ALLELE) 1144 newalts.append(a) 1145 alt = newalts 1146 # deletion alleles exist, add dummy 1st reference allele, and account for leading base 1147 if have_deletions: 1148 if pos == 0: 1149 # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1 1150 addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference) 1151 ref += addn 1152 alt = [allele+addn for allele in alt] 1153 else: 1154 addn = get_sequence(chrom,pos-1,pos,self._reference) 1155 ref = addn + ref 1156 alt = [addn + allele for allele in alt] 1157 pos -= 1 1158 else: 1159 # format v4.0 -- just check for nucleotides 1160 for allele in alt: 1161 if not alleleRegEx.match(allele): 1162 self.error(str(record),self.V40_BAD_ALLELE,allele) 1163 1164 1165 # check for leading nucleotide in indel calls 1166 for allele in alt: 1167 if len(allele) != len(ref): 1168 if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE) 1169 if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper(): 1170 self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE) 1171 1172 # trim trailing bases in alleles 1173 # AH: not certain why trimming this needs to be added 1174 # disabled now for unit testing 1175 # for i in range(1,min(len(ref),min(map(len,alt)))): 1176 # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper(): 1177 # break 1178 # ref, alt = ref[:-1], [allele[:-1] for allele in alt] 1179 1180 # left-align alleles, if a reference is available 1181 if self._leftalign and self._reference: 1182 while left < pos: 1183 movable = True 1184 for allele in alt: 1185 if len(allele) > len(ref): 1186 longest, shortest = allele, ref 1187 else: 1188 longest, shortest = ref, allele 1189 if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper(): 1190 movable = False 1191 if longest[-1].upper() != longest[len(shortest)-1].upper(): 1192 movable = False 1193 if not movable: 1194 break 1195 ref = ref[:-1] 1196 alt = [allele[:-1] for allele in alt] 1197 if min([len(allele) for allele in alt]) == 0 or len(ref) == 0: 1198 ref = faref_leftflank[pos-left-1] + ref 1199 alt = [faref_leftflank[pos-left-1] + allele for allele in alt] 1200 pos -= 1 1201 1202__all__ = [ 1203 "VCF", "VCFRecord", ] 1204