1#!/usr/local/bin/python3.8 2 3import sys 4 5 6#------------------------------------------------------------------------------- 7# Read fasta file 8#------------------------------------------------------------------------------- 9 10def readFasta( fasta ): 11 print(f"Reading FASTA file {fasta}", file=sys.stderr) 12 lineNum = 1 13 chrname = '' 14 chrs = dict() 15 seq = [] # Read sequence as a list of strigs 16 17 for line in open(fasta): 18 if line.startswith(">"): 19 # Add previous sequence, if any 20 if chrname != '': chrs[chrname] = ''.join(seq).upper() 21 chrname = line[1:].strip() 22 if chrname.startswith('chr'): chrname = chrname[3:] 23 print(f"Chromosome '{chrname}'", file=sys.stderr) 24 chrs[chrname] = "" 25 seq = [] 26 else: 27 seq.append( line.rstrip() ) 28 29 # Show something 30 if lineNum % 10000 == 0: 31 sys.stderr.write('.') 32 lineNum += 1 33 34 print("", file=sys.stderr) 35 if chrname != '': chrs[chrname] = ''.join(seq).upper() 36 return chrs 37 38#------------------------------------------------------------------------------- 39# Main 40#------------------------------------------------------------------------------- 41 42# Read fasta file 43fasta = sys.argv[1] 44chrs = readFasta(fasta) 45 46# Read VCF file 47for line in sys.stdin: 48 line = line.rstrip() 49 50 if line.startswith('#'): 51 # Show header 52 print(line) 53 else: 54 # Extract REF field 55 fields = line.split('\t') 56 (chrom, pos, ref) = ( fields[0], fields[1], fields[3] ) 57 58 if chrom in chrs: 59 chrSeq = chrs[chrom] 60 61 # Get coordinates 62 posStart = int(pos) - 1 63 posEnd = posStart + len(ref) 64 65 # Correct 'REF' sequence 66 if posEnd < len(chrSeq): 67 refOri = ref 68 ref = chrSeq[posStart:posEnd] 69 fields[3] = ref 70 else: 71 print(f"Position {pos} not found in chromosome '{chrom}' (chromosome length {len(chrSeq)})", file=sys.stderr) 72 else: 73 print(f"Chromosome '{chrom}' not found", file=sys.stderr) 74 75 # Show corrected line 76 print('\t'.join(fields)) 77