1#!/usr/local/bin/python3.8
2
3import sys
4
5
6#-------------------------------------------------------------------------------
7# Read fasta file
8#-------------------------------------------------------------------------------
9
10def readFasta( fasta ):
11	print(f"Reading FASTA file {fasta}", file=sys.stderr)
12	lineNum = 1
13	chrname = ''
14	chrs = dict()
15	seq = []	# Read sequence as a list of strigs
16
17	for line in open(fasta):
18		if line.startswith(">"):
19			# Add previous sequence, if any
20			if chrname != '':	chrs[chrname] = ''.join(seq).upper()
21			chrname = line[1:].strip()
22			if chrname.startswith('chr'): chrname = chrname[3:]
23			print(f"Chromosome '{chrname}'", file=sys.stderr)
24			chrs[chrname] = ""
25			seq = []
26		else:
27			seq.append( line.rstrip() )
28
29		# Show something
30		if lineNum % 10000 == 0:
31			sys.stderr.write('.')
32		lineNum += 1
33
34	print("", file=sys.stderr)
35	if chrname != '':   chrs[chrname] = ''.join(seq).upper()
36	return chrs
37
38#-------------------------------------------------------------------------------
39# Main
40#-------------------------------------------------------------------------------
41
42# Read fasta file
43fasta = sys.argv[1]
44chrs = readFasta(fasta)
45
46# Read VCF file
47for line in sys.stdin:
48	line = line.rstrip()
49
50	if line.startswith('#'):
51		# Show header
52		print(line)
53	else:
54		# Extract REF field
55		fields = line.split('\t')
56		(chrom, pos, ref) = ( fields[0], fields[1], fields[3] )
57
58		if chrom in chrs:
59			chrSeq = chrs[chrom]
60
61			# Get coordinates
62			posStart = int(pos) - 1
63			posEnd = posStart + len(ref)
64
65			# Correct 'REF' sequence
66			if posEnd < len(chrSeq):
67				refOri = ref
68				ref = chrSeq[posStart:posEnd]
69				fields[3] = ref
70			else:
71				print(f"Position {pos} not found in chromosome '{chrom}' (chromosome length {len(chrSeq)})", file=sys.stderr)
72		else:
73			print(f"Chromosome '{chrom}' not found", file=sys.stderr)
74
75		# Show corrected line
76		print('\t'.join(fields))
77