1#!/usr/bin/env python
2
3from optparse import OptionParser
4
5from rpy import r
6
7import bx.align.maf
8import bx.bitset
9from bx.bitset_builders import binned_bitsets_from_file
10
11
12def main():
13    parser = OptionParser(usage="usage: %prog [options] maf_file snp_file neutral_file window_size step_size")
14    parser.add_option("-o", "--outfile", help="Specify file for output")
15    parser.add_option("-s", "--species", type="string", default="panTro2")
16    parser.add_option("-b", "--build", type="string", default="hg18")
17    (options, args) = parser.parse_args()
18
19    if len(args) != 5:
20        parser.error("Incorrect number of arguments")
21    else:
22        maf_filename = args[0]
23        snp_filename = args[1]
24        neutral_filename = args[2]
25        window_size = int(args[3])
26        step_size = int(args[4])
27
28    if options.outfile is not None:
29        out_file = open(options.outfile, 'w')
30
31    # Generate snp and neutral bitsets
32    AR_snp_bitsets = binned_bitsets_from_file(open(snp_filename))
33    neutral_bitsets = binned_bitsets_from_file(open(neutral_filename))
34
35    # Generate divergence bitset from maf file
36    AR_div_bitsets = dict()
37    chr_lens = dict()
38    reader = bx.align.maf.Reader(open(maf_filename))
39
40    for block in reader:
41        comp1 = block.get_component_by_src_start(options.build)
42        comp2 = block.get_component_by_src_start(options.species)
43
44        if comp1 is None or comp2 is None:
45            continue
46
47        # Chromosome, start, and stop of reference species alignment
48        chr = comp1.src.split('.')[1]
49        start = comp1.start
50
51        # Get or create bitset for this chromosome
52        if chr in AR_div_bitsets:
53            bitset = AR_div_bitsets[chr]
54        else:
55            bitset = AR_div_bitsets[chr] = bx.bitset.BinnedBitSet()
56            chr_lens[chr] = comp1.get_src_size()
57
58        # Iterate over text and set diverged bit
59        pos = start
60        for ch1, ch2 in zip(comp1.text.upper(), comp2.text.upper()):
61            if ch1 == '-':
62                continue
63            if ch2 == '-':
64                pos += 1
65                continue
66
67            if ch1 != ch2 and not AR_snp_bitsets[chr][pos]:
68                bitset.set(pos)
69            pos += 1
70
71    # Debugging Code
72#     for chr in AR_div_bitsets:
73#         for pos in range(0, AR_div_bitsets[chr].size):
74#             if AR_div_bitsets[pos]:
75#                 print >> sys.stderr, chr, pos, pos+1
76
77    # Copy div and snp bitsets
78    nonAR_snp_bitsets = dict()
79    for chr in AR_snp_bitsets:
80        nonAR_snp_bitsets[chr] = bx.bitset.BinnedBitSet()
81        nonAR_snp_bitsets[chr].ior(AR_snp_bitsets[chr])
82
83    nonAR_div_bitsets = dict()
84    for chr in AR_div_bitsets:
85        nonAR_div_bitsets[chr] = bx.bitset.BinnedBitSet()
86        nonAR_div_bitsets[chr].ior(AR_div_bitsets[chr])
87
88    # Generates AR snps by intersecting with neutral intervals
89    for chr in AR_snp_bitsets:
90        AR_snp_bitsets[chr].iand(neutral_bitsets[chr])
91
92    # Generates AR divs by intersecting with neutral intervals
93    for chr in AR_div_bitsets:
94        AR_div_bitsets[chr].iand(neutral_bitsets[chr])
95
96    # Inverts the neutral intervals so now represents nonAR
97    for chr in neutral_bitsets:
98        neutral_bitsets[chr].invert()
99
100    # Generates nonAR snps by intersecting with masked neutral intervals
101    for chr in nonAR_snp_bitsets:
102        nonAR_snp_bitsets[chr].iand(neutral_bitsets[chr])
103    # Generates nonAR divs by intersecting with masked neutral intervals
104    for chr in nonAR_div_bitsets:
105        nonAR_div_bitsets[chr].iand(neutral_bitsets[chr])
106
107    for chr in AR_div_bitsets:
108        for window in range(0, chr_lens[chr] - window_size, step_size):
109            # neutral_size = neutral_bitsets[chr].count_range(window, window_size)
110            # if neutral_size < 9200: continue
111            AR_snp = AR_snp_bitsets[chr].count_range(window, window_size)
112            AR_div = AR_div_bitsets[chr].count_range(window, window_size)
113            nonAR_snp = nonAR_snp_bitsets[chr].count_range(window, window_size)
114            nonAR_div = nonAR_div_bitsets[chr].count_range(window, window_size)
115
116            if nonAR_snp >= 6 and nonAR_div >= 6 and AR_snp >= 6 and AR_div >= 6:
117                MK_pval = MK_chi_pvalue(nonAR_snp, nonAR_div, AR_snp, AR_div)
118            else:
119                MK_pval = MK_fisher_pvalue(nonAR_snp, nonAR_div, AR_snp, AR_div)
120
121            if options.outfile is not None:
122                out_file.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%1.15f\n" % (chr, window, window+window_size, nonAR_snp, nonAR_div, AR_snp, AR_div, MK_pval))
123            else:
124                print("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%1.15f" % (chr, window, window+window_size, nonAR_snp, nonAR_div, AR_snp, AR_div, MK_pval))
125
126    if options.outfile is not None:
127        out_file.close()
128
129
130def MK_fisher_pvalue(win_snp, win_div, AR_snp, AR_div):
131
132    if win_snp == 0 and win_div == 0 and AR_snp == 0 and AR_div == 0:
133        return 1.0
134
135    fisher_result = r.fisher_test(r.matrix(r.c([win_snp, win_div, AR_snp, AR_div]), nr=2))
136
137    return fisher_result['p.value']
138
139
140def MK_chi_pvalue(win_snp, win_div, AR_snp, AR_div):
141
142    chi_result = r.chisq_test(r.matrix(r.c([win_snp, win_div, AR_snp, AR_div]), nr=2))
143
144    return chi_result['p.value']
145
146
147main()
148