1#!/usr/bin/env python 2 3from optparse import OptionParser 4 5from rpy import r 6 7import bx.align.maf 8import bx.bitset 9from bx.bitset_builders import binned_bitsets_from_file 10 11 12def main(): 13 parser = OptionParser(usage="usage: %prog [options] maf_file snp_file neutral_file window_size step_size") 14 parser.add_option("-o", "--outfile", help="Specify file for output") 15 parser.add_option("-s", "--species", type="string", default="panTro2") 16 parser.add_option("-b", "--build", type="string", default="hg18") 17 (options, args) = parser.parse_args() 18 19 if len(args) != 5: 20 parser.error("Incorrect number of arguments") 21 else: 22 maf_filename = args[0] 23 snp_filename = args[1] 24 neutral_filename = args[2] 25 window_size = int(args[3]) 26 step_size = int(args[4]) 27 28 if options.outfile is not None: 29 out_file = open(options.outfile, 'w') 30 31 # Generate snp and neutral bitsets 32 AR_snp_bitsets = binned_bitsets_from_file(open(snp_filename)) 33 neutral_bitsets = binned_bitsets_from_file(open(neutral_filename)) 34 35 # Generate divergence bitset from maf file 36 AR_div_bitsets = dict() 37 chr_lens = dict() 38 reader = bx.align.maf.Reader(open(maf_filename)) 39 40 for block in reader: 41 comp1 = block.get_component_by_src_start(options.build) 42 comp2 = block.get_component_by_src_start(options.species) 43 44 if comp1 is None or comp2 is None: 45 continue 46 47 # Chromosome, start, and stop of reference species alignment 48 chr = comp1.src.split('.')[1] 49 start = comp1.start 50 51 # Get or create bitset for this chromosome 52 if chr in AR_div_bitsets: 53 bitset = AR_div_bitsets[chr] 54 else: 55 bitset = AR_div_bitsets[chr] = bx.bitset.BinnedBitSet() 56 chr_lens[chr] = comp1.get_src_size() 57 58 # Iterate over text and set diverged bit 59 pos = start 60 for ch1, ch2 in zip(comp1.text.upper(), comp2.text.upper()): 61 if ch1 == '-': 62 continue 63 if ch2 == '-': 64 pos += 1 65 continue 66 67 if ch1 != ch2 and not AR_snp_bitsets[chr][pos]: 68 bitset.set(pos) 69 pos += 1 70 71 # Debugging Code 72# for chr in AR_div_bitsets: 73# for pos in range(0, AR_div_bitsets[chr].size): 74# if AR_div_bitsets[pos]: 75# print >> sys.stderr, chr, pos, pos+1 76 77 # Copy div and snp bitsets 78 nonAR_snp_bitsets = dict() 79 for chr in AR_snp_bitsets: 80 nonAR_snp_bitsets[chr] = bx.bitset.BinnedBitSet() 81 nonAR_snp_bitsets[chr].ior(AR_snp_bitsets[chr]) 82 83 nonAR_div_bitsets = dict() 84 for chr in AR_div_bitsets: 85 nonAR_div_bitsets[chr] = bx.bitset.BinnedBitSet() 86 nonAR_div_bitsets[chr].ior(AR_div_bitsets[chr]) 87 88 # Generates AR snps by intersecting with neutral intervals 89 for chr in AR_snp_bitsets: 90 AR_snp_bitsets[chr].iand(neutral_bitsets[chr]) 91 92 # Generates AR divs by intersecting with neutral intervals 93 for chr in AR_div_bitsets: 94 AR_div_bitsets[chr].iand(neutral_bitsets[chr]) 95 96 # Inverts the neutral intervals so now represents nonAR 97 for chr in neutral_bitsets: 98 neutral_bitsets[chr].invert() 99 100 # Generates nonAR snps by intersecting with masked neutral intervals 101 for chr in nonAR_snp_bitsets: 102 nonAR_snp_bitsets[chr].iand(neutral_bitsets[chr]) 103 # Generates nonAR divs by intersecting with masked neutral intervals 104 for chr in nonAR_div_bitsets: 105 nonAR_div_bitsets[chr].iand(neutral_bitsets[chr]) 106 107 for chr in AR_div_bitsets: 108 for window in range(0, chr_lens[chr] - window_size, step_size): 109 # neutral_size = neutral_bitsets[chr].count_range(window, window_size) 110 # if neutral_size < 9200: continue 111 AR_snp = AR_snp_bitsets[chr].count_range(window, window_size) 112 AR_div = AR_div_bitsets[chr].count_range(window, window_size) 113 nonAR_snp = nonAR_snp_bitsets[chr].count_range(window, window_size) 114 nonAR_div = nonAR_div_bitsets[chr].count_range(window, window_size) 115 116 if nonAR_snp >= 6 and nonAR_div >= 6 and AR_snp >= 6 and AR_div >= 6: 117 MK_pval = MK_chi_pvalue(nonAR_snp, nonAR_div, AR_snp, AR_div) 118 else: 119 MK_pval = MK_fisher_pvalue(nonAR_snp, nonAR_div, AR_snp, AR_div) 120 121 if options.outfile is not None: 122 out_file.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%1.15f\n" % (chr, window, window+window_size, nonAR_snp, nonAR_div, AR_snp, AR_div, MK_pval)) 123 else: 124 print("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%1.15f" % (chr, window, window+window_size, nonAR_snp, nonAR_div, AR_snp, AR_div, MK_pval)) 125 126 if options.outfile is not None: 127 out_file.close() 128 129 130def MK_fisher_pvalue(win_snp, win_div, AR_snp, AR_div): 131 132 if win_snp == 0 and win_div == 0 and AR_snp == 0 and AR_div == 0: 133 return 1.0 134 135 fisher_result = r.fisher_test(r.matrix(r.c([win_snp, win_div, AR_snp, AR_div]), nr=2)) 136 137 return fisher_result['p.value'] 138 139 140def MK_chi_pvalue(win_snp, win_div, AR_snp, AR_div): 141 142 chi_result = r.chisq_test(r.matrix(r.c([win_snp, win_div, AR_snp, AR_div]), nr=2)) 143 144 return chi_result['p.value'] 145 146 147main() 148