1#!/usr/bin/env python 2 3""" 4For every column that occurs in a multiple alignment print the column 5and the number of times it occurs (one column/count per line, tab 6separated), sorted by count descending. 7 8This version allows special handling of the 'wildcard' symbol in alignments. 9 10Note: all blocks must have exactly the same number of species. 11 12usage: %prog [options] < maf > column_counts 13 -w, --wildcard: include wildcards 14 -m, --maxwildcards=N: only allow N missing species 15""" 16 17import sys 18 19import bx.align.maf 20from bx.cookbook import ( 21 cross_lists, 22 doc_optparse, 23) 24 25counts = {} 26 27nspecies = None 28 29for block in bx.align.maf.Reader(sys.stdin): 30 # Ensure all blocks have the same number of rows 31 if nspecies: 32 assert len(block.components) == nspecies 33 else: 34 nspecies = len(block.components) 35 # Increment count for each column 36 for col in zip(* [iter(comp.text.upper()) for comp in block.components]): 37 col = ''.join(col) 38 try: 39 counts[col] += 1 40 except Exception: 41 counts[col] = 1 42 43options, args = doc_optparse.parse(__doc__) 44 45wildcard = False 46if options.wildcard: 47 wildcard = True 48 max_wildcard = nspecies - 1 49if options.maxwildcards: 50 wildcard = True 51 max_wildcard = int(options.maxwildcards) 52 53nucs = "ACGT-" 54if wildcard: 55 nucs += "*" 56 57for col in cross_lists(*([nucs] * nspecies)): 58 col = ''.join(col) 59 if wildcard and col.count("*") > max_wildcard: 60 continue 61 if col.count("-") == nspecies: 62 continue 63 print(col, counts.get(col, 0)) 64