1#!/usr/bin/env python
2
3"""
4For every column that occurs in a multiple alignment print the column
5and the number of times it occurs (one column/count per line, tab
6separated), sorted by count descending.
7
8This version allows special handling of the 'wildcard' symbol in alignments.
9
10Note: all blocks must have exactly the same number of species.
11
12usage: %prog [options] < maf > column_counts
13    -w, --wildcard: include wildcards
14    -m, --maxwildcards=N: only allow N missing species
15"""
16
17import sys
18
19import bx.align.maf
20from bx.cookbook import (
21    cross_lists,
22    doc_optparse,
23)
24
25counts = {}
26
27nspecies = None
28
29for block in bx.align.maf.Reader(sys.stdin):
30    # Ensure all blocks have the same number of rows
31    if nspecies:
32        assert len(block.components) == nspecies
33    else:
34        nspecies = len(block.components)
35    # Increment count for each column
36    for col in zip(* [iter(comp.text.upper()) for comp in block.components]):
37        col = ''.join(col)
38        try:
39            counts[col] += 1
40        except Exception:
41            counts[col] = 1
42
43options, args = doc_optparse.parse(__doc__)
44
45wildcard = False
46if options.wildcard:
47    wildcard = True
48    max_wildcard = nspecies - 1
49if options.maxwildcards:
50    wildcard = True
51    max_wildcard = int(options.maxwildcards)
52
53nucs = "ACGT-"
54if wildcard:
55    nucs += "*"
56
57for col in cross_lists(*([nucs] * nspecies)):
58    col = ''.join(col)
59    if wildcard and col.count("*") > max_wildcard:
60        continue
61    if col.count("-") == nspecies:
62        continue
63    print(col, counts.get(col, 0))
64