1#!/usr/bin/env python
2
3import sys
4import collections
5import utils
6import pyphlan as ppa
7
8try:
9    import argparse as ap
10    import bz2
11except ImportError:
12    sys.stderr.write( "argparse not found" )
13    sys.exit(-1)
14
15def read_params( args ):
16    p = ap.ArgumentParser(description='Profile ChocoPhlAn genes\n')
17
18    p.add_argument( '--cscores', required = True, default=None, type=str )
19    p.add_argument( '--fwmarkers', required = True, default=None, type=str )
20    p.add_argument( '--maps', required = True, default=None, type=str )
21    p.add_argument( '--taxonomy', required = True, default=None, type=str )
22    p.add_argument('--g2t', metavar="Mapping file from genes to taxa", default=None, type=str )
23    p.add_argument('--t2g', metavar="Mapping file from taxa to genes", default=None, type=str )
24    p.add_argument('--g2c', metavar="Mapping file from genomes to contigs", required = True, default=None, type=str )
25    p.add_argument('out', nargs='?', default=None, type=str,
26            help=   "the output summary file\n"
27                    "[stdout if not present]")
28
29    return vars( p.parse_args() )
30
31if __name__ == "__main__":
32    args = read_params( sys.argv )
33
34    cscores = collections.defaultdict( set )
35    fwmarkers = {}
36    maps = collections.defaultdict( set )
37    tree = ppa.PpaTree( args['taxonomy'] )
38    clades2terms = ppa.clades2terms( tree.tree )
39
40    clades2taxa = dict([(clade.full_name,set([-int(taxon.name[4:]) for taxon in taxa])) for clade,taxa in clades2terms.items()])
41    ntaxa = 0
42    for v in clades2terms.values():
43        ntaxa += len(v)
44
45    for l in open( args['cscores'] ):
46        gene_seed, clade, n, n_tot, coreness = l.strip().split('\t')
47        gene_seed, clade, n, n_tot, coreness = int(gene_seed), clade, int(n), int(n_tot), float(coreness)
48        cscores[gene_seed].add( (clade, n, n_tot, coreness) )
49
50    for i,l in enumerate(open( args['fwmarkers'] )):
51        taxa_id, gene_seed, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness, ext_taxa = l.strip().split('\t')
52        taxa_id, gene_seed, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = int(taxa_id), int(gene_seed), int(n), int(n_tot), float(coreness), int(n_ext_seeds), int(n_ext_taxa), float(uniqueness)
53        fwmarkers[gene_seed] = (taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness)
54
55    for l in open( args['maps'] ):
56        line = l.strip().split('\t')
57        if len(line) < 2:
58            continue
59        fr,to = line
60        fr,to = int(fr.split("_")[0]), to # can be improved!!!!
61        maps[fr].add( to )
62
63    g2t,g2c,c2g = {},{},{}
64    if args['g2t']:
65        with open( args['g2t'] ) as inp:
66            g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
67    elif args['t2g']:
68        with open( args['t2g'] ) as inp:
69            for ll in (l.strip().split('\t') for l in inp):
70                for g in ll[1:]:
71                    g2t[int(g)] = int(ll[0])
72
73    genomes = set([g2t[g] for g in cscores])
74
75    with open( args['g2c'] ) as inp:
76        for l in inp:
77            line = list(l.strip().split('\t'))
78            #if int(line[0]) not in genomes:
79            #    continue
80            #vals = [int(a) for a in line if utils.is_number(a)]
81            vals = [a for a in line]
82            if len(vals) > 1:
83                g2c[int(vals[0])] = vals[1:]
84    for g,c in g2c.items():
85        for cc in c:
86            c2g[cc] = g
87    with utils.openw( args['out'] ) as out:
88        for gene_seed,cscores_t in cscores.items():
89            taxa = g2t[gene_seed]
90            for clade, n, n_tot, coreness in cscores_t:
91                out.write( "\t".join(["CSCORE",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness)]) +"\n" )
92
93            # anche sotto ???
94
95            if gene_seed in fwmarkers:
96                taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = fwmarkers[gene_seed]
97                if uniqueness < 0.01:
98                     out.write( "\t".join(["FWMARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
99                                                      str(n_ext_seeds), str(n_ext_taxa), str(1.0-uniqueness)]) +"\n" )
100
101                if gene_seed in maps:
102                    ext_tax = set([(c2g[s] if s in c2g else 0) for s in maps[gene_seed]])
103                    ext_tax_ok = ext_tax & clades2taxa[clade]
104                    ext_tax_ko = ext_tax - clades2taxa[clade]
105                    ext_tax_okl = ":".join([str(s) for s in ext_tax_ok])
106                    ext_tax_kol = ":".join([str(s) for s in ext_tax_ko])
107                    muniq = 1.0 - float(len(ext_tax_kol)) / float(ntaxa)
108
109                    out.write( "\t".join(["MARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness),
110                                                   str(n_ext_seeds), str(n_ext_taxa), str(1.0-uniqueness),
111                                                   str(len(ext_tax)), str(len(ext_tax_ok)), str(len(ext_tax_ko)),
112                                                   str(muniq), ext_tax_okl, ext_tax_kol]) +"\n" )
113
114    """
115    gid2cores = collections.defaultdict( set )
116    with utils.openr(args['cg']) as inp:
117        for line in (l.split('\t') for l in inp):
118            if int(line[0]) > 0:
119                gid,clade,ncore,ngenomes,pv =  line[:5]
120            else:
121                gid,clade,ncore,ngenomes,pv =  line[1:6]
122            gid2cores[gid].add( (clade,ncore,ngenomes,pv) )
123
124    clades2cores = collections.defaultdict( set )
125    for k,v in gid2cores.items():
126        if len(v) > 1:
127            continue
128        clades2cores[list(v)[0][0]].add( k )
129
130    with utils.openw(args['cgs']) as out:
131        for k,v in sorted(clades2cores.items(),key=lambda x:-len(x[1])):
132            out.write( "\t".join([k,str(len(v))]) +"\n" )
133    """
134