1#!/usr/bin/env python 2 3import sys 4import collections 5import utils 6import pyphlan as ppa 7 8try: 9 import argparse as ap 10 import bz2 11except ImportError: 12 sys.stderr.write( "argparse not found" ) 13 sys.exit(-1) 14 15def read_params( args ): 16 p = ap.ArgumentParser(description='Profile ChocoPhlAn genes\n') 17 18 p.add_argument( '--cscores', required = True, default=None, type=str ) 19 p.add_argument( '--fwmarkers', required = True, default=None, type=str ) 20 p.add_argument( '--maps', required = True, default=None, type=str ) 21 p.add_argument( '--taxonomy', required = True, default=None, type=str ) 22 p.add_argument('--g2t', metavar="Mapping file from genes to taxa", default=None, type=str ) 23 p.add_argument('--t2g', metavar="Mapping file from taxa to genes", default=None, type=str ) 24 p.add_argument('--g2c', metavar="Mapping file from genomes to contigs", required = True, default=None, type=str ) 25 p.add_argument('out', nargs='?', default=None, type=str, 26 help= "the output summary file\n" 27 "[stdout if not present]") 28 29 return vars( p.parse_args() ) 30 31if __name__ == "__main__": 32 args = read_params( sys.argv ) 33 34 cscores = collections.defaultdict( set ) 35 fwmarkers = {} 36 maps = collections.defaultdict( set ) 37 tree = ppa.PpaTree( args['taxonomy'] ) 38 clades2terms = ppa.clades2terms( tree.tree ) 39 40 clades2taxa = dict([(clade.full_name,set([-int(taxon.name[4:]) for taxon in taxa])) for clade,taxa in clades2terms.items()]) 41 ntaxa = 0 42 for v in clades2terms.values(): 43 ntaxa += len(v) 44 45 for l in open( args['cscores'] ): 46 gene_seed, clade, n, n_tot, coreness = l.strip().split('\t') 47 gene_seed, clade, n, n_tot, coreness = int(gene_seed), clade, int(n), int(n_tot), float(coreness) 48 cscores[gene_seed].add( (clade, n, n_tot, coreness) ) 49 50 for i,l in enumerate(open( args['fwmarkers'] )): 51 taxa_id, gene_seed, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness, ext_taxa = l.strip().split('\t') 52 taxa_id, gene_seed, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = int(taxa_id), int(gene_seed), int(n), int(n_tot), float(coreness), int(n_ext_seeds), int(n_ext_taxa), float(uniqueness) 53 fwmarkers[gene_seed] = (taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness) 54 55 for l in open( args['maps'] ): 56 line = l.strip().split('\t') 57 if len(line) < 2: 58 continue 59 fr,to = line 60 fr,to = int(fr.split("_")[0]), to # can be improved!!!! 61 maps[fr].add( to ) 62 63 g2t,g2c,c2g = {},{},{} 64 if args['g2t']: 65 with open( args['g2t'] ) as inp: 66 g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp)) 67 elif args['t2g']: 68 with open( args['t2g'] ) as inp: 69 for ll in (l.strip().split('\t') for l in inp): 70 for g in ll[1:]: 71 g2t[int(g)] = int(ll[0]) 72 73 genomes = set([g2t[g] for g in cscores]) 74 75 with open( args['g2c'] ) as inp: 76 for l in inp: 77 line = list(l.strip().split('\t')) 78 #if int(line[0]) not in genomes: 79 # continue 80 #vals = [int(a) for a in line if utils.is_number(a)] 81 vals = [a for a in line] 82 if len(vals) > 1: 83 g2c[int(vals[0])] = vals[1:] 84 for g,c in g2c.items(): 85 for cc in c: 86 c2g[cc] = g 87 with utils.openw( args['out'] ) as out: 88 for gene_seed,cscores_t in cscores.items(): 89 taxa = g2t[gene_seed] 90 for clade, n, n_tot, coreness in cscores_t: 91 out.write( "\t".join(["CSCORE",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness)]) +"\n" ) 92 93 # anche sotto ??? 94 95 if gene_seed in fwmarkers: 96 taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = fwmarkers[gene_seed] 97 if uniqueness < 0.01: 98 out.write( "\t".join(["FWMARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness), 99 str(n_ext_seeds), str(n_ext_taxa), str(1.0-uniqueness)]) +"\n" ) 100 101 if gene_seed in maps: 102 ext_tax = set([(c2g[s] if s in c2g else 0) for s in maps[gene_seed]]) 103 ext_tax_ok = ext_tax & clades2taxa[clade] 104 ext_tax_ko = ext_tax - clades2taxa[clade] 105 ext_tax_okl = ":".join([str(s) for s in ext_tax_ok]) 106 ext_tax_kol = ":".join([str(s) for s in ext_tax_ko]) 107 muniq = 1.0 - float(len(ext_tax_kol)) / float(ntaxa) 108 109 out.write( "\t".join(["MARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness), 110 str(n_ext_seeds), str(n_ext_taxa), str(1.0-uniqueness), 111 str(len(ext_tax)), str(len(ext_tax_ok)), str(len(ext_tax_ko)), 112 str(muniq), ext_tax_okl, ext_tax_kol]) +"\n" ) 113 114 """ 115 gid2cores = collections.defaultdict( set ) 116 with utils.openr(args['cg']) as inp: 117 for line in (l.split('\t') for l in inp): 118 if int(line[0]) > 0: 119 gid,clade,ncore,ngenomes,pv = line[:5] 120 else: 121 gid,clade,ncore,ngenomes,pv = line[1:6] 122 gid2cores[gid].add( (clade,ncore,ngenomes,pv) ) 123 124 clades2cores = collections.defaultdict( set ) 125 for k,v in gid2cores.items(): 126 if len(v) > 1: 127 continue 128 clades2cores[list(v)[0][0]].add( k ) 129 130 with utils.openw(args['cgs']) as out: 131 for k,v in sorted(clades2cores.items(),key=lambda x:-len(x[1])): 132 out.write( "\t".join([k,str(len(v))]) +"\n" ) 133 """ 134