1#!/usr/bin/env python
2
3import argparse
4import subprocess
5import sys
6import re
7import os
8
9#SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
10BIN_DIR = #__#
11WORKING_DIR = os.getcwd()
12
13CONF_TEMPLATE = """DATA
14PE = pe %d %d %s
15END
16PARAMETERS
17STOP_AFTER_SUPERREADS=1
18NUM_THREADS=%d
19GRAPH_KMER_SIZE=auto
20END
21"""
22
23if __name__ == "__main__":
24    parser = argparse.ArgumentParser(description="Quantifies Super-Reads")
25    parser.add_argument("-1", "--short-pair1", required=False, type=str, default=None, help="Paired short read FASTQ. Must also specify '--short-pair2' and NOT '--short-unpaired'")
26    parser.add_argument("-2", "--short-pair2", required=False, type=str, default=None, help="Paired short read FASTQ. Must also specify '--short-pair1' and NOT '--short-unpaired'")
27    parser.add_argument("-U", "--short-unpaired", required=False, type=str, default=None, help="Unpaired short read FASTQ. Must not include '--short-pair1' or '--short-pair2'")
28    parser.add_argument("-w", "--work-dir", required=False, type=str, default=None, help="MaSuRCA super-read assembly 'work1' directory. Will perform super-read assembly if not included.")
29    parser.add_argument("-H", "--hisat-index", required=True, type=str, help="HISAT2 index prefix for aligning short reads")
30    parser.add_argument("-G", "--gmap-index", required=True, type=str, help="Gmap index for aligning super-reads")
31    parser.add_argument("-g", "--gmap-directory", required=False, type=str, default=None, help="Gmap directory to find gmap index")
32    parser.add_argument("-p", "--num-threads", required=False, type=int, default=1, help="Number of threads each tool can use")
33    parser.add_argument("-o", "--out-dir", required=False, type=str, default="", help="Output directory")
34    parser.add_argument("--frag-len", required=False, type=float, default=100, help="Paired fragment length for MaSuRCA to use. Will only be used if '--work-dir' not specify")
35    parser.add_argument("--frag-std", required=False, type=float, default=20, help="Paired fragment standard deviation for MaSuRCA to use. Will only be used if '--work-dir' not specify")
36    parser.add_argument("--gmap-cmd", required=False, type=str, default="gmap", help="Gmap executable")
37    parser.add_argument("--hisat2-cmd", required=False, type=str, default="hisat2", help="HISAT2 executable")
38    args = parser.parse_args()
39
40    if not os.path.isdir(BIN_DIR):
41        sys.stderr.write("Error: programs directory not found. Please run './install.sh''\n")
42        sys.exit(1)
43
44    if args.short_unpaired != None:
45        short_reads = os.path.abspath(args.short_unpaired)
46        paired = False
47    elif args.short_pair1 != None and args.short_pair2 != None:
48        short_reads = "%s %s" % (os.path.abspath(args.short_pair1), os.path.abspath(args.short_pair2))
49        paired = True
50    else:
51        sys.stderr.write("Error: must specify short read input")
52        sys.exit(1)
53
54
55    if args.work_dir == None:
56        if not os.path.exists(args.out_dir):
57            try:
58              os.makedirs(args.out_dir)
59            except OSError, e:
60              sys.stderr.write("\nError creating directory %s (%s)" % (args.out_dir, e));
61              sys.exit(1)
62        os.chdir(args.out_dir)
63        conf_str = CONF_TEMPLATE % (
64            args.frag_len,
65            args.frag_std,
66            short_reads,
67            args.num_threads)
68        conf_file = open("super_reads.conf", "w")
69        conf_file.write(conf_str)
70        conf_file.close()
71        subprocess.call([os.path.join(BIN_DIR, "createSuperReads_RNA"), "super_reads.conf"])
72        subprocess.call("./assemble.sh")
73
74        os.chdir(WORKING_DIR)
75        work1_dir = os.path.join(args.out_dir, "work1")
76    else:
77        work1_dir = args.work_dir
78
79    gmap_call = [args.gmap_cmd,
80                 "-f", "samse",
81                 "-t", str(args.num_threads),
82                 "-d", args.gmap_index]
83
84    if args.gmap_directory != None:
85        gmap_call += ["--dir", args.gmap_directory]
86    gmap_call += ["%s/superReadSequences.fasta" % work1_dir]
87
88    gmap_out_name = os.path.join(args.out_dir, "sr.sam")
89
90    gmap_out = open(gmap_out_name, "w")
91    subprocess.call(gmap_call, stdout=gmap_out)
92    gmap_out.close()
93
94    cmd = [os.path.join(BIN_DIR, "assign_reads"),
95                     "-p", str(args.num_threads),
96                     "-w", work1_dir,
97                     "-s", gmap_out_name,
98                     "-o", os.path.join(args.out_dir, "sr_quant.sam")]
99    subprocess.call(cmd)
100
101    hisat2_call = [args.hisat2_cmd,
102                  "-p", str(args.num_threads),
103                  "-x", args.hisat_index]
104
105    if paired:
106        hisat2_call += ["-1", args.short_pair1, "-2", args.short_pair2]
107    else:
108        hisat2_call += ["-U", args.short_unpaired]
109
110    hisat2_call += [">", os.path.join(args.out_dir, "short.sam")]
111
112    subprocess.call(hisat2_call)
113
114    subprocess.call(["samtools", "merge", "-f",
115                     "-@", str(args.num_threads),
116                     os.path.join(args.out_dir, "sr_merge.bam"),
117                     os.path.join(args.out_dir, "sr_quant.sam"),
118                     os.path.join(args.out_dir, "short.sam")])
119
120    subprocess.call(["samtools", "sort",
121                     os.path.join(args.out_dir, "sr_merge.bam"),
122                     "-o", os.path.join(args.out_dir, "sr_sort.bam")])
123