1#!/usr/local/bin/python3.8
2# encoding: utf-8
3"""
4cuffmerge.py
5
6Created by Cole Trapnell on 2011-03-17.
7Copyright (c) 2011 Cole Trapnell. All rights reserved.
8"""
9
10import sys
11import getopt
12from datetime import datetime, date, time
13import shutil
14import subprocess
15import errno
16import os
17import tempfile
18import warnings
19import types
20
21help_message = '''
22cuffmerge takes two or more Cufflinks GTF files and merges them into a
23single unified transcript catalog.  Optionally, you can provide the script
24with a reference GTF, and the script will use it to attach gene names and other
25metadata to the merged catalog.
26
27Usage:
28    cuffmerge [Options] <assembly_GTF_list.txt>
29
30Options:
31    -h/--help                               Prints the help message and exits
32    -o                     <output_dir>     Directory where merged assembly will be written  [ default: ./merged_asm  ]
33    -g/--ref-gtf                            An optional "reference" annotation GTF.
34    -s/--ref-sequence      <seq_dir>/<seq_fasta> Genomic DNA sequences for the reference.
35    --min-isoform-fraction <0-1.0>          Discard isoforms with abundance below this       [ default:           0.05 ]
36    -p/--num-threads       <int>            Use this many threads to merge assemblies.       [ default:             1  ]
37    --keep-tmp                              Keep all intermediate files during merge
38'''
39
40output_dir = "./merged_asm/"
41logging_dir = output_dir + "logs/"
42run_log = None
43run_cmd = None
44tmp_dir = output_dir + "/tmp/"
45bin_dir = sys.path[0] + "/"
46run_meta_assembly = True
47fail_str = "\t[FAILED]\n"
48params = None
49
50class Usage(Exception):
51    def __init__(self, msg):
52        self.msg = msg
53
54class TestParams:
55
56    class SystemParams:
57        def __init__(self,
58                     threads,
59                     keep_tmp):
60            self.threads = threads
61            self.keep_tmp = keep_tmp
62
63        def parse_options(self, opts):
64            for option, value in opts:
65                if option in ("-p", "--num-threads"):
66                    self.threads = int(value)
67                if option in ("--keep-tmp"):
68                    self.keep_tmp = True
69
70        def check(self):
71            pass
72
73    def __init__(self):
74        self.system_params = self.SystemParams(1,               # threads
75                                               False)           # keep_tmp
76        self.ref_gtf = None
77        self.fasta = None
78        self.min_isoform_frac = 0.05
79
80    def check(self):
81        self.system_params.check()
82
83    def parse_options(self, argv):
84        try:
85            opts, args = getopt.getopt(argv[1:],
86                                       "hvp:o:g:M:s:q:F:",
87                                       ["version",
88                                        "help",
89                                        "ref-sequence=",
90                                        "ref-gtf=",
91                                        "output-dir=",
92                                        "num-threads=",
93                                        "keep-tmp",
94                                        "min-isoform-fraction="])
95        except getopt.error, msg:
96            raise Usage(msg)
97
98        self.system_params.parse_options(opts)
99
100        global output_dir
101        global logging_dir
102        global tmp_dir
103
104        # option processing
105        for option, value in opts:
106            if option in ("-v", "--version"):
107                print "merge_cuff_asms v%s" % (get_version())
108                exit(0)
109            if option in ("-h", "--help"):
110                raise Usage(help_message)
111            if option in ("-g", "--ref-gtf"):
112                self.ref_gtf = value
113            if option in ("-s", "--ref-sequence"):
114                self.fasta = value
115            if option in ("-F", "--min-isoform-fraction"):
116                self.min_isoform_frac = float(value)
117            if option in ("-o", "--output-dir"):
118                output_dir = value + "/"
119                logging_dir = output_dir + "logs/"
120                tmp_dir = output_dir + "tmp/"
121
122        return args
123
124
125def right_now():
126    curr_time = datetime.now()
127    return curr_time.strftime("%c")
128
129def prepare_output_dir():
130
131    print >> sys.stderr, "[%s] Preparing output location %s" % (right_now(), output_dir)
132    if os.path.exists(output_dir):
133        pass
134    else:
135        os.makedirs(output_dir)
136
137    #print >> sys.stderr, "Checking for %s", logging_dir
138    if os.path.exists(logging_dir):
139        pass
140    else:
141        #print >> sys.stderr, "Creating %s", logging_dir
142        os.makedirs(logging_dir)
143
144    if os.path.exists(tmp_dir):
145        pass
146    else:
147        os.makedirs(tmp_dir)
148
149def formatTD(td):
150    hours = td.seconds // 3600
151    minutes = (td.seconds % 3600) // 60
152    seconds = td.seconds % 60
153    return '%02d:%02d:%02d' % (hours, minutes, seconds)
154
155def tmp_name(prefix):
156    tmp_root = output_dir + "tmp/"
157    if os.path.exists(tmp_root):
158        pass
159    else:
160        os.mkdir(tmp_root)
161    return tmp_root + prefix + os.tmpnam().split('/')[-1]
162
163def cufflinks(out_dir,
164              sam_file,
165              min_isoform_frac,
166              gtf_file=None,
167              extra_opts=["-q", "--overhang-tolerance", "200", "--library-type=transfrags",  "-A","0.0", "--min-frags-per-transfrag", "0", "--no-5-extend"],
168              lsf=False,
169              curr_queue=None):
170    if gtf_file != None:
171        print >> sys.stderr, "[%s] Quantitating transcripts" % (right_now())
172    else:
173        print >> sys.stderr, "[%s] Assembling transcripts" % (right_now())
174
175    cmd = ["cufflinks"]
176
177    if out_dir != None and out_dir != "":
178        cmd.extend(["-o", out_dir])
179
180    cmd.extend(["-F", str(min_isoform_frac)])
181
182    if gtf_file != None:
183        cmd.extend(["-g", gtf_file])
184
185    if extra_opts != None:
186        cmd.extend(extra_opts)
187    global params
188    # Run Cufflinks with more than one thread?
189    cmd.extend(["-p", str(params.system_params.threads)])
190
191    cmd.append(sam_file)
192
193    try:
194        print >> run_log, " ".join(cmd)
195        ret = subprocess.call(cmd)
196        if ret != 0:
197            print >> sys.stderr, fail_str, "Error: could not execute cufflinks"
198            exit(1)
199    # cufflinks not found
200    except OSError, o:
201        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
202            print >> sys.stderr, fail_str, "Error: cufflinks not found on this system.  Did you forget to include it in your PATH?"
203        exit(1)
204
205def cuffcompare(prefix, ref_gtf, fasta, cuff_gtf):
206
207    print >> sys.stderr, "[%s] Comparing reference %s to assembly %s" % (right_now(), ref_gtf, cuff_gtf)
208    cmd = ["cuffcompare"]
209
210    if  prefix != None:
211        cmd.extend(["-o", prefix])
212    if  ref_gtf != None:
213        cmd.extend(["-r", ref_gtf])
214    if  fasta != None:
215        cmd.extend(["-s", fasta])
216    if type(cuff_gtf) == types.ListType:
217        for g in cuff_gtf:
218            cmd.extend([g])
219    else:
220        cmd.extend([cuff_gtf])
221
222    try:
223        print >> run_log, " ".join(cmd)
224        ret = subprocess.call(cmd)
225        if ret != 0:
226            print >> sys.stderr, fail_str, "Error: could not execute cuffcompare"
227            exit(1)
228    # cuffcompare not found
229    except OSError, o:
230        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
231            print >> sys.stderr, fail_str, "Error: cuffcompare not found on this system.  Did you forget to include it in your PATH?"
232        exit(1)
233
234def gtf_to_sam(gtf_filename):
235
236    sam_out = tmp_name("gtf2sam_")
237
238    cmd = ["gtf_to_sam"]
239    cmd.append("-F")
240    cmd.append(gtf_filename)
241    cmd.append(sam_out)
242    try:
243        print >> run_log, " ".join(cmd)
244        ret = subprocess.call(cmd)
245        if ret != 0:
246            print >> sys.stderr, fail_str, "Error: could not execute gtf_to_sam"
247            exit(1)
248    # gtf_to_sam not found
249    except OSError, o:
250        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
251            print >> sys.stderr, fail_str, "Error: gtf_to_sam not found on this system.  Did you forget to include it in your PATH?"
252        exit(1)
253    return sam_out
254
255def test_input_files(filename_list):
256    """This function takes a file that contains a list of GTF files,
257       tests accessibility of each, and returns the list of filenames"""
258
259    OK = True
260    input_files = []
261    for line in filename_list:
262        line = line.strip()
263
264        # Skip comment line
265        if len(line) == 0 or line[0] == "#":
266            continue
267        try:
268            g = open(line,"r")
269            input_files.append(line)
270
271        except OSError, o:
272            if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
273                print >> sys.stderr, fail_str, "Error: could not open %s" % line
274            OK = False
275    if not OK:
276        sys.exit(1)
277    return input_files
278
279def convert_gtf_to_sam(gtf_filename_list):
280    """This function takes a list of GTF files, converts them all to
281       temporary SAM files, and returns the list of temporary file names."""
282    print >> sys.stderr, "[%s] Converting GTF files to SAM" % (right_now())
283    OK = True
284    sam_input_filenames = []
285    for line in gtf_filename_list:
286        try:
287            sam_out = gtf_to_sam(line)
288            sam_input_filenames.append(sam_out)
289        except OSError, o:
290            if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
291                print >> sys.stderr, fail_str, "Error: could not open %s" % line
292            OK = False
293    if not OK:
294        sys.exit(1)
295    return sam_input_filenames
296
297def merge_sam_inputs(sam_input_list, header):
298    sorted_map_name = tmp_name( "mergeSam_")
299
300    sorted_map = open(sorted_map_name, "w")
301
302    #print header
303
304    # The header was built from a dict keyed by chrom, so
305    # the records will be lexicographically ordered and
306    # should match the BAM after the sort below.
307    print >> sorted_map, header,
308
309    sorted_map.close()
310    sort_cmd =["sort",
311               "-k",
312               "3,3",
313               "-k",
314               "4,4n",
315               "--temporary-directory="+tmp_dir]
316    sort_cmd.extend(sam_input_list)
317
318    print >> run_log, " ".join(sort_cmd), ">", sorted_map_name
319    subprocess.call(sort_cmd,
320                   stdout=open(sorted_map_name, "a"))
321    return sorted_map_name
322
323def compare_to_reference(meta_asm_gtf, ref_gtf, fasta):
324    global tmp_dir
325    print >> sys.stderr, "[%s] Comparing against reference file %s" % (right_now(), ref_gtf)
326    ref_str = ""
327    if ref_gtf != None:
328        ref_str = " -r %s " % ref_gtf
329
330    if fasta != None:
331        comp_cmd = '''cuffcompare -o %s -C -G %s -s %s %s''' % (tmp_dir, ref_str, fasta, meta_asm_gtf)
332    else:
333        comp_cmd = '''cuffcompare -o %s -C -G %s %s''' % (tmp_dir, ref_str, meta_asm_gtf)
334
335    #cmd = bsub_cmd(comp_cmd, "/gencode_cmp", True, job_mem=8)
336    cmd = comp_cmd
337
338    try:
339        print >> run_log, cmd
340        ret = subprocess.call(cmd,shell=True)
341        if ret != 0:
342            print >> sys.stderr, fail_str, "Error: could not execute cuffcompare"
343            exit(1)
344        #tmap_out = meta_asm_gtf.split("/")[-1] + ".tmap"
345        tfpath, tfname = os.path.split(meta_asm_gtf)
346        if tfpath: tfpath+='/'
347        tmap_out = tfpath+'.'+tfname+".tmap"
348        return tmap_out
349    # cuffcompare not found
350    except OSError, o:
351        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
352            print >> sys.stderr, fail_str, "Error: cuffcompare not found on this system.  Did you forget to include it in your PATH?"
353        exit(1)
354
355def select_gtf(gtf_in_filename, ids, gtf_out_filename):
356    f_gtf = open(gtf_in_filename)
357    #print >> sys.stderr, "Select GTF: Ids are: "
358    #print >> sys.stderr, ids
359    #print >> sys.stderr, "reference gtf file name:"
360    #print >> sys.stderr, gtf_in_filename
361    out_gtf = open(gtf_out_filename, "w")
362    for line in f_gtf:
363        line = line.strip()
364        cols = line.split('\t')
365        if len(cols) < 9:
366            continue
367        attrs = cols[8]
368        attr_cols = attrs.split(';')
369        for col in attr_cols:
370            if col.find("transcript_id") != -1:
371                first_quote = col.find('"')
372                last_quote = col.find('"', first_quote + 1)
373                transcript = col[first_quote + 1:last_quote]
374                #print >> sys.stderr, transcript
375                if transcript in ids:
376                    print >> out_gtf, line
377
378
379def merge_gtfs(gtf_filenames, merged_gtf, ref_gtf=None):
380    print >> sys.stderr, "[%s] Merging linc gtf files with cuffcompare" % (right_now())
381    cmd = ["cuffcompare"]
382
383    cmd.extend(["-o", merged_gtf])
384    if ref_gtf != None:
385        cmd.extend(["-r", ref_gtf])
386
387    cmd.extend(gtf_filenames)
388    cmd = " ".join(cmd)
389    #cmd = bsub_cmd(cmd, "/merge_gtf", True, job_mem=8)
390
391    try:
392        print >> run_log, cmd
393        ret = subprocess.call(cmd, shell=True)
394        if ret != 0:
395            print >> sys.stderr, fail_str, "Error: could not execute cuffcompare"
396            exit(1)
397        return merged_gtf + ".combined.gtf"
398    # cuffcompare not found
399    except OSError, o:
400        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
401            print >> sys.stderr, fail_str, "Error: cuffcompare not found on this system.  Did you forget to include it in your PATH?"
402        exit(1)
403
404def compare_meta_asm_against_ref(ref_gtf, fasta_file, gtf_input_file, class_codes=["c", "i", "r", "p", "e"]):
405    #print >> sys.stderr, "Cuffcmpare all assemblies GTFs"
406
407    tmap = compare_to_reference(gtf_input_file, ref_gtf, fasta_file)
408
409    #print >> sys.stderr, "Cuffcmpare all assemblies GTFs : filter %s" % ",".join(class_codes)
410    selected_ids= set([])
411    f_tmap = open(tmap)
412    #out = open("tmp_meta_asm_selectedIds.txt", "w")
413    for line in f_tmap:
414        line = line.strip()
415        cols = line.split('\t')
416        if len(cols) < 5:
417            continue
418        class_code = cols[2]
419        name = cols[4]
420        if class_code not in class_codes:
421            selected_ids.add(name)
422
423    global output_dir
424    asm_dir = output_dir
425
426    if os.path.exists(asm_dir):
427        pass
428    else:
429        os.mkdir(asm_dir)
430    current_asm_gtf = output_dir +"transcripts.gtf"
431    select_gtf(current_asm_gtf, selected_ids, output_dir + "/merged.gtf")
432    mtmap = compare_to_reference(output_dir + "/merged.gtf", ref_gtf, fasta_file)
433    if os.path.exists(mtmap):
434        os.remove(mtmap)
435    if os.path.exists(mtmap.split(".tmap")[0]+".refmap"):
436        os.remove(mtmap.split(".tmap")[0]+".refmap")
437
438    global tmp_dir
439    shutil.move(tmp_dir+".combined.gtf", output_dir + "/merged.gtf")
440
441#    os.remove("tmp_meta_asm.combined.gtf")
442    if os.path.exists(tmap):
443        os.remove(tmap)
444    if os.path.exists(tmap.split(".tmap")[0]+".refmap"):
445        os.remove(tmap.split(".tmap")[0]+".refmap")
446    #tmp_dir = asm_dir
447    #tmp_files = os.listdir(tmp_dir)
448    #for t in tmp_files:
449    #    os.remove(tmp_dir+t)
450    #os.rmdir(tmp_dir)
451
452#os.remove("tmp_meta_asm.tmap")
453
454def get_version():
455    return "1.0.0"
456
457def get_gtf_chrom_info(gtf_filename, known_chrom_info=None):
458    gtf_file = open(gtf_filename)
459    if known_chrom_info == None:
460        chroms = {}
461    else:
462        chroms = known_chrom_info
463    for line in gtf_file:
464        line = line.strip()
465
466        if len(line) == 0 or line[0] == "#":
467            continue
468        cols = line.split('\t')
469        if len(cols) < 8:
470            continue
471        chrom = cols[0]
472        left = int(cols[3])
473        right = int(cols[4])
474        bounds = chroms.setdefault(chrom, [9999999999,-1])
475        if bounds[0] > left:
476            bounds[0] = left
477        if bounds[1] < right:
478            bounds[1] = right
479    return chroms
480
481def header_for_chrom_info(chrom_info):
482    header_strs = ["""@HD\tVN:1.0\tSO:coordinate"""]
483    chrom_list = [(chrom, limits) for chrom, limits in chrom_info.iteritems()]
484    chrom_list.sort(lambda x,y: cmp(x[0],y[0]))
485    #print chrom_list
486    for chrom, limits in chrom_list:
487        line = "@SQ\tSN:%s\tLN:\t%d" % (chrom, limits[1])
488        header_strs.append(line)
489    header_strs.append("@PG\tID:cuffmerge\tVN:1.0.0\n")
490    header = "\n".join(header_strs)
491    return header
492
493def main(argv=None):
494
495    # Set this so sort is consistent across platforms and python versions
496    # (and always character-lexicographic)
497    os.environ['LC_ALL']='C'
498    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
499    global params
500    params = TestParams()
501
502    try:
503        if argv is None:
504            argv = sys.argv
505            args = params.parse_options(argv)
506            params.check()
507
508        if len(args) < 1:
509            raise(Usage(help_message))
510
511        global run_log
512        global run_cmd
513
514        print >> sys.stderr
515        print >> sys.stderr, "[%s] Beginning transcriptome assembly merge" % (right_now())
516        print >> sys.stderr, "-------------------------------------------"
517        print >> sys.stderr
518
519        start_time = datetime.now()
520        prepare_output_dir()
521
522        run_log = open(logging_dir + "run.log", "w", 0)
523        run_cmd = " ".join(argv)
524        print >> run_log, run_cmd
525
526        transfrag_list_file = open(args[0], "r")
527
528        if params.ref_gtf != None:
529            test_input_files([params.ref_gtf])
530        else:
531            print >> sys.stderr, "Warning: no reference GTF provided!"
532
533        # Check that all the primary assemblies are accessible before starting the time consuming stuff
534        gtf_input_files = test_input_files(transfrag_list_file)
535
536        all_gtfs = []
537        all_gtfs.extend(gtf_input_files)
538        if params.ref_gtf != None:
539            all_gtfs.append(params.ref_gtf)
540        chrom_info = {}
541        for gtf in all_gtfs:
542            chrom_info = get_gtf_chrom_info(gtf, chrom_info)
543
544        header = header_for_chrom_info(chrom_info)
545
546        #Meta assembly option:
547        global run_meta_assembly
548        if run_meta_assembly:
549            # Convert the primary assemblies to SAM format
550            sam_input_files = convert_gtf_to_sam(gtf_input_files)
551            # Merge the primary assembly SAMs into a single input SAM file
552            merged_sam_filename = merge_sam_inputs(sam_input_files, header)
553            # Run cufflinks on the primary assembly transfrags to generate a meta-assembly
554            cufflinks(output_dir, merged_sam_filename, params.min_isoform_frac, params.ref_gtf)
555            compare_meta_asm_against_ref(params.ref_gtf, params.fasta, output_dir+"/transcripts.gtf")
556        #Meta Cuffcompare option:
557        else:
558            cuffcompare_all_assemblies(gtf_input_files) #FIXME: where is this function ?
559
560
561        if not params.system_params.keep_tmp:
562            tmp_files = os.listdir(tmp_dir)
563            for t in tmp_files:
564                os.remove(tmp_dir+t)
565            os.rmdir(tmp_dir)
566            os.remove(output_dir + "/transcripts.gtf")
567            os.remove(output_dir + "/skipped.gtf")
568            os.remove(output_dir + "/genes.fpkm_tracking")
569            os.remove(output_dir + "/isoforms.fpkm_tracking")
570    except Usage, err:
571        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
572        return 2
573
574
575if __name__ == "__main__":
576    sys.exit(main())
577