1#!/usr/local/bin/python3.8 2# encoding: utf-8 3""" 4cuffmerge.py 5 6Created by Cole Trapnell on 2011-03-17. 7Copyright (c) 2011 Cole Trapnell. All rights reserved. 8""" 9 10import sys 11import getopt 12from datetime import datetime, date, time 13import shutil 14import subprocess 15import errno 16import os 17import tempfile 18import warnings 19import types 20 21help_message = ''' 22cuffmerge takes two or more Cufflinks GTF files and merges them into a 23single unified transcript catalog. Optionally, you can provide the script 24with a reference GTF, and the script will use it to attach gene names and other 25metadata to the merged catalog. 26 27Usage: 28 cuffmerge [Options] <assembly_GTF_list.txt> 29 30Options: 31 -h/--help Prints the help message and exits 32 -o <output_dir> Directory where merged assembly will be written [ default: ./merged_asm ] 33 -g/--ref-gtf An optional "reference" annotation GTF. 34 -s/--ref-sequence <seq_dir>/<seq_fasta> Genomic DNA sequences for the reference. 35 --min-isoform-fraction <0-1.0> Discard isoforms with abundance below this [ default: 0.05 ] 36 -p/--num-threads <int> Use this many threads to merge assemblies. [ default: 1 ] 37 --keep-tmp Keep all intermediate files during merge 38''' 39 40output_dir = "./merged_asm/" 41logging_dir = output_dir + "logs/" 42run_log = None 43run_cmd = None 44tmp_dir = output_dir + "/tmp/" 45bin_dir = sys.path[0] + "/" 46run_meta_assembly = True 47fail_str = "\t[FAILED]\n" 48params = None 49 50class Usage(Exception): 51 def __init__(self, msg): 52 self.msg = msg 53 54class TestParams: 55 56 class SystemParams: 57 def __init__(self, 58 threads, 59 keep_tmp): 60 self.threads = threads 61 self.keep_tmp = keep_tmp 62 63 def parse_options(self, opts): 64 for option, value in opts: 65 if option in ("-p", "--num-threads"): 66 self.threads = int(value) 67 if option in ("--keep-tmp"): 68 self.keep_tmp = True 69 70 def check(self): 71 pass 72 73 def __init__(self): 74 self.system_params = self.SystemParams(1, # threads 75 False) # keep_tmp 76 self.ref_gtf = None 77 self.fasta = None 78 self.min_isoform_frac = 0.05 79 80 def check(self): 81 self.system_params.check() 82 83 def parse_options(self, argv): 84 try: 85 opts, args = getopt.getopt(argv[1:], 86 "hvp:o:g:M:s:q:F:", 87 ["version", 88 "help", 89 "ref-sequence=", 90 "ref-gtf=", 91 "output-dir=", 92 "num-threads=", 93 "keep-tmp", 94 "min-isoform-fraction="]) 95 except getopt.error, msg: 96 raise Usage(msg) 97 98 self.system_params.parse_options(opts) 99 100 global output_dir 101 global logging_dir 102 global tmp_dir 103 104 # option processing 105 for option, value in opts: 106 if option in ("-v", "--version"): 107 print "merge_cuff_asms v%s" % (get_version()) 108 exit(0) 109 if option in ("-h", "--help"): 110 raise Usage(help_message) 111 if option in ("-g", "--ref-gtf"): 112 self.ref_gtf = value 113 if option in ("-s", "--ref-sequence"): 114 self.fasta = value 115 if option in ("-F", "--min-isoform-fraction"): 116 self.min_isoform_frac = float(value) 117 if option in ("-o", "--output-dir"): 118 output_dir = value + "/" 119 logging_dir = output_dir + "logs/" 120 tmp_dir = output_dir + "tmp/" 121 122 return args 123 124 125def right_now(): 126 curr_time = datetime.now() 127 return curr_time.strftime("%c") 128 129def prepare_output_dir(): 130 131 print >> sys.stderr, "[%s] Preparing output location %s" % (right_now(), output_dir) 132 if os.path.exists(output_dir): 133 pass 134 else: 135 os.makedirs(output_dir) 136 137 #print >> sys.stderr, "Checking for %s", logging_dir 138 if os.path.exists(logging_dir): 139 pass 140 else: 141 #print >> sys.stderr, "Creating %s", logging_dir 142 os.makedirs(logging_dir) 143 144 if os.path.exists(tmp_dir): 145 pass 146 else: 147 os.makedirs(tmp_dir) 148 149def formatTD(td): 150 hours = td.seconds // 3600 151 minutes = (td.seconds % 3600) // 60 152 seconds = td.seconds % 60 153 return '%02d:%02d:%02d' % (hours, minutes, seconds) 154 155def tmp_name(prefix): 156 tmp_root = output_dir + "tmp/" 157 if os.path.exists(tmp_root): 158 pass 159 else: 160 os.mkdir(tmp_root) 161 return tmp_root + prefix + os.tmpnam().split('/')[-1] 162 163def cufflinks(out_dir, 164 sam_file, 165 min_isoform_frac, 166 gtf_file=None, 167 extra_opts=["-q", "--overhang-tolerance", "200", "--library-type=transfrags", "-A","0.0", "--min-frags-per-transfrag", "0", "--no-5-extend"], 168 lsf=False, 169 curr_queue=None): 170 if gtf_file != None: 171 print >> sys.stderr, "[%s] Quantitating transcripts" % (right_now()) 172 else: 173 print >> sys.stderr, "[%s] Assembling transcripts" % (right_now()) 174 175 cmd = ["cufflinks"] 176 177 if out_dir != None and out_dir != "": 178 cmd.extend(["-o", out_dir]) 179 180 cmd.extend(["-F", str(min_isoform_frac)]) 181 182 if gtf_file != None: 183 cmd.extend(["-g", gtf_file]) 184 185 if extra_opts != None: 186 cmd.extend(extra_opts) 187 global params 188 # Run Cufflinks with more than one thread? 189 cmd.extend(["-p", str(params.system_params.threads)]) 190 191 cmd.append(sam_file) 192 193 try: 194 print >> run_log, " ".join(cmd) 195 ret = subprocess.call(cmd) 196 if ret != 0: 197 print >> sys.stderr, fail_str, "Error: could not execute cufflinks" 198 exit(1) 199 # cufflinks not found 200 except OSError, o: 201 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: 202 print >> sys.stderr, fail_str, "Error: cufflinks not found on this system. Did you forget to include it in your PATH?" 203 exit(1) 204 205def cuffcompare(prefix, ref_gtf, fasta, cuff_gtf): 206 207 print >> sys.stderr, "[%s] Comparing reference %s to assembly %s" % (right_now(), ref_gtf, cuff_gtf) 208 cmd = ["cuffcompare"] 209 210 if prefix != None: 211 cmd.extend(["-o", prefix]) 212 if ref_gtf != None: 213 cmd.extend(["-r", ref_gtf]) 214 if fasta != None: 215 cmd.extend(["-s", fasta]) 216 if type(cuff_gtf) == types.ListType: 217 for g in cuff_gtf: 218 cmd.extend([g]) 219 else: 220 cmd.extend([cuff_gtf]) 221 222 try: 223 print >> run_log, " ".join(cmd) 224 ret = subprocess.call(cmd) 225 if ret != 0: 226 print >> sys.stderr, fail_str, "Error: could not execute cuffcompare" 227 exit(1) 228 # cuffcompare not found 229 except OSError, o: 230 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: 231 print >> sys.stderr, fail_str, "Error: cuffcompare not found on this system. Did you forget to include it in your PATH?" 232 exit(1) 233 234def gtf_to_sam(gtf_filename): 235 236 sam_out = tmp_name("gtf2sam_") 237 238 cmd = ["gtf_to_sam"] 239 cmd.append("-F") 240 cmd.append(gtf_filename) 241 cmd.append(sam_out) 242 try: 243 print >> run_log, " ".join(cmd) 244 ret = subprocess.call(cmd) 245 if ret != 0: 246 print >> sys.stderr, fail_str, "Error: could not execute gtf_to_sam" 247 exit(1) 248 # gtf_to_sam not found 249 except OSError, o: 250 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: 251 print >> sys.stderr, fail_str, "Error: gtf_to_sam not found on this system. Did you forget to include it in your PATH?" 252 exit(1) 253 return sam_out 254 255def test_input_files(filename_list): 256 """This function takes a file that contains a list of GTF files, 257 tests accessibility of each, and returns the list of filenames""" 258 259 OK = True 260 input_files = [] 261 for line in filename_list: 262 line = line.strip() 263 264 # Skip comment line 265 if len(line) == 0 or line[0] == "#": 266 continue 267 try: 268 g = open(line,"r") 269 input_files.append(line) 270 271 except OSError, o: 272 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: 273 print >> sys.stderr, fail_str, "Error: could not open %s" % line 274 OK = False 275 if not OK: 276 sys.exit(1) 277 return input_files 278 279def convert_gtf_to_sam(gtf_filename_list): 280 """This function takes a list of GTF files, converts them all to 281 temporary SAM files, and returns the list of temporary file names.""" 282 print >> sys.stderr, "[%s] Converting GTF files to SAM" % (right_now()) 283 OK = True 284 sam_input_filenames = [] 285 for line in gtf_filename_list: 286 try: 287 sam_out = gtf_to_sam(line) 288 sam_input_filenames.append(sam_out) 289 except OSError, o: 290 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: 291 print >> sys.stderr, fail_str, "Error: could not open %s" % line 292 OK = False 293 if not OK: 294 sys.exit(1) 295 return sam_input_filenames 296 297def merge_sam_inputs(sam_input_list, header): 298 sorted_map_name = tmp_name( "mergeSam_") 299 300 sorted_map = open(sorted_map_name, "w") 301 302 #print header 303 304 # The header was built from a dict keyed by chrom, so 305 # the records will be lexicographically ordered and 306 # should match the BAM after the sort below. 307 print >> sorted_map, header, 308 309 sorted_map.close() 310 sort_cmd =["sort", 311 "-k", 312 "3,3", 313 "-k", 314 "4,4n", 315 "--temporary-directory="+tmp_dir] 316 sort_cmd.extend(sam_input_list) 317 318 print >> run_log, " ".join(sort_cmd), ">", sorted_map_name 319 subprocess.call(sort_cmd, 320 stdout=open(sorted_map_name, "a")) 321 return sorted_map_name 322 323def compare_to_reference(meta_asm_gtf, ref_gtf, fasta): 324 global tmp_dir 325 print >> sys.stderr, "[%s] Comparing against reference file %s" % (right_now(), ref_gtf) 326 ref_str = "" 327 if ref_gtf != None: 328 ref_str = " -r %s " % ref_gtf 329 330 if fasta != None: 331 comp_cmd = '''cuffcompare -o %s -C -G %s -s %s %s''' % (tmp_dir, ref_str, fasta, meta_asm_gtf) 332 else: 333 comp_cmd = '''cuffcompare -o %s -C -G %s %s''' % (tmp_dir, ref_str, meta_asm_gtf) 334 335 #cmd = bsub_cmd(comp_cmd, "/gencode_cmp", True, job_mem=8) 336 cmd = comp_cmd 337 338 try: 339 print >> run_log, cmd 340 ret = subprocess.call(cmd,shell=True) 341 if ret != 0: 342 print >> sys.stderr, fail_str, "Error: could not execute cuffcompare" 343 exit(1) 344 #tmap_out = meta_asm_gtf.split("/")[-1] + ".tmap" 345 tfpath, tfname = os.path.split(meta_asm_gtf) 346 if tfpath: tfpath+='/' 347 tmap_out = tfpath+'.'+tfname+".tmap" 348 return tmap_out 349 # cuffcompare not found 350 except OSError, o: 351 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: 352 print >> sys.stderr, fail_str, "Error: cuffcompare not found on this system. Did you forget to include it in your PATH?" 353 exit(1) 354 355def select_gtf(gtf_in_filename, ids, gtf_out_filename): 356 f_gtf = open(gtf_in_filename) 357 #print >> sys.stderr, "Select GTF: Ids are: " 358 #print >> sys.stderr, ids 359 #print >> sys.stderr, "reference gtf file name:" 360 #print >> sys.stderr, gtf_in_filename 361 out_gtf = open(gtf_out_filename, "w") 362 for line in f_gtf: 363 line = line.strip() 364 cols = line.split('\t') 365 if len(cols) < 9: 366 continue 367 attrs = cols[8] 368 attr_cols = attrs.split(';') 369 for col in attr_cols: 370 if col.find("transcript_id") != -1: 371 first_quote = col.find('"') 372 last_quote = col.find('"', first_quote + 1) 373 transcript = col[first_quote + 1:last_quote] 374 #print >> sys.stderr, transcript 375 if transcript in ids: 376 print >> out_gtf, line 377 378 379def merge_gtfs(gtf_filenames, merged_gtf, ref_gtf=None): 380 print >> sys.stderr, "[%s] Merging linc gtf files with cuffcompare" % (right_now()) 381 cmd = ["cuffcompare"] 382 383 cmd.extend(["-o", merged_gtf]) 384 if ref_gtf != None: 385 cmd.extend(["-r", ref_gtf]) 386 387 cmd.extend(gtf_filenames) 388 cmd = " ".join(cmd) 389 #cmd = bsub_cmd(cmd, "/merge_gtf", True, job_mem=8) 390 391 try: 392 print >> run_log, cmd 393 ret = subprocess.call(cmd, shell=True) 394 if ret != 0: 395 print >> sys.stderr, fail_str, "Error: could not execute cuffcompare" 396 exit(1) 397 return merged_gtf + ".combined.gtf" 398 # cuffcompare not found 399 except OSError, o: 400 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: 401 print >> sys.stderr, fail_str, "Error: cuffcompare not found on this system. Did you forget to include it in your PATH?" 402 exit(1) 403 404def compare_meta_asm_against_ref(ref_gtf, fasta_file, gtf_input_file, class_codes=["c", "i", "r", "p", "e"]): 405 #print >> sys.stderr, "Cuffcmpare all assemblies GTFs" 406 407 tmap = compare_to_reference(gtf_input_file, ref_gtf, fasta_file) 408 409 #print >> sys.stderr, "Cuffcmpare all assemblies GTFs : filter %s" % ",".join(class_codes) 410 selected_ids= set([]) 411 f_tmap = open(tmap) 412 #out = open("tmp_meta_asm_selectedIds.txt", "w") 413 for line in f_tmap: 414 line = line.strip() 415 cols = line.split('\t') 416 if len(cols) < 5: 417 continue 418 class_code = cols[2] 419 name = cols[4] 420 if class_code not in class_codes: 421 selected_ids.add(name) 422 423 global output_dir 424 asm_dir = output_dir 425 426 if os.path.exists(asm_dir): 427 pass 428 else: 429 os.mkdir(asm_dir) 430 current_asm_gtf = output_dir +"transcripts.gtf" 431 select_gtf(current_asm_gtf, selected_ids, output_dir + "/merged.gtf") 432 mtmap = compare_to_reference(output_dir + "/merged.gtf", ref_gtf, fasta_file) 433 if os.path.exists(mtmap): 434 os.remove(mtmap) 435 if os.path.exists(mtmap.split(".tmap")[0]+".refmap"): 436 os.remove(mtmap.split(".tmap")[0]+".refmap") 437 438 global tmp_dir 439 shutil.move(tmp_dir+".combined.gtf", output_dir + "/merged.gtf") 440 441# os.remove("tmp_meta_asm.combined.gtf") 442 if os.path.exists(tmap): 443 os.remove(tmap) 444 if os.path.exists(tmap.split(".tmap")[0]+".refmap"): 445 os.remove(tmap.split(".tmap")[0]+".refmap") 446 #tmp_dir = asm_dir 447 #tmp_files = os.listdir(tmp_dir) 448 #for t in tmp_files: 449 # os.remove(tmp_dir+t) 450 #os.rmdir(tmp_dir) 451 452#os.remove("tmp_meta_asm.tmap") 453 454def get_version(): 455 return "1.0.0" 456 457def get_gtf_chrom_info(gtf_filename, known_chrom_info=None): 458 gtf_file = open(gtf_filename) 459 if known_chrom_info == None: 460 chroms = {} 461 else: 462 chroms = known_chrom_info 463 for line in gtf_file: 464 line = line.strip() 465 466 if len(line) == 0 or line[0] == "#": 467 continue 468 cols = line.split('\t') 469 if len(cols) < 8: 470 continue 471 chrom = cols[0] 472 left = int(cols[3]) 473 right = int(cols[4]) 474 bounds = chroms.setdefault(chrom, [9999999999,-1]) 475 if bounds[0] > left: 476 bounds[0] = left 477 if bounds[1] < right: 478 bounds[1] = right 479 return chroms 480 481def header_for_chrom_info(chrom_info): 482 header_strs = ["""@HD\tVN:1.0\tSO:coordinate"""] 483 chrom_list = [(chrom, limits) for chrom, limits in chrom_info.iteritems()] 484 chrom_list.sort(lambda x,y: cmp(x[0],y[0])) 485 #print chrom_list 486 for chrom, limits in chrom_list: 487 line = "@SQ\tSN:%s\tLN:\t%d" % (chrom, limits[1]) 488 header_strs.append(line) 489 header_strs.append("@PG\tID:cuffmerge\tVN:1.0.0\n") 490 header = "\n".join(header_strs) 491 return header 492 493def main(argv=None): 494 495 # Set this so sort is consistent across platforms and python versions 496 # (and always character-lexicographic) 497 os.environ['LC_ALL']='C' 498 warnings.filterwarnings("ignore", "tmpnam is a potential security risk") 499 global params 500 params = TestParams() 501 502 try: 503 if argv is None: 504 argv = sys.argv 505 args = params.parse_options(argv) 506 params.check() 507 508 if len(args) < 1: 509 raise(Usage(help_message)) 510 511 global run_log 512 global run_cmd 513 514 print >> sys.stderr 515 print >> sys.stderr, "[%s] Beginning transcriptome assembly merge" % (right_now()) 516 print >> sys.stderr, "-------------------------------------------" 517 print >> sys.stderr 518 519 start_time = datetime.now() 520 prepare_output_dir() 521 522 run_log = open(logging_dir + "run.log", "w", 0) 523 run_cmd = " ".join(argv) 524 print >> run_log, run_cmd 525 526 transfrag_list_file = open(args[0], "r") 527 528 if params.ref_gtf != None: 529 test_input_files([params.ref_gtf]) 530 else: 531 print >> sys.stderr, "Warning: no reference GTF provided!" 532 533 # Check that all the primary assemblies are accessible before starting the time consuming stuff 534 gtf_input_files = test_input_files(transfrag_list_file) 535 536 all_gtfs = [] 537 all_gtfs.extend(gtf_input_files) 538 if params.ref_gtf != None: 539 all_gtfs.append(params.ref_gtf) 540 chrom_info = {} 541 for gtf in all_gtfs: 542 chrom_info = get_gtf_chrom_info(gtf, chrom_info) 543 544 header = header_for_chrom_info(chrom_info) 545 546 #Meta assembly option: 547 global run_meta_assembly 548 if run_meta_assembly: 549 # Convert the primary assemblies to SAM format 550 sam_input_files = convert_gtf_to_sam(gtf_input_files) 551 # Merge the primary assembly SAMs into a single input SAM file 552 merged_sam_filename = merge_sam_inputs(sam_input_files, header) 553 # Run cufflinks on the primary assembly transfrags to generate a meta-assembly 554 cufflinks(output_dir, merged_sam_filename, params.min_isoform_frac, params.ref_gtf) 555 compare_meta_asm_against_ref(params.ref_gtf, params.fasta, output_dir+"/transcripts.gtf") 556 #Meta Cuffcompare option: 557 else: 558 cuffcompare_all_assemblies(gtf_input_files) #FIXME: where is this function ? 559 560 561 if not params.system_params.keep_tmp: 562 tmp_files = os.listdir(tmp_dir) 563 for t in tmp_files: 564 os.remove(tmp_dir+t) 565 os.rmdir(tmp_dir) 566 os.remove(output_dir + "/transcripts.gtf") 567 os.remove(output_dir + "/skipped.gtf") 568 os.remove(output_dir + "/genes.fpkm_tracking") 569 os.remove(output_dir + "/isoforms.fpkm_tracking") 570 except Usage, err: 571 print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg) 572 return 2 573 574 575if __name__ == "__main__": 576 sys.exit(main()) 577