1#!/usr/local/bin/python3.8 2 3import sys, os, subprocess, signal 4import multiprocessing 5import platform 6import string 7import re 8from datetime import datetime, date, time 9from collections import defaultdict 10from argparse import ArgumentParser, FileType 11 12osx_mode = False 13if sys.platform == 'darwin': 14 osx_mode = True 15 16MAX_EDIT = 21 17signal.signal(signal.SIGPIPE, signal.SIG_DFL) 18 19cigar_re = re.compile('\d+\w') 20 21""" 22""" 23def parse_mem_usage(resource): 24 if osx_mode: 25 resource = resource.strip().split('\n') 26 for line in resource: 27 if line.find('maximum resident set size') != -1: 28 return int(line.split()[0]) / 1024 29 else: 30 resource = resource.split(' ') 31 for line in resource: 32 idx = line.find('maxresident') 33 if idx != -1: 34 return line[:idx] 35 36 return '0' 37 38 39""" 40""" 41def reverse_complement(seq): 42 result = "" 43 for nt in seq: 44 base = nt 45 if nt == 'A': 46 base = 'T' 47 elif nt == 'a': 48 base = 't' 49 elif nt == 'C': 50 base = 'G' 51 elif nt == 'c': 52 base = 'g' 53 elif nt == 'G': 54 base = 'C' 55 elif nt == 'g': 56 base = 'c' 57 elif nt == 'T': 58 base = 'A' 59 elif nt == 't': 60 base = 'a' 61 62 result = base + result 63 64 return result 65 66 67""" 68""" 69def read_genome(genome_filename): 70 chr_dic = {} 71 genome_file = open(genome_filename, "r") 72 73 chr_name, sequence = "", "" 74 for line in genome_file: 75 if line[0] == ">": 76 if chr_name and sequence: 77 chr_dic[chr_name] = sequence 78 79 chr_name = line[1:-1].split()[0] 80 sequence = "" 81 else: 82 sequence += line[:-1] 83 84 if chr_name and sequence: 85 chr_dic[chr_name] = sequence 86 87 genome_file.close() 88 89 print >> sys.stderr, "genome is loaded" 90 91 return chr_dic 92 93 94""" 95""" 96def read_snp(snp_filename): 97 snps = defaultdict(list) 98 snp_file = open(snp_filename, 'r') 99 100 for line in snp_file: 101 line = line.strip() 102 if not line or line.startswith('#'): 103 continue 104 try: 105 snpID, type, chr, pos, data = line.split('\t') 106 except ValueError: 107 continue 108 109 assert type in ["single", "deletion", "insertion"] 110 if type == "deletion": 111 data = int(data) 112 snps[chr].append([snpID, type, int(pos), data]) 113 114 print >> sys.stderr, "snp is loaded" 115 116 return snps 117 118 119""" 120""" 121def extract_splice_sites(gtf_fname): 122 trans = {} 123 124 gtf_file = open(gtf_fname) 125 # Parse valid exon lines from the GTF file into a dict by transcript_id 126 for line in gtf_file: 127 line = line.strip() 128 if not line or line.startswith('#'): 129 continue 130 if '#' in line: 131 line = line.split('#')[0].strip() 132 133 try: 134 chrom, source, feature, left, right, score, \ 135 strand, frame, values = line.split('\t') 136 except ValueError: 137 continue 138 left, right = int(left), int(right) 139 140 if feature != 'exon' or left >= right: 141 continue 142 143 values_dict = {} 144 for attr in values.split(';')[:-1]: 145 attr, _, val = attr.strip().partition(' ') 146 values_dict[attr] = val.strip('"') 147 148 if 'gene_id' not in values_dict or \ 149 'transcript_id' not in values_dict: 150 continue 151 152 transcript_id = values_dict['transcript_id'] 153 if transcript_id not in trans: 154 trans[transcript_id] = [chrom, strand, [[left, right]]] 155 else: 156 trans[transcript_id][2].append([left, right]) 157 158 gtf_file.close() 159 160 # Sort exons and merge where separating introns are <=5 bps 161 for tran, [chrom, strand, exons] in trans.items(): 162 exons.sort() 163 tmp_exons = [exons[0]] 164 for i in range(1, len(exons)): 165 if exons[i][0] - tmp_exons[-1][1] <= 5: 166 tmp_exons[-1][1] = exons[i][1] 167 else: 168 tmp_exons.append(exons[i]) 169 trans[tran] = [chrom, strand, tmp_exons] 170 171 # Calculate and print the unique junctions 172 junctions = set() 173 for chrom, strand, exons in trans.values(): 174 for i in range(1, len(exons)): 175 junctions.add(to_junction_str([chrom, exons[i-1][1], exons[i][0]])) 176 177 return junctions 178 179 180def to_junction_str(junction): 181 return "%s-%d-%d" % (junction[0], junction[1], junction[2]) 182 183 184def to_junction(junction_str): 185 chr, left, right = junction_str.split("-") 186 return [chr, int(left), int(right)] 187 188 189def junction_cmp(a, b): 190 if a[0] != b[0]: 191 if a[0] < b[0]: 192 return -1 193 else: 194 return 1 195 196 if a[1] != b[1]: 197 if a[1] < b[1]: 198 return -1 199 else: 200 return 1 201 202 if a[2] != b[2]: 203 if a[2] < b[2]: 204 return -1 205 else: 206 return 1 207 208 return 0 209 210 211# chr and pos are assumed to be integers 212def get_junctions(chr, pos, cigar_str, min_anchor_len = 0, read_len = 100): 213 junctions = [] 214 right_pos = pos 215 cigars = cigar_re.findall(cigar_str) 216 cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))] 217 218 left_anchor_lens = [] 219 cur_left_anchor_len = 0 220 for i in range(len(cigars)): 221 length, cigar_op = cigars[i] 222 if cigar_op in "MI": 223 cur_left_anchor_len += length 224 elif cigar_op == "N": 225 assert cur_left_anchor_len > 0 226 left_anchor_lens.append(cur_left_anchor_len) 227 cur_left_anchor_len = 0 228 229 for i in range(len(cigars)): 230 length, cigar_op = cigars[i] 231 if cigar_op == "N": 232 left, right = right_pos - 1, right_pos + length 233 234 if i > 0 and cigars[i-1][1] in "ID": 235 if cigars[i-1][1] == "I": 236 left += cigars[i-1][0] 237 else: 238 left -= cigars[i-1][0] 239 if i + 1 < len(cigars) and cigars[i+1][1] in "ID": 240 if cigars[i+1][1] == "I": 241 right -= cigars[i+1][0] 242 else: 243 right += cigars[i+1][0] 244 245 junction_idx = len(junctions) 246 assert junction_idx < len(left_anchor_lens) 247 left_anchor_len = left_anchor_lens[junction_idx] 248 assert left_anchor_len > 0 and left_anchor_len < read_len 249 right_anchor_len = read_len - left_anchor_len 250 if left_anchor_len >= min_anchor_len and right_anchor_len >= min_anchor_len: 251 junctions.append([chr, left, right]) 252 253 if cigar_op in "MND": 254 right_pos += length 255 256 return junctions 257 258 259def get_right(pos, cigars): 260 right_pos = pos 261 cigars = cigar_re.findall(cigars) 262 for cigar in cigars: 263 length = int(cigar[:-1]) 264 cigar_op = cigar[-1] 265 if cigar_op in "MDN": 266 right_pos += length 267 268 return right_pos 269 270def get_cigar_chars(cigars): 271 cigars = cigar_re.findall(cigars) 272 cigar_chars = "" 273 for cigar in cigars: 274 cigar_op = cigar[-1] 275 cigar_chars += cigar_op 276 277 return cigar_chars 278 279def get_cigar_chars_MN(cigars): 280 cigars = cigar_re.findall(cigars) 281 cigar_chars = "" 282 for cigar in cigars: 283 cigar_op = cigar[-1] 284 if cigar_op in "MN": 285 if cigar_chars == "" or cigar_chars[-1] != cigar_op: 286 cigar_chars += cigar_op 287 288 return cigar_chars 289 290def is_non_canonical_junction_read(chr_dic, chr, left, cigars, canonical_junctions = [["GT", "AG"], ["GC", "AG"], ["AT", "AC"]]): 291 pos = left 292 for cigar in cigar_re.findall(cigars): 293 cigar_op = cigar[-1] 294 cigar_len = int(cigar[:-1]) 295 296 if cigar_op in 'MD': 297 pos += cigar_len 298 elif cigar_op == 'N': 299 right = pos + cigar_len 300 301 donor = chr_dic[chr][pos-1:pos+1] 302 acceptor = chr_dic[chr][right-3:right-1] 303 304 rev_donor = reverse_complement(acceptor) 305 rev_acceptor = reverse_complement(donor) 306 307 # print donor, acceptor 308 # print rev_donor, rev_acceptor 309 310 if [donor, acceptor] not in canonical_junctions and [rev_donor, rev_acceptor] not in canonical_junctions: 311 return True 312 313 pos = right 314 315 return False 316 317def is_canonical_junction(chr_dic, junction): 318 chr, left, right = junction 319 donor = chr_dic[chr][left:left+2] 320 acceptor = chr_dic[chr][right-3:right-1] 321 rev_donor = reverse_complement(acceptor) 322 rev_acceptor = reverse_complement(donor) 323 324 if (donor == "GT" and acceptor == "AG") or \ 325 (rev_donor == "GT" and rev_acceptor == "AG"): 326 return True 327 328 return False 329 330def is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str): 331 result_junctions = [] 332 junctions = get_junctions(chr, pos, cigar_str, 0, 101) 333 for junction in junctions: 334 junction_str = to_junction_str(junction) 335 is_gtf_junction = False 336 def find_in_gtf_junctions(gtf_junctions, junction): 337 l, u = 0, len(gtf_junctions) 338 while l < u: 339 m = (l + u) / 2 340 assert m >= 0 and m < len(gtf_junctions) 341 cmp_result = junction_cmp(junction, gtf_junctions[m]) 342 if cmp_result == 0: 343 return m 344 elif cmp_result < 0: 345 u = m 346 else: 347 l = m + 1 348 349 return l 350 351 # allow small (<= 5bp) discrepancy for non-canonical splice sites. 352 relaxed_junction_dist = 5 353 chr, left, right = junction 354 gtf_index = find_in_gtf_junctions(gtf_junctions, [chr, left - relaxed_junction_dist, right - relaxed_junction_dist]) 355 if gtf_index >= 0: 356 i = gtf_index 357 while i < len(gtf_junctions): 358 chr2, left2, right2 = gtf_junctions[i] 359 if chr2 > chr or \ 360 left2 - left > relaxed_junction_dist or \ 361 right2 - right > relaxed_junction_dist: 362 break 363 364 if abs(left - left2) <= relaxed_junction_dist and left - left2 == right - right2: 365 canonical = is_canonical_junction(chr_dic, gtf_junctions[i]) 366 if left == left2 or not canonical: 367 is_gtf_junction = True 368 break 369 370 i += 1 371 372 result_junctions.append([junction_str, is_gtf_junction]) 373 374 is_gtf_junction_read = False 375 if len(result_junctions) > 0: 376 is_gtf_junction_read = True 377 for junction_str, is_gtf_junction in result_junctions: 378 if not is_gtf_junction: 379 is_gtf_junction_read = False 380 break 381 382 return result_junctions, len(result_junctions) > 0, is_gtf_junction_read 383 384 385def is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str): 386 junctions, junction_read, gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str) 387 mate_junctions, mate_junction_read, mate_gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, mate_chr, mate_pos, mate_cigar_str) 388 junctions += mate_junctions 389 junction_pair = len(junctions) > 0 390 if junction_pair: 391 gtf_junction_pair = True 392 if junction_read and not gtf_junction_read: 393 gtf_junction_pair = False 394 if mate_junction_read and not mate_gtf_junction_read: 395 gtf_junction_pair = False 396 else: 397 gtf_junction_pair = False 398 399 return junctions, junction_pair, gtf_junction_pair 400 401""" 402""" 403def getSNPs(chr_snps, left, right): 404 low, high = 0, len(chr_snps) 405 while low < high: 406 mid = (low + high) / 2 407 snpID, type, pos, data = chr_snps[mid] 408 if pos < left: 409 low = mid + 1 410 else: 411 high = mid - 1 412 413 snps = [] 414 for i in xrange(low, len(chr_snps)): 415 snp = chr_snps[i] 416 snpID, type, pos, data = snp 417 pos2 = pos 418 if type == "deletion": 419 pos2 += data 420 if pos2 >= right: 421 break 422 if pos >= left: 423 if len(snps) > 0: 424 _, prev_type, prev_pos, prev_data = snps[-1] 425 assert prev_pos <= pos 426 prev_pos2 = prev_pos 427 if prev_type == "deletion": 428 prev_pos2 += prev_data 429 if pos <= prev_pos2: 430 continue 431 snps.append(snp) 432 433 return snps 434 435""" 436""" 437def check_snps(snps, check_type, ref_pos, read_seq): 438 found = False 439 440 for snp in snps: 441 snp_type, snp_pos, snp_data = snp[1:4] 442 443 if snp_type == check_type: 444 if snp_type == 'single': 445 if snp_pos == ref_pos and snp_data[0] == read_seq[0]: 446 found = True 447 break 448 elif snp_type == 'insertion': 449 if snp_pos == ref_pos and snp_data == read_seq: 450 found = True 451 break 452 453 elif snp_type == 'deletion': 454 # snp_data and read_seq are length of sequence deleted 455 if snp_pos == ref_pos and int(snp_data) == int(read_seq): 456 found = True 457 break 458 459 return found 460 461 462def extract_reads_and_pairs(chr_dic, sam_filename, read_filename, pair_filename, unmapped_read_1_fq_name, unmapped_read_2_fq_name, snps_dict = None): 463 temp_read_filename, temp_pair_filename = read_filename + ".temp", pair_filename + ".temp" 464 temp_read_file, temp_pair_file = open(temp_read_filename, "w"), open(temp_pair_filename, "w") 465 466 unmapped_read_1_fq, unmapped_read_2_fq = open(unmapped_read_1_fq_name, "w"), open(unmapped_read_2_fq_name, "w") 467 hisat2 = read_filename.find("hisat2") != -1 or pair_filename.find("hisat2") != -1 468 vg = read_filename.find("vg") != -1 or pair_filename.find("vg") != -1 469 470 read_dic = {} 471 prev_read_id, prev_read_seq = "", "" 472 sam_file = open(sam_filename, "r") 473 for line in sam_file: 474 if line[0] == "@": 475 continue 476 477 fields = line[:-1].split() 478 read_id, flag, chr, pos, mapQ, cigar_str, mate_chr, mate_pos, template_len, read_seq, read_qual = fields[:11] 479 if 'H' in cigar_str: 480 continue 481 482 flag, pos, mate_pos = int(flag), int(pos), int(mate_pos) 483 if read_seq == "*" and prev_read_id == read_id: 484 read_seq = prev_read_seq 485 read_seq = read_seq.upper() 486 if read_seq == "" or read_seq == "*": 487 continue 488 489 if flag & 0x04 != 0 or \ 490 chr == "*" or \ 491 cigar_str == "*": 492 """ 493 if flag & 0x80 != 0: 494 print >> unmapped_read_2_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual) 495 else: 496 print >> unmapped_read_1_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual) 497 """ 498 continue 499 500 if mate_chr == '=': 501 mate_chr = chr 502 503 if len(read_id) >= 3 and read_id[-2] == "/": 504 read_id = read_id[:-2] 505 506 if read_id.find("seq.") == 0: 507 read_id = read_id[4:] 508 509 if read_id != prev_read_id: 510 read_dic = {} 511 512 HISAT2_XM, HISAT2_NM = 0, 0 513 if hisat2: 514 for field in fields[11:]: 515 if field[:5] == "XM:i:": 516 HISAT2_XM = int(field[5:]) 517 elif field[:5] == "NM:i:": 518 HISAT2_NM = int(field[5:]) 519 520 prev_read_id = read_id 521 prev_read_seq = read_seq 522 523 if snps_dict != None and chr in snps_dict: 524 chr_snps = snps_dict[chr] 525 else: 526 chr_snps = [] 527 528 snps = None 529 530 XM, gap = 0, 0 531 read_pos, right_pos = 0, pos - 1, 532 junction_read = False 533 534 cigars = cigar_re.findall(cigar_str) 535 for i in range(len(cigars)): 536 cigar = cigars[i] 537 length = int(cigar[:-1]) 538 cigar_op = cigar[-1] 539 540 if cigar_op == "S": 541 if i != 0 and i != len(cigars) - 1: 542 print >> sys.stderr, "S is located at %dth out of %d %s" % (i+1, len(cigars), cigar_str) 543 544 if cigar_op in "MS": 545 ref_pos = right_pos 546 if cigar_op == "S" and i == 0: 547 ref_seq = chr_dic[chr][right_pos-length:right_pos] 548 ref_pos = right_pos - length 549 else: 550 ref_seq = chr_dic[chr][right_pos:right_pos+length] 551 552 ref_seq = ref_seq.upper() 553 if length == len(ref_seq): 554 for j in range(length): 555 if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]: 556 if snps_dict == None: 557 XM += 1 558 else: 559 if snps == None: 560 snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq)) 561 562 found_snp = check_snps(snps, 'single', ref_pos + j, read_seq[read_pos + j]) 563 if not found_snp: 564 XM += 1 565 566 if hisat2 and cigar_op == "S": 567 HISAT2_XM += 1 568 HISAT2_NM += 1 569 else: 570 XM += length 571 572 if cigar_op in "I": 573 if snps == None: 574 snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq)) 575 found_snp = check_snps(snps, 'insertion', right_pos, read_seq[read_pos:read_pos + length]) 576 if not found_snp: 577 gap += length 578 579 if cigar_op in "D": 580 if snps == None: 581 snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq)) 582 found_snp = check_snps(snps, 'deletion', right_pos, length) 583 if not found_snp: 584 gap += length 585 586 if cigar_op in "MND": 587 right_pos += length 588 589 if cigar_op in "MIS": 590 read_pos += length 591 592 if cigar_op == "N": 593 junction_read = True 594 595 NM = XM + gap 596 if hisat2: 597 XM, NM = HISAT2_XM, HISAT2_NM 598 if NM < MAX_EDIT: 599 print >> temp_read_file, "%s\t%d\t%s\t%s\t%s\tXM:i:%d\tNM:i:%d" % \ 600 (read_id, flag, chr, pos, cigar_str, XM, NM) 601 602 found = False 603 me = "%s\t%s\t%d" % (read_id, chr, pos) 604 partner = "%s\t%s\t%d" % (read_id, mate_chr, mate_pos) 605 if partner in read_dic: 606 maps = read_dic[partner] 607 for map in maps: 608 if map[0] == me: 609 mate_flag, mate_cigar_str, mate_XM, mate_NM = map[1:] 610 if mate_pos > pos: 611 flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr_str, mate_pos, mate_cigar_str, mate_XM, mate_NM = \ 612 mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM 613 614 print >> temp_pair_file, "%s\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d" % \ 615 (read_id, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM) 616 found = True 617 break 618 619 if not found: 620 if not me in read_dic: 621 read_dic[me] = [] 622 623 read_dic[me].append([partner, flag, cigar_str, XM, NM]) 624 625 sam_file.close() 626 627 temp_read_file.close() 628 temp_pair_file.close() 629 630 unmapped_read_1_fq.close() 631 unmapped_read_2_fq.close() 632 633 634 sort = False 635 if vg: 636 sort = True 637 638 if sort: 639 command = "sort %s | uniq > %s; rm %s" % (temp_read_filename, read_filename, temp_read_filename) 640 os.system(command) 641 642 command = "sort %s | uniq > %s; rm %s" % (temp_pair_filename, pair_filename, temp_pair_filename) 643 os.system(command) 644 else: 645 command = "mv %s %s; mv %s %s" % (temp_read_filename, read_filename, temp_pair_filename, pair_filename) 646 os.system(command) 647 648 649def remove_redundant_junctions(junctions): 650 temp_junctions = [] 651 for junction in junctions: 652 temp_junctions.append(to_junction(junction)) 653 junctions = sorted(list(temp_junctions), cmp=junction_cmp) 654 temp_junctions = [] 655 for can_junction in junctions: 656 if len(temp_junctions) <= 0: 657 temp_junctions.append(can_junction) 658 else: 659 chr, left, right = temp_junctions[-1] 660 chr2, left2, right2 = can_junction 661 if chr == chr2 and \ 662 abs(left - left2) == abs(right - right2) and \ 663 abs(left - left2) <= 10: 664 continue 665 temp_junctions.append(can_junction) 666 junctions = set() 667 for junction in temp_junctions: 668 junctions.add(to_junction_str(junction)) 669 670 return junctions 671 672 673 674def read_stat(read_filename, gtf_junctions, chr_dic = None, debug = False): 675 read_stat = [[0, 0, 0] for i in range(MAX_EDIT)] 676 temp_junctions = [set() for i in range(MAX_EDIT)] 677 temp_gtf_junctions = [set() for i in range(MAX_EDIT)] 678 679 alignment = [] 680 prev_read_id = "" 681 read_file = open(read_filename, "r") 682 for line in read_file: 683 read_id, flag, chr, pos, cigar_str, XM, NM = line[:-1].split() 684 flag, pos = int(flag), int(pos) 685 XM, NM = int(XM[5:]), int(NM[5:]) 686 687 read_junctions, junction_read, gtf_junction_read = \ 688 is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str) 689 690 if junction_read: 691 for junction_str, is_gtf_junction in read_junctions: 692 if NM < len(temp_junctions): 693 temp_junctions[NM].add(junction_str) 694 695 if is_gtf_junction: 696 temp_gtf_junctions[NM].add(junction_str) 697 698 if read_id != prev_read_id: 699 if prev_read_id != "": 700 NM2, junction_read2, gtf_junction_read2 = alignment 701 if NM2 < len(read_stat): 702 read_stat[NM2][0] += 1 703 704 if junction_read2: 705 read_stat[NM2][1] += 1 706 707 if gtf_junction_read2: 708 read_stat[NM2][2] += 1 709 710 alignment = [] 711 712 prev_read_id = read_id 713 714 if not alignment: 715 alignment = [NM, junction_read, gtf_junction_read] 716 elif alignment[0] > NM or \ 717 (alignment[0] == NM and not alignment[2] and junction_read): 718 alignment = [NM, junction_read, gtf_junction_read] 719 720 read_file.close() 721 722 for i in range(len(read_stat)): 723 temp_junctions[i] = remove_redundant_junctions(temp_junctions[i]) 724 temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i]) 725 726 for i in range(len(read_stat)): 727 read_stat[i].append(len(temp_junctions[i])) 728 read_stat[i].append(len(temp_gtf_junctions[i])) 729 730 if alignment: 731 NM2, junction_read2, gtf_junction_read2 = alignment 732 if NM2 < len(read_stat): 733 read_stat[NM2][0] += 1 734 735 if junction_read2: 736 read_stat[NM2][1] += 1 737 738 if gtf_junction_read2: 739 read_stat[NM2][2] += 1 740 741 return read_stat 742 743 744def cal_read_len(cigar_str): 745 length = 0 746 leftmost_softclip = 0 747 rightmost_softclip = 0 748 cigars = cigar_re.findall(cigar_str) 749 750 for i in range(len(cigars)): 751 cigar = cigars[i] 752 cigar_length = int(cigar[:-1]) 753 cigar_op = cigar[-1] 754 755 if cigar_op in "MIS": 756 length += cigar_length 757 758 if (i == 0) and (cigar_op == "S"): 759 leftmost_softclip = cigar_length 760 if (i == (len(cigars) - 1)) and (cigar_op == "S"): 761 rightmost_softclip = cigar_length 762 763 return length, leftmost_softclip, rightmost_softclip 764 765def is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM): 766 concord_length = 1000 767 segment_length = sys.maxint 768 769 pairs = {} 770 pairs[0] = [flag, chr, pos, cigar_str, XM, NM] 771 pairs[1] = [mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM] 772 773 if chr != mate_chr: 774 return False, segment_length 775 if (flag & 0x10 == 0x10) or (mate_flag & 0x10 == 0): 776 return False, segment_length 777 778 assert pos <= mate_pos 779 780 left = pairs[0] 781 right = pairs[1] 782 783 left_start = left[2] 784 left_len, _, _ = cal_read_len(left[3]) # cigar 785 786 right_start = right[2] 787 right_len, _, right_soft = cal_read_len(right[3]) 788 789 segment_length = (right_start + right_len) - left_start - right_soft 790 assert segment_length >= 0 791 792 if segment_length > concord_length: 793 return False, segment_length 794 795 return True, segment_length 796 797def pair_stat(pair_filename, gtf_junctions, chr_dic): 798 # pair_stat = NM, junction_pair, gtf_junction, concordant_alignment] 799 pair_stat = [[0, 0, 0, 0] for i in range(MAX_EDIT)] 800 dis_pair_stat = [0 for i in range(MAX_EDIT)] 801 temp_junctions = [set() for i in range(MAX_EDIT)] 802 temp_gtf_junctions = [set() for i in range(MAX_EDIT)] 803 804 alignment, dis_alignments = [], [] 805 prev_read_id = "" 806 con_file = open(pair_filename + ".con", "w") 807 discon_file = open(pair_filename + ".discon", "w") 808 pair_file = open(pair_filename, "r") 809 for line in pair_file: 810 read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM = line[:-1].split() 811 flag, pos, XM, NM, mate_flag, mate_pos, mate_XM, mate_NM = \ 812 int(flag), int(pos), int(XM[5:]), int(NM[5:]), int(mate_flag), int(mate_pos), int(mate_XM[5:]), int(mate_NM[5:]) 813 814 pair_XM = XM + mate_XM 815 pair_NM = NM + mate_NM 816 817 pair_junctions, junction_pair, gtf_junction_pair = \ 818 is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str) 819 820 # check concordantly 821 concord_align, segment_len = is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM) 822 print >> (con_file if concord_align else discon_file), line.strip(), ('none', 'first')[(flag & 0x40 == 0x40)], ('none', 'last')[(mate_flag & 0x80 == 0x80)], segment_len 823 824 if junction_pair: 825 for junction_str, is_gtf_junction in pair_junctions: 826 if pair_NM < len(temp_junctions): 827 temp_junctions[pair_NM].add(junction_str) 828 829 if is_gtf_junction: 830 temp_gtf_junctions[pair_NM].add(junction_str) 831 832 if read_id != prev_read_id: 833 if prev_read_id != "": 834 NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment 835 if NM2 < len(pair_stat): 836 pair_stat[NM2][0] += 1 837 838 if junction_read2: 839 pair_stat[NM2][1] += 1 840 if gtf_junction_read2: 841 pair_stat[NM2][2] += 1 842 if concord_align2: 843 pair_stat[NM2][3] += 1 844 845 for NM2 in dis_alignments: 846 if NM2 < len(dis_pair_stat): 847 dis_pair_stat[NM2] += 1 848 849 alignment = [] 850 dis_alignment = [] 851 852 prev_read_id = read_id 853 854 if not alignment: 855 alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align] 856 elif alignment[0] > pair_NM or \ 857 (alignment[0] == pair_NM and not alignment[2] and junction_pair): 858 alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align] 859 860 if mate_chr != chr or ((flag & 0x10) != 0 or (mate_flag & 0x10) == 0): 861 if len(dis_alignments) == 0: 862 dis_alignments = [pair_NM] 863 elif dis_alignments[0] > pair_NM: 864 dis_alignments = [pair_NM] 865 866 pair_file.close() 867 con_file.close() 868 discon_file.close() 869 870 # process last line 871 if alignment: 872 NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment 873 if NM2 < len(pair_stat): 874 pair_stat[NM2][0] += 1 875 876 if junction_read2: 877 pair_stat[NM2][1] += 1 878 if gtf_junction_read2: 879 pair_stat[NM2][2] += 1 880 881 if concord_align2: 882 pair_stat[NM2][3] += 1 883 884 assert len(dis_alignments) <= 1 885 for NM2 in dis_alignments: 886 if NM2 < len(dis_pair_stat): 887 dis_pair_stat[NM2] += 1 888 889 for i in range(len(pair_stat)): 890 temp_junctions[i] = remove_redundant_junctions(temp_junctions[i]) 891 temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i]) 892 893 for i in range(len(pair_stat)): 894 pair_stat[i].append(len(temp_junctions[i])) 895 pair_stat[i].append(len(temp_gtf_junctions[i])) 896 897 return pair_stat, dis_pair_stat 898 899 900def sql_execute(sql_db, sql_query): 901 sql_cmd = [ 902 "sqlite3", sql_db, 903 "-separator", "\t", 904 "%s;" % sql_query 905 ] 906 # print >> sys.stderr, sql_cmd 907 sql_process = subprocess.Popen(sql_cmd, stdout=subprocess.PIPE) 908 output = sql_process.communicate()[0][:-1] 909 return output 910 911 912def create_sql_db(sql_db): 913 if os.path.exists(sql_db): 914 print >> sys.stderr, sql_db, "already exists!" 915 return 916 917 columns = [ 918 ["id", "integer primary key autoincrement"], 919 ["reads", "text"], 920 ["genome", "text"], 921 ["end_type", "text"], 922 ["aligner", "text"], 923 ["version", "test"], 924 ["use_annotation", "text"], 925 ["edit_distance", "integer"], 926 ["mapped_reads", "integer"], 927 ["junction_reads", "integer"], 928 ["gtf_junction_reads", "integer"], 929 ["junctions", "integer"], 930 ["gtf_junctions", "integer"], 931 ["runtime", "real"], 932 ["host", "text"], 933 ["created", "text"], 934 ["cmd", "text"] 935 ] 936 937 sql_create_table = "CREATE TABLE Mappings (" 938 for i in range(len(columns)): 939 name, type = columns[i] 940 if i != 0: 941 sql_create_table += ", " 942 sql_create_table += ("%s %s" % (name, type)) 943 sql_create_table += ");" 944 sql_execute(sql_db, sql_create_table) 945 946 947def write_analysis_data(sql_db, database_name, paired): 948 if not os.path.exists(sql_db): 949 return 950 951 if paired: 952 paired = "paired" 953 else: 954 paired = "single" 955 956 aligners = [] 957 sql_aligners = "SELECT aligner FROM Mappings WHERE end_type = '%s' GROUP BY aligner" % (paired) 958 output = sql_execute(sql_db, sql_aligners) 959 aligners = output.split() 960 961 database_fname = database_name + "_" + paired + ".analysis" 962 database_file = open(database_fname, "w") 963 964 print >> database_file, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime" 965 for aligner in aligners: 966 for edit_distance in range(MAX_EDIT): 967 sql_row = "SELECT aligner, use_annotation, end_type, edit_distance, mapped_reads, junction_reads, gtf_junction_reads, junctions, gtf_junctions, runtime FROM Mappings" 968 sql_row += " WHERE reads = '%s' and aligner = '%s' and edit_distance = %d and end_type = '%s' ORDER BY created DESC LIMIT 1" % (database_name, aligner, edit_distance, paired) 969 output = sql_execute(sql_db, sql_row) 970 if output: 971 print >> database_file, output 972 973 database_file.close() 974 975 976def calculate_read_cost(single_end, 977 paired_end, 978 test_aligners, 979 fresh, 980 runtime_only, 981 verbose): 982 sql_db_name = "analysis.db" 983 if not os.path.exists(sql_db_name): 984 create_sql_db(sql_db_name) 985 986 full_workdir = os.getcwd() 987 workdir = full_workdir.split("/")[-1] 988 989 num_cpus = multiprocessing.cpu_count() 990 if num_cpus > 8: 991 num_threads = min(8, num_cpus) 992 desktop = False 993 else: 994 num_threads = min(3, num_cpus) 995 desktop = True 996 997 verbose = False 998 sql_write = True 999 is_large_file = False 1000 gz_file = False 1001 if os.path.exists("1.fq.gz"): 1002 gz_file = True 1003 if os.path.getsize("1.fq.gz") > (1024 * 1024 * 1024): 1004 is_large_file = True 1005 1006 elif os.path.exists("1.fq"): 1007 gz_file = False 1008 if os.path.getsize("1.fq") > (2 * 1024 * 1024 * 1024): 1009 is_large_file = True 1010 1011 else: 1012 assert(False) 1013 1014 1015 aligners = [ 1016 # ["hisat2", "", "", "", ""], 1017 # ["hisat2", "", "", "", "--sensitive"], 1018 # ["hisat2", "", "", "", "--very-sensitive"], 1019 # ["hisat2", "", "", "", "-k 50 --score-min C,-50,0"], 1020 # ["hisat2", "", "snp", "", ""], 1021 # ["hisat2", "", "snp", "", "--sensitive"], 1022 # ["hisat2", "", "snp", "", "-k 50"], 1023 # ["hisat2", "", "", "205", ""], 1024 # ["hisat2", "", "snp", "205", ""], 1025 # ["hisat2", "", "snp_tran", "205", ""], 1026 # ["hisat2", "", "tran", "", ""], 1027 # ["hisat2", "x1", "snp", "", ""], 1028 # ["hisat2", "x1", "", "", ""], 1029 # ["hisat2", "x2", "", "", ""], 1030 # ["hisat2", "", "tran", "", ""], 1031 # ["hisat2", "", "snp_tran", "204", ""], 1032 # ["hisat2", "", "snp_tran", "", ""], 1033 # ["hisat2", "", "", "210", ""], 1034 ["hisat2", "", "rep", "", ""], 1035 # ["hisat2", "", "rep", "", "--read-lengths 101"], 1036 # ["hisat2", "", "rep", "", "--sensitive"], 1037 # ["hisat2", "", "rep-100-300", "", ""], 1038 # ["hisat2", "", "rep-101-300", "", "--sensitive"], 1039 # ["hisat2", "", "rep-101-300", "", "-k 10 --score-min C,-50,0"], 1040 # ["hisat2", "", "rep-150-300", "", ""], 1041 # ["tophat2", "", "", "", ""], 1042 # ["bowtie", "", "", "", ""], 1043 ["bowtie2", "", "", "", ""], 1044 # ["bowtie2", "", "", "", "-k 10"], 1045 ["bwa", "mem", "", "", ""], 1046 # ["bwa", "mem", "", "", "-a"], 1047 # ["bwa", "sw", "", "", ""], 1048 # ["star", "", "", "", ""], 1049 # ["star", "x2", "", "", ""], 1050 # ["vg", "", "", "", ""], 1051 # ["vg", "", "", "", "-M 10"], 1052 # ["vg", "", "snp", "", ""], 1053 # ["vg", "", "snp", "", "-M 10"], 1054 # ["minimap2", "", "", "", ""], 1055 ] 1056 1057 # sql_write = False 1058 verbose = True 1059 debug = False 1060 1061 genome = "genome" 1062 cwd = os.getcwd() 1063 RNA = (cwd.find("RNA") != -1) 1064 1065 chr_dic = read_genome("../../../data/" + genome + ".fa") 1066 snp_dic = read_snp("../../../data/" + genome + ".snp") 1067 gtf_junction_strs = extract_splice_sites("../../../data/" + genome + ".gtf") 1068 gene = "no" 1069 gtf_junctions = [] 1070 for junction_str in gtf_junction_strs: 1071 junction = to_junction(junction_str) 1072 gtf_junctions.append(junction) 1073 gtf_junctions = sorted(gtf_junctions, cmp=junction_cmp) 1074 1075 print >> sys.stderr, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime" 1076 1077 for paired in [False, True]: 1078 if not paired and not single_end: 1079 continue 1080 if paired and not paired_end: 1081 continue 1082 1083 type_read1_fname = "1.fq" 1084 if gz_file: 1085 type_read1_fname += ".gz" 1086 1087 if paired: 1088 type_read2_fname = "2.fq" 1089 if gz_file: 1090 type_read2_fname += ".gz" 1091 1092 else: 1093 type_read2_fname = "" 1094 1095 aligner_bin_base = "../../../../aligners/bin" 1096 def get_aligner_version(aligner): 1097 version = "" 1098 if aligner == "hisat2" or \ 1099 aligner == "hisat" or \ 1100 aligner == "bowtie" or \ 1101 aligner == "bowtie2": 1102 if version: 1103 cmd = ["%s/%s_%s/%s" % (aligner_bin_base, aligner, version, aligner)] 1104 else: 1105 cmd = ["%s/%s" % (aligner_bin_base, aligner)] 1106 cmd += ["--version"] 1107 cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) 1108 version = cmd_process.communicate()[0][:-1].split("\n")[0] 1109 version = version.split()[-1] 1110 elif aligner == "tophat2": 1111 cmd = ["%s/tophat" % (aligner_bin_base)] 1112 cmd += ["--version"] 1113 cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) 1114 version = cmd_process.communicate()[0][:-1].split()[-1] 1115 elif aligner in ["star", "starx2"]: 1116 version = "2.4.2a" 1117 elif aligner == "gsnap": 1118 cmd = ["%s/gsnap" % (aligner_bin_base)] 1119 cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE) 1120 version = cmd_process.communicate()[1][:-1].split("\n")[0] 1121 version = version.split()[2] 1122 elif aligner == "bwa": 1123 if version: 1124 cmd = ["%s/bwa_%s/bwa" % (aligner_bin_base, version)] 1125 else: 1126 cmd = ["%s/bwa" % (aligner_bin_base)] 1127 cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE) 1128 version = cmd_process.communicate()[1][:-1].split("\n")[2] 1129 version = version.split()[1] 1130 elif aligner == "vg": 1131 cmd = ["%s/vg" % (aligner_bin_base)] 1132 cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE) 1133 version = cmd_process.communicate()[1][:-1].split("\n")[0] 1134 version = version.split()[5] 1135 elif aligner == "minimap2": 1136 cmd = ["%s/minimap2" % (aligner_bin_base)] 1137 cmd += ["--version"] 1138 cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE) 1139 version = cmd_process.communicate()[0][:-1].split("\n")[0] 1140 1141 return version 1142 1143 index_base = "../../../../indexes" 1144 index_add = "" 1145 if genome != "genome": 1146 index_add = "_" + genome 1147 def get_aligner_cmd(RNA, aligner, type, index_type, version, options, read1_fname, read2_fname, out_fname, cmd_idx = 0): 1148 cmd = ["/usr/bin/time"] 1149 if osx_mode: 1150 cmd += ['-l'] 1151 if aligner == "hisat2": 1152 if version: 1153 cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)] 1154 else: 1155 cmd += ["%s/hisat2" % (aligner_bin_base)] 1156 if num_threads > 1: 1157 cmd += ["-p", str(num_threads)] 1158 1159 # cmd += ["-k", "5"] 1160 # cmd += ["--score-min", "C,-18"] 1161 1162 # daehwan - for debugging purposes 1163 # cmd += ["--score-min", "C,-50"] 1164 # cmd += ["--pen-cansplice", "0"] 1165 # cmd += ["--pen-noncansplice", "12"] 1166 # cmd += ["--pen-intronlen", "G,-8,1"] 1167 # cmd += ["--metrics", "1", 1168 # "--metrics-file", "metrics.out"] 1169 1170 if version == "204": 1171 cmd += ["--sp", "2,1"] 1172 1173 if not RNA: 1174 cmd += ["--no-spliced-alignment"] 1175 1176 if type in ["x1", "x2"]: 1177 cmd += ["--no-temp-splicesite"] 1178 1179 # DK - for debugging purposes 1180 # cmd += ["--dta"] 1181 """ 1182 if index_type == "tran": 1183 cmd += ["--no-anchorstop"] 1184 cmd += ["-k", "100"] 1185 """ 1186 1187 if options != "": 1188 cmd += options.split(' ') 1189 1190 if type == "x2": 1191 if cmd_idx == 0: 1192 cmd += ["--novel-splicesite-outfile"] 1193 else: 1194 cmd += ["--novel-splicesite-infile"] 1195 cmd += ["splicesites.txt"] 1196 1197 # "--novel-splicesite-infile", 1198 # "../splicesites.txt", 1199 # "--rna-strandness", 1200 # "FR", 1201 if version: 1202 index_cmd = "%s/HISAT2_%s%s/" % (index_base, version, index_add) + genome 1203 else: 1204 index_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome 1205 if index_type: 1206 index_cmd += ("_" + index_type) 1207 cmd += [index_cmd] 1208 if paired: 1209 cmd += ["-1", read1_fname, 1210 "-2", read2_fname] 1211 else: 1212 cmd += ["-U", read1_fname] 1213 elif aligner == "hisat": 1214 cmd += ["%s/hisat" % (aligner_bin_base)] 1215 if num_threads > 1: 1216 cmd += ["-p", str(num_threads)] 1217 # cmd += ["-k", "5"] 1218 # cmd += ["--score-min", "C,-18"] 1219 if version != "": 1220 version = int(version) 1221 else: 1222 version = sys.maxint 1223 1224 if not RNA: 1225 cmd += ["--no-spliced-alignment"] 1226 1227 if type in ["x1", "x2"] or not RNA: 1228 cmd += ["--no-temp-splicesite"] 1229 1230 """ 1231 cmd += ["--rdg", "100,100", 1232 "--rfg", "100,100"] 1233 """ 1234 1235 if type == "x2": 1236 if cmd_idx == 0: 1237 cmd += ["--novel-splicesite-outfile"] 1238 else: 1239 cmd += ["--novel-splicesite-infile"] 1240 cmd += ["splicesites.txt"] 1241 1242 # "--novel-splicesite-infile", 1243 # "../splicesites.txt", 1244 # "--rna-strandness", 1245 # "FR", 1246 cmd += ["%s/HISAT%s/" % (index_base, index_add) + genome] 1247 if paired: 1248 cmd += ["-1", read1_fname, 1249 "-2", read2_fname] 1250 else: 1251 cmd += [read1_fname] 1252 elif aligner == "tophat2": 1253 cmd += ["%s/tophat" % (aligner_bin_base)] 1254 if num_threads > 1: 1255 cmd += ["-p", str(num_threads)] 1256 cmd += ["--read-edit-dist", "3"] 1257 cmd += ["--no-sort-bam"] 1258 cmd += ["--read-realign-edit-dist", "0"] 1259 cmd += ["--keep-tmp", 1260 "%s/HISAT%s/" % (index_base, index_add) + genome, 1261 read1_fname] 1262 if paired: 1263 cmd += [read2_fname] 1264 elif aligner == "star": 1265 cmd += ["%s/STAR" % (aligner_bin_base)] 1266 if num_threads > 1: 1267 cmd += ["--runThreadN", str(num_threads)] 1268 if type == "x2" and cmd_idx == 1: 1269 cmd += ["--genomeDir", "."] 1270 else: 1271 cmd += ["--genomeDir", "%s/STAR%s" % (index_base, index_add)] 1272 if desktop: 1273 cmd += ["--genomeLoad", "NoSharedMemory"] 1274 else: 1275 cmd += ["--genomeLoad", "LoadAndKeep"] 1276 if type == "x2": 1277 if cmd_idx == 1: 1278 cmd += ["--alignSJDBoverhangMin", "1"] 1279 cmd += ["--readFilesIn", 1280 read1_fname] 1281 if paired: 1282 cmd += [read2_fname] 1283 if paired: 1284 cmd += ["--outFilterMismatchNmax", "6"] 1285 else: 1286 cmd += ["--outFilterMismatchNmax", "3"] 1287 elif aligner == "bowtie": 1288 cmd += ["%s/bowtie" % (aligner_bin_base)] 1289 if num_threads > 1: 1290 cmd += ["-p", str(num_threads)] 1291 cmd += ["--sam", 1292 "-k", "10"] 1293 cmd += ["-n", "3"] 1294 if paired: 1295 cmd += ["-X", "500"] 1296 cmd += ["%s/Bowtie%s/" % (index_base, index_add) + genome] 1297 if paired: 1298 cmd += ["-1", read1_fname, 1299 "-2", read2_fname] 1300 else: 1301 cmd += [read1_fname] 1302 elif aligner == "bowtie2": 1303 if version: 1304 cmd += ["%s/bowtie2_%s/bowtie2" % (aligner_bin_base, version)] 1305 else: 1306 cmd += ["%s/bowtie2" % (aligner_bin_base)] 1307 if num_threads > 1: 1308 cmd += ["-p", str(num_threads)] 1309 #cmd += ["-k", "10"] 1310 #cmd += ["--score-min", "C,-18"] 1311 cmd += ["-X", "1000"] 1312 1313 if options: 1314 cmd += options.split(' ') 1315 1316 if version: 1317 cmd += ["-x %s/Bowtie2_%s%s/" % (index_base, version, index_add) + genome] 1318 else: 1319 cmd += ["-x %s/Bowtie2%s/" % (index_base, index_add) + genome] 1320 if paired: 1321 cmd += ["-1", read1_fname, 1322 "-2", read2_fname] 1323 else: 1324 cmd += [read1_fname] 1325 elif aligner == "gsnap": 1326 cmd += ["%s/gsnap" % (aligner_bin_base), 1327 "-A", 1328 "sam"] 1329 if num_threads > 1: 1330 cmd += ["-t", str(num_threads)] 1331 cmd += ["--max-mismatches=3", 1332 "-D", "%s/GSNAP%s" % (index_base, index_add), 1333 "-N", "1", 1334 "-d", genome, 1335 read1_fname] 1336 if paired: 1337 cmd += [read2_fname] 1338 elif aligner == "bwa": 1339 if version: 1340 cmd += ["%s/bwa_%s/bwa" % (aligner_bin_base, version)] 1341 else: 1342 cmd += ["%s/bwa" % (aligner_bin_base)] 1343 if type in ["mem", "aln"]: 1344 cmd += [type] 1345 elif type == "sw": 1346 cmd += ["bwa" + type] 1347 if num_threads > 1: 1348 cmd += ["-t", str(num_threads)] 1349 if options: 1350 cmd += options.split(' ') 1351 if version: 1352 cmd += ["%s/BWA_%s%s/%s.fa" % (index_base, version, index_add, genome)] 1353 else: 1354 cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)] 1355 cmd += [read1_fname] 1356 if paired: 1357 cmd += [read2_fname] 1358 elif aligner == "vg": 1359 # vg map -d 22 -t 6 -M 10 -f ../sim-1.fa -f ../sim-2.fa --surject-to sam > result.sam 1360 cmd += ["%s/vg" % (aligner_bin_base)] 1361 cmd += ["map"] 1362 cmd += ["-t", str(num_threads)] 1363 cmd += ["--surject-to", "sam"] 1364 index_cmd = "%s/VG%s/" % (index_base, index_add) + genome 1365 if index_type: 1366 index_cmd += ("_" + index_type) 1367 1368 if options: 1369 cmd += options.split(' ') 1370 1371 cmd += ["-d", index_cmd] 1372 1373 cmd += ["-f", read1_fname] 1374 if paired: 1375 cmd += ["-f", read2_fname] 1376 1377 elif aligner == "minimap2": 1378 # minimap2 -a -x sr 22.mmi sim_1.fa sim_2.fa > result.sam 1379 cmd += ["%s/minimap2" % (aligner_bin_base)] 1380 cmd += ["-a"] 1381 cmd += ["-x", "sr"] 1382 index_cmd = "%s/minimap2%s/" % (index_base, index_add) + genome 1383 if index_type: 1384 index_cmd += ("_" + index_type) 1385 index_cmd += ".mmi" 1386 cmd += [index_cmd] 1387 cmd += [read1_fname] 1388 if paired: 1389 cmd += [read2_fname] 1390 else: 1391 assert False 1392 1393 return cmd 1394 1395 for aligner, type, index_type, version, options in aligners: 1396 skip = False 1397 if len(test_aligners) > 0: 1398 skip = True 1399 for test_aligner in test_aligners: 1400 if aligner == test_aligner: 1401 skip = False 1402 if skip: 1403 continue 1404 1405 aligner_name = aligner + type + version 1406 if (aligner == "hisat2" or aligner == "vg") and index_type != "": 1407 aligner_name += ("_" + index_type) 1408 1409 if options != "": 1410 option_name = options.replace(' ', '').replace('-', '').replace(',', '') 1411 aligner_name = aligner_name + '_' + option_name 1412 if paired: 1413 aligner_dir = aligner_name + "_paired" 1414 else: 1415 aligner_dir = aligner_name + "_single" 1416 1417 if fresh and os.path.exists(aligner_dir): 1418 os.system("rm -rf %s" % aligner_dir) 1419 1420 if not os.path.exists(aligner_dir): 1421 os.mkdir(aligner_dir) 1422 os.chdir(aligner_dir) 1423 1424 out_fname = "accepted.sam" 1425 aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname) 1426 duration = 0.1 1427 mem_usage = '' 1428 if not os.path.exists(out_fname): 1429 if not os.path.exists("../one.fq") or not os.path.exists("../two.fq"): 1430 if gz_file: 1431 os.system("gzip -cd ../1.fq.gz | head -400 > ../one.fq") 1432 os.system("gzip -cd ../2.fq.gz | head -400 > ../two.fq") 1433 else: 1434 os.system("head -400 ../1.fq > ../one.fq") 1435 os.system("head -400 ../2.fq > ../two.fq") 1436 1437 # dummy commands for caching index 1438 loading_time = 0 1439 if aligner not in ["tophat2"]: 1440 for i in range(3): 1441 dummy_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../one.fq", "../two.fq", "/dev/null") 1442 start_time = datetime.now() 1443 if verbose: 1444 print >> sys.stderr, start_time, "\t", " ".join(dummy_cmd) 1445 if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]: 1446 proc = subprocess.Popen(dummy_cmd, stdout=open("/dev/null", "w"), stderr=subprocess.PIPE) 1447 else: 1448 proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1449 proc.communicate() 1450 finish_time = datetime.now() 1451 duration = finish_time - start_time 1452 duration = duration.total_seconds() 1453 if verbose: 1454 print >> sys.stderr, finish_time, "duration:", duration 1455 loading_time = duration 1456 1457 # align all reads 1458 if paired: 1459 sweep_read_cmd = "cat ../%s ../%s > /dev/null" % (type_read1_fname, type_read2_fname) 1460 else: 1461 sweep_read_cmd = "cat ../%s > /dev/null" % (type_read1_fname) 1462 print >> sys.stderr, datetime.now(), "\t", sweep_read_cmd 1463 os.system(sweep_read_cmd) 1464 1465 skip_alignment = False 1466 if paired and aligner == "olego" and os.path.exists(out_fname + "1"): 1467 skip_alignment = True 1468 1469 if not skip_alignment: 1470 aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname) 1471 start_time = datetime.now() 1472 if verbose: 1473 print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd) 1474 if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa", "vg", "minimap2"]: 1475 proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE) 1476 else: 1477 proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1478 _, mem_usage = proc.communicate() 1479 mem_usage = parse_mem_usage(mem_usage) 1480 finish_time = datetime.now() 1481 duration = finish_time - start_time 1482 duration = duration.total_seconds() - loading_time 1483 if duration < 0.1: 1484 duration = 0.1 1485 if verbose: 1486 print >> sys.stderr, finish_time, "duration:", duration 1487 1488 if verbose: 1489 print >> sys.stderr, finish_time, "Memory Usage: %dMB" % (int(mem_usage) / 1024) 1490 1491 if debug and aligner == "hisat" and type == "x1": 1492 os.system("cat metrics.out") 1493 print >> sys.stderr, "\ttime: %.4f" % (duration) 1494 # break 1495 1496 if aligner == "star" and type in ["", "gtf"]: 1497 os.system("mv Aligned.out.sam %s" % out_fname) 1498 elif aligner in ["hisat2", "hisat"] and type == "x2": 1499 aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1) 1500 if verbose: 1501 print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd) 1502 start_time = datetime.now() 1503 proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE) 1504 proc.communicate() 1505 finish_time = datetime.now() 1506 duration += (finish_time - start_time).total_seconds() 1507 duration -= loading_time 1508 if duration < 0.1: 1509 duration = 0.1 1510 if verbose: 1511 print >> sys.stderr, finish_time, "duration:", duration 1512 elif aligner == "star" and type == "x2": 1513 assert os.path.exists("SJ.out.tab") 1514 os.system("awk 'BEGIN {OFS=\"\t\"; strChar[0]=\".\"; strChar[1]=\"+\"; strChar[2]=\"-\";} {if($5>0){print $1,$2,$3,strChar[$4]}}' SJ.out.tab > SJ.out.tab.Pass1.sjdb") 1515 for file in os.listdir("."): 1516 if file in ["SJ.out.tab.Pass1.sjdb", "genome.fa"]: 1517 continue 1518 os.remove(file) 1519 star_index_cmd = "STAR --genomeDir ./ --runMode genomeGenerate --genomeFastaFiles ../../../../data/genome.fa --sjdbFileChrStartEnd SJ.out.tab.Pass1.sjdb --sjdbOverhang 100 --runThreadN %d" % (num_threads) 1520 print >> sys.stderr, "\t", datetime.now(), star_index_cmd 1521 os.system(star_index_cmd) 1522 if verbose: 1523 print >> sys.stderr, "\t", datetime.now(), " ".join(dummy_cmd) 1524 proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1525 proc.communicate() 1526 if verbose: 1527 print >> sys.stderr, "\t", datetime.now(), "finished" 1528 aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1) 1529 start_time = datetime.now() 1530 if verbose: 1531 print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd) 1532 proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 1533 proc.communicate() 1534 finish_time = datetime.now() 1535 duration += (finish_time - start_time).total_seconds() 1536 duration -= loading_time 1537 if duration < 0.1: 1538 duration = 0.1 1539 if verbose: 1540 print >> sys.stderr, "\t", finish_time, "finished:", duration 1541 os.system("mv Aligned.out.sam %s" % out_fname) 1542 elif aligner == "tophat2": 1543 os.system("samtools sort -n tophat_out/accepted_hits.bam accepted_hits; samtools view -h accepted_hits.bam > %s" % out_fname) 1544 elif aligner == "vg": 1545 index_name = '%s/VG%s/' % (index_base, index_add) + genome 1546 if index_type: 1547 index_name += ('_' + index_type) 1548 1549 os.system("echo %s %s %f >> runtime" % (str(datetime.now()), aligner, duration)) 1550 1551 if aligner in ["star", "tophat2", "gsnap"]: 1552 os.system("tar cvzf %s.tar.gz %s &> /dev/null" % (out_fname, out_fname)) 1553 1554 if runtime_only: 1555 os.chdir("..") 1556 continue 1557 1558 suffix = aligner 1559 read_sam, pair_sam = suffix + ".read.sam", suffix + ".pair.sam" 1560 unmapped_read_1_fq, unmapped_read_2_fq = suffix + ".unmapped.1.fq", suffix + ".unmapped.2.fq" 1561 if not os.path.exists(read_sam) or not os.path.exists(pair_sam): 1562 if index_type == 'snp': 1563 extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq, snp_dic) 1564 else: 1565 extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq) 1566 1567 1568 1569 out = '' 1570 if gz_file: 1571 out = subprocess.check_output("gzip -cd ../%s | wc -l" % type_read1_fname, shell=True) 1572 else: 1573 out = subprocess.check_output("wc -l ../%s" % type_read1_fname, shell=True) 1574 1575 numreads = int(out.split()[0]) / 4 1576 1577 done_filename = suffix + ".done" 1578 if not os.path.exists(done_filename): 1579 done_file = open(done_filename, "w") 1580 if paired: 1581 sum = [0, 0, 0, 0, 0, 0] # mappep_read, junction_read, gtf_junction_reads, concord_mapped_read, num_junctions, num_gtf_junctions 1582 dis_sum = 0 1583 stat, dis_stat = pair_stat(pair_sam, gtf_junctions, chr_dic) 1584 output = "" 1585 for i in range(len(stat)): 1586 for j in range(len(sum)): 1587 sum[j] += stat[i][j] 1588 1589 dis_sum += dis_stat[i] 1590 mapped_reads, junction_reads, gtf_junction_reads, concord_mapped_read, num_junctions, num_gtf_junctions = sum 1591 output += "%s\t%s\tpaired\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\t%d\t%.2f%%\n" % \ 1592 (aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration)), concord_mapped_read, float(concord_mapped_read) * 100.0 / numreads) 1593 1594 if sql_write and os.path.exists("../" + sql_db_name): 1595 sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \ 1596 (workdir, genome, "paired", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd)) 1597 sql_execute("../" + sql_db_name, sql_insert) 1598 1599 1600 print >> sys.stderr, output, 1601 print >> done_file, output 1602 else: 1603 sum = [0, 0, 0, 0, 0] 1604 stat = read_stat(read_sam, gtf_junctions, chr_dic) 1605 output = "" 1606 for i in range(len(stat)): 1607 for j in range(len(sum)): 1608 sum[j] += stat[i][j] 1609 1610 mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions = sum 1611 output += "%s\t%s\tsingle\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\n" % \ 1612 (aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration))) 1613 1614 if sql_write and os.path.exists("../" + sql_db_name): 1615 sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \ 1616 (workdir, genome, "single", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd)) 1617 sql_execute("../" + sql_db_name, sql_insert) 1618 1619 print >> sys.stderr, output, 1620 print >> done_file, output 1621 1622 done_file.close() 1623 1624 1625 os.chdir("..") 1626 1627 if os.path.exists(sql_db_name): 1628 write_analysis_data(sql_db_name, workdir, paired) 1629 1630 1631 1632if __name__ == "__main__": 1633 parser = ArgumentParser( 1634 description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.') 1635 parser.add_argument('--single-end', 1636 dest='paired_end', 1637 action='store_false', 1638 help='run single-end only') 1639 parser.add_argument('--paired-end', 1640 dest='single_end', 1641 action='store_false', 1642 help='run paired_end only') 1643 parser.add_argument('--aligner-list', 1644 dest='aligner_list', 1645 type=str, 1646 default="", 1647 help='comma-separated list of aligners (e.g. hisat2,bowtie2,bwa') 1648 parser.add_argument('--fresh', 1649 dest='fresh', 1650 action='store_true', 1651 help='delete existing alignment related directories (e.g. hisat2_single)') 1652 parser.add_argument('--runtime-only', 1653 dest='runtime_only', 1654 action='store_true', 1655 help='run programs without evaluation') 1656 parser.add_argument('-v', '--verbose', 1657 dest='verbose', 1658 action='store_true', 1659 help='also print some statistics to stderr') 1660 1661 args = parser.parse_args() 1662 1663 aligners = [] 1664 for aligner in args.aligner_list.split(','): 1665 if aligner == "": 1666 continue 1667 aligners.append(aligner) 1668 1669 calculate_read_cost(args.single_end, 1670 args.paired_end, 1671 aligners, 1672 args.fresh, 1673 args.runtime_only, 1674 args.verbose) 1675 1676