1#!/usr/local/bin/python3.8
2
3import sys, os, subprocess, signal
4import multiprocessing
5import platform
6import string
7import re
8from datetime import datetime, date, time
9from collections import defaultdict
10from argparse import ArgumentParser, FileType
11
12osx_mode = False
13if sys.platform == 'darwin':
14    osx_mode = True
15
16MAX_EDIT = 21
17signal.signal(signal.SIGPIPE, signal.SIG_DFL)
18
19cigar_re = re.compile('\d+\w')
20
21"""
22"""
23def parse_mem_usage(resource):
24    if osx_mode:
25        resource = resource.strip().split('\n')
26        for line in resource:
27            if line.find('maximum resident set size') != -1:
28                return int(line.split()[0]) / 1024
29    else:
30        resource = resource.split(' ')
31        for line in resource:
32            idx = line.find('maxresident')
33            if idx != -1:
34                return line[:idx]
35
36    return '0'
37
38
39"""
40"""
41def reverse_complement(seq):
42    result = ""
43    for nt in seq:
44        base = nt
45        if nt == 'A':
46            base = 'T'
47        elif nt == 'a':
48            base = 't'
49        elif nt == 'C':
50            base = 'G'
51        elif nt == 'c':
52            base = 'g'
53        elif nt == 'G':
54            base = 'C'
55        elif nt == 'g':
56            base = 'c'
57        elif nt == 'T':
58            base = 'A'
59        elif nt == 't':
60            base = 'a'
61
62        result = base + result
63
64    return result
65
66
67"""
68"""
69def read_genome(genome_filename):
70    chr_dic = {}
71    genome_file = open(genome_filename, "r")
72
73    chr_name, sequence = "", ""
74    for line in genome_file:
75        if line[0] == ">":
76            if chr_name and sequence:
77                chr_dic[chr_name] = sequence
78
79            chr_name = line[1:-1].split()[0]
80            sequence = ""
81        else:
82            sequence += line[:-1]
83
84    if chr_name and sequence:
85        chr_dic[chr_name] = sequence
86
87    genome_file.close()
88
89    print >> sys.stderr, "genome is loaded"
90
91    return chr_dic
92
93
94"""
95"""
96def read_snp(snp_filename):
97    snps = defaultdict(list)
98    snp_file = open(snp_filename, 'r')
99
100    for line in snp_file:
101        line = line.strip()
102        if not line or line.startswith('#'):
103            continue
104        try:
105            snpID, type, chr, pos, data = line.split('\t')
106        except ValueError:
107            continue
108
109        assert type in ["single", "deletion", "insertion"]
110        if type == "deletion":
111            data = int(data)
112        snps[chr].append([snpID, type, int(pos), data])
113
114    print >> sys.stderr, "snp is loaded"
115
116    return snps
117
118
119"""
120"""
121def extract_splice_sites(gtf_fname):
122    trans = {}
123
124    gtf_file = open(gtf_fname)
125    # Parse valid exon lines from the GTF file into a dict by transcript_id
126    for line in gtf_file:
127        line = line.strip()
128        if not line or line.startswith('#'):
129            continue
130        if '#' in line:
131            line = line.split('#')[0].strip()
132
133        try:
134            chrom, source, feature, left, right, score, \
135                strand, frame, values = line.split('\t')
136        except ValueError:
137            continue
138        left, right = int(left), int(right)
139
140        if feature != 'exon' or left >= right:
141            continue
142
143        values_dict = {}
144        for attr in values.split(';')[:-1]:
145            attr, _, val = attr.strip().partition(' ')
146            values_dict[attr] = val.strip('"')
147
148        if 'gene_id' not in values_dict or \
149                'transcript_id' not in values_dict:
150            continue
151
152        transcript_id = values_dict['transcript_id']
153        if transcript_id not in trans:
154            trans[transcript_id] = [chrom, strand, [[left, right]]]
155        else:
156            trans[transcript_id][2].append([left, right])
157
158    gtf_file.close()
159
160    # Sort exons and merge where separating introns are <=5 bps
161    for tran, [chrom, strand, exons] in trans.items():
162            exons.sort()
163            tmp_exons = [exons[0]]
164            for i in range(1, len(exons)):
165                if exons[i][0] - tmp_exons[-1][1] <= 5:
166                    tmp_exons[-1][1] = exons[i][1]
167                else:
168                    tmp_exons.append(exons[i])
169            trans[tran] = [chrom, strand, tmp_exons]
170
171    # Calculate and print the unique junctions
172    junctions = set()
173    for chrom, strand, exons in trans.values():
174        for i in range(1, len(exons)):
175            junctions.add(to_junction_str([chrom, exons[i-1][1], exons[i][0]]))
176
177    return junctions
178
179
180def to_junction_str(junction):
181    return "%s-%d-%d" % (junction[0], junction[1], junction[2])
182
183
184def to_junction(junction_str):
185    chr, left, right = junction_str.split("-")
186    return [chr, int(left), int(right)]
187
188
189def junction_cmp(a, b):
190    if a[0] != b[0]:
191        if a[0] < b[0]:
192            return -1
193        else:
194            return 1
195
196    if a[1] != b[1]:
197        if a[1] < b[1]:
198            return -1
199        else:
200            return 1
201
202    if a[2] != b[2]:
203        if a[2] < b[2]:
204            return -1
205        else:
206            return 1
207
208    return 0
209
210
211# chr and pos are assumed to be integers
212def get_junctions(chr, pos, cigar_str, min_anchor_len = 0, read_len = 100):
213    junctions = []
214    right_pos = pos
215    cigars = cigar_re.findall(cigar_str)
216    cigars = [[int(cigars[i][:-1]), cigars[i][-1]] for i in range(len(cigars))]
217
218    left_anchor_lens = []
219    cur_left_anchor_len = 0
220    for i in range(len(cigars)):
221        length, cigar_op = cigars[i]
222        if cigar_op in "MI":
223            cur_left_anchor_len += length
224        elif cigar_op == "N":
225            assert cur_left_anchor_len > 0
226            left_anchor_lens.append(cur_left_anchor_len)
227            cur_left_anchor_len = 0
228
229    for i in range(len(cigars)):
230        length, cigar_op = cigars[i]
231        if cigar_op == "N":
232            left, right = right_pos - 1, right_pos + length
233
234            if i > 0 and cigars[i-1][1] in "ID":
235                if cigars[i-1][1] == "I":
236                    left += cigars[i-1][0]
237                else:
238                    left -= cigars[i-1][0]
239            if i + 1 < len(cigars) and cigars[i+1][1] in "ID":
240                if cigars[i+1][1] == "I":
241                    right -= cigars[i+1][0]
242                else:
243                    right += cigars[i+1][0]
244
245            junction_idx = len(junctions)
246            assert junction_idx < len(left_anchor_lens)
247            left_anchor_len = left_anchor_lens[junction_idx]
248            assert left_anchor_len > 0 and left_anchor_len < read_len
249            right_anchor_len = read_len - left_anchor_len
250            if left_anchor_len >= min_anchor_len and right_anchor_len >= min_anchor_len:
251                junctions.append([chr, left, right])
252
253        if cigar_op in "MND":
254            right_pos += length
255
256    return junctions
257
258
259def get_right(pos, cigars):
260    right_pos = pos
261    cigars = cigar_re.findall(cigars)
262    for cigar in cigars:
263        length = int(cigar[:-1])
264        cigar_op = cigar[-1]
265        if cigar_op in "MDN":
266            right_pos += length
267
268    return right_pos
269
270def get_cigar_chars(cigars):
271    cigars = cigar_re.findall(cigars)
272    cigar_chars = ""
273    for cigar in cigars:
274        cigar_op = cigar[-1]
275        cigar_chars += cigar_op
276
277    return cigar_chars
278
279def get_cigar_chars_MN(cigars):
280    cigars = cigar_re.findall(cigars)
281    cigar_chars = ""
282    for cigar in cigars:
283        cigar_op = cigar[-1]
284        if cigar_op in "MN":
285            if cigar_chars == "" or cigar_chars[-1] != cigar_op:
286                cigar_chars += cigar_op
287
288    return cigar_chars
289
290def is_non_canonical_junction_read(chr_dic, chr, left, cigars, canonical_junctions = [["GT", "AG"], ["GC", "AG"], ["AT", "AC"]]):
291    pos = left
292    for cigar in cigar_re.findall(cigars):
293        cigar_op = cigar[-1]
294        cigar_len = int(cigar[:-1])
295
296        if cigar_op in 'MD':
297            pos += cigar_len
298        elif cigar_op == 'N':
299            right = pos + cigar_len
300
301            donor = chr_dic[chr][pos-1:pos+1]
302            acceptor = chr_dic[chr][right-3:right-1]
303
304            rev_donor = reverse_complement(acceptor)
305            rev_acceptor = reverse_complement(donor)
306
307            # print donor, acceptor
308            # print rev_donor, rev_acceptor
309
310            if [donor, acceptor] not in canonical_junctions and [rev_donor, rev_acceptor] not in canonical_junctions:
311                return True
312
313            pos = right
314
315    return False
316
317def is_canonical_junction(chr_dic, junction):
318    chr, left, right = junction
319    donor = chr_dic[chr][left:left+2]
320    acceptor = chr_dic[chr][right-3:right-1]
321    rev_donor = reverse_complement(acceptor)
322    rev_acceptor = reverse_complement(donor)
323
324    if (donor == "GT" and acceptor == "AG") or \
325            (rev_donor == "GT" and rev_acceptor == "AG"):
326        return True
327
328    return False
329
330def is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str):
331    result_junctions = []
332    junctions = get_junctions(chr, pos, cigar_str, 0, 101)
333    for junction in junctions:
334        junction_str = to_junction_str(junction)
335        is_gtf_junction = False
336        def find_in_gtf_junctions(gtf_junctions, junction):
337            l, u = 0, len(gtf_junctions)
338            while l < u:
339                m = (l + u) / 2
340                assert m >= 0 and m < len(gtf_junctions)
341                cmp_result = junction_cmp(junction, gtf_junctions[m])
342                if cmp_result == 0:
343                    return m
344                elif cmp_result < 0:
345                    u = m
346                else:
347                    l = m + 1
348
349            return l
350
351        # allow small (<= 5bp) discrepancy for non-canonical splice sites.
352        relaxed_junction_dist = 5
353        chr, left, right = junction
354        gtf_index = find_in_gtf_junctions(gtf_junctions, [chr, left - relaxed_junction_dist, right - relaxed_junction_dist])
355        if gtf_index >= 0:
356            i = gtf_index
357            while i < len(gtf_junctions):
358                chr2, left2, right2 = gtf_junctions[i]
359                if chr2 > chr or \
360                        left2 - left > relaxed_junction_dist or \
361                        right2 - right > relaxed_junction_dist:
362                    break
363
364                if abs(left - left2) <= relaxed_junction_dist and left - left2 == right - right2:
365                    canonical = is_canonical_junction(chr_dic, gtf_junctions[i])
366                    if left == left2 or not canonical:
367                        is_gtf_junction = True
368                        break
369
370                i += 1
371
372        result_junctions.append([junction_str, is_gtf_junction])
373
374    is_gtf_junction_read = False
375    if len(result_junctions) > 0:
376        is_gtf_junction_read = True
377        for junction_str, is_gtf_junction in result_junctions:
378            if not is_gtf_junction:
379                is_gtf_junction_read = False
380                break
381
382    return result_junctions, len(result_junctions) > 0, is_gtf_junction_read
383
384
385def is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str):
386    junctions, junction_read, gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str)
387    mate_junctions, mate_junction_read, mate_gtf_junction_read = is_junction_read(chr_dic, gtf_junctions, mate_chr, mate_pos, mate_cigar_str)
388    junctions += mate_junctions
389    junction_pair = len(junctions) > 0
390    if junction_pair:
391        gtf_junction_pair = True
392        if junction_read and not gtf_junction_read:
393            gtf_junction_pair = False
394        if mate_junction_read and not mate_gtf_junction_read:
395            gtf_junction_pair = False
396    else:
397        gtf_junction_pair = False
398
399    return junctions, junction_pair, gtf_junction_pair
400
401"""
402"""
403def getSNPs(chr_snps, left, right):
404    low, high = 0, len(chr_snps)
405    while low < high:
406        mid = (low + high) / 2
407        snpID, type, pos, data = chr_snps[mid]
408        if pos < left:
409            low = mid + 1
410        else:
411            high = mid - 1
412
413    snps = []
414    for i in xrange(low, len(chr_snps)):
415        snp = chr_snps[i]
416        snpID, type, pos, data = snp
417        pos2 = pos
418        if type == "deletion":
419            pos2 += data
420        if pos2 >= right:
421            break
422        if pos >= left:
423            if len(snps) > 0:
424                _, prev_type, prev_pos, prev_data = snps[-1]
425                assert prev_pos <= pos
426                prev_pos2 = prev_pos
427                if prev_type == "deletion":
428                    prev_pos2 += prev_data
429                if pos <= prev_pos2:
430                    continue
431            snps.append(snp)
432
433    return snps
434
435"""
436"""
437def check_snps(snps, check_type, ref_pos, read_seq):
438    found = False
439
440    for snp in snps:
441        snp_type, snp_pos, snp_data = snp[1:4]
442
443        if snp_type == check_type:
444            if snp_type == 'single':
445                if snp_pos == ref_pos and snp_data[0] == read_seq[0]:
446                    found = True
447                    break
448            elif snp_type == 'insertion':
449                if snp_pos == ref_pos and snp_data == read_seq:
450                    found = True
451                    break
452
453            elif snp_type == 'deletion':
454                # snp_data and read_seq are length of sequence deleted
455                if snp_pos == ref_pos and int(snp_data) == int(read_seq):
456                    found = True
457                    break
458
459    return found
460
461
462def extract_reads_and_pairs(chr_dic, sam_filename, read_filename, pair_filename, unmapped_read_1_fq_name, unmapped_read_2_fq_name, snps_dict = None):
463    temp_read_filename, temp_pair_filename = read_filename + ".temp", pair_filename + ".temp"
464    temp_read_file, temp_pair_file = open(temp_read_filename, "w"), open(temp_pair_filename, "w")
465
466    unmapped_read_1_fq, unmapped_read_2_fq = open(unmapped_read_1_fq_name, "w"), open(unmapped_read_2_fq_name, "w")
467    hisat2 = read_filename.find("hisat2") != -1 or pair_filename.find("hisat2") != -1
468    vg = read_filename.find("vg") != -1 or pair_filename.find("vg") != -1
469
470    read_dic = {}
471    prev_read_id, prev_read_seq = "", ""
472    sam_file = open(sam_filename, "r")
473    for line in sam_file:
474        if line[0] == "@":
475            continue
476
477        fields = line[:-1].split()
478        read_id, flag, chr, pos, mapQ, cigar_str, mate_chr, mate_pos, template_len, read_seq, read_qual = fields[:11]
479        if 'H' in cigar_str:
480            continue
481
482        flag, pos, mate_pos = int(flag), int(pos), int(mate_pos)
483        if read_seq == "*" and prev_read_id == read_id:
484            read_seq = prev_read_seq
485        read_seq = read_seq.upper()
486        if read_seq == "" or read_seq == "*":
487            continue
488
489        if flag & 0x04 != 0 or \
490               chr == "*" or \
491               cigar_str == "*":
492            """
493            if flag & 0x80 != 0:
494                print >> unmapped_read_2_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual)
495            else:
496                print >> unmapped_read_1_fq, "@%s\n%s\n+%s\n%s" % (read_id, read_seq, read_id, read_qual)
497            """
498            continue
499
500        if mate_chr == '=':
501            mate_chr = chr
502
503        if len(read_id) >= 3 and read_id[-2] == "/":
504            read_id = read_id[:-2]
505
506        if read_id.find("seq.") == 0:
507            read_id = read_id[4:]
508
509        if read_id != prev_read_id:
510            read_dic = {}
511
512        HISAT2_XM, HISAT2_NM = 0, 0
513        if hisat2:
514            for field in fields[11:]:
515                if field[:5] == "XM:i:":
516                    HISAT2_XM = int(field[5:])
517                elif field[:5] == "NM:i:":
518                    HISAT2_NM = int(field[5:])
519
520        prev_read_id = read_id
521        prev_read_seq = read_seq
522
523        if snps_dict != None and chr in snps_dict:
524            chr_snps = snps_dict[chr]
525        else:
526            chr_snps = []
527
528        snps = None
529
530        XM, gap = 0, 0
531        read_pos, right_pos = 0, pos - 1,
532        junction_read = False
533
534        cigars = cigar_re.findall(cigar_str)
535        for i in range(len(cigars)):
536            cigar = cigars[i]
537            length = int(cigar[:-1])
538            cigar_op = cigar[-1]
539
540            if cigar_op == "S":
541                if i != 0 and i != len(cigars) - 1:
542                    print >> sys.stderr, "S is located at %dth out of %d %s" % (i+1, len(cigars), cigar_str)
543
544            if cigar_op in "MS":
545                ref_pos = right_pos
546                if cigar_op == "S" and i == 0:
547                    ref_seq = chr_dic[chr][right_pos-length:right_pos]
548                    ref_pos = right_pos - length
549                else:
550                    ref_seq = chr_dic[chr][right_pos:right_pos+length]
551
552                ref_seq = ref_seq.upper()
553                if length == len(ref_seq):
554                    for j in range(length):
555                        if ref_seq[j] != "N" and read_seq[read_pos+j] != ref_seq[j]:
556                            if snps_dict == None:
557                                XM += 1
558                            else:
559                                if snps == None:
560                                    snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
561
562                                found_snp = check_snps(snps, 'single', ref_pos + j, read_seq[read_pos + j])
563                                if not found_snp:
564                                    XM += 1
565
566                            if hisat2 and cigar_op == "S":
567                                HISAT2_XM += 1
568                                HISAT2_NM += 1
569                else:
570                    XM += length
571
572            if cigar_op in "I":
573                if snps == None:
574                    snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
575                found_snp = check_snps(snps, 'insertion', right_pos, read_seq[read_pos:read_pos + length])
576                if not found_snp:
577                    gap += length
578
579            if cigar_op in "D":
580                if snps == None:
581                    snps = getSNPs(chr_snps, pos - 1, pos + len(read_seq))
582                found_snp = check_snps(snps, 'deletion', right_pos, length)
583                if not found_snp:
584                    gap += length
585
586            if cigar_op in "MND":
587                right_pos += length
588
589            if cigar_op in "MIS":
590                read_pos += length
591
592            if cigar_op == "N":
593                junction_read = True
594
595        NM = XM + gap
596        if hisat2:
597            XM, NM = HISAT2_XM, HISAT2_NM
598        if NM < MAX_EDIT:
599            print >> temp_read_file, "%s\t%d\t%s\t%s\t%s\tXM:i:%d\tNM:i:%d" % \
600                  (read_id, flag, chr, pos, cigar_str, XM, NM)
601
602            found = False
603            me = "%s\t%s\t%d" % (read_id, chr, pos)
604            partner = "%s\t%s\t%d" % (read_id, mate_chr, mate_pos)
605            if partner in read_dic:
606                maps = read_dic[partner]
607                for map in maps:
608                    if map[0] == me:
609                        mate_flag, mate_cigar_str, mate_XM, mate_NM = map[1:]
610                        if mate_pos > pos:
611                            flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr_str, mate_pos, mate_cigar_str, mate_XM, mate_NM = \
612                                  mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM
613
614                        print >> temp_pair_file, "%s\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d\t%d\t%s\t%d\t%s\tXM:i:%d\tNM:i:%d" % \
615                              (read_id, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM, flag, chr, pos, cigar_str, XM, NM)
616                        found = True
617                        break
618
619            if not found:
620                if not me in read_dic:
621                    read_dic[me] = []
622
623                read_dic[me].append([partner, flag, cigar_str, XM, NM])
624
625    sam_file.close()
626
627    temp_read_file.close()
628    temp_pair_file.close()
629
630    unmapped_read_1_fq.close()
631    unmapped_read_2_fq.close()
632
633
634    sort = False
635    if vg:
636        sort = True
637
638    if sort:
639        command = "sort %s | uniq > %s; rm %s" % (temp_read_filename, read_filename, temp_read_filename)
640        os.system(command)
641
642        command = "sort %s | uniq > %s; rm %s" % (temp_pair_filename, pair_filename, temp_pair_filename)
643        os.system(command)
644    else:
645        command = "mv %s %s; mv %s %s" % (temp_read_filename, read_filename, temp_pair_filename, pair_filename)
646        os.system(command)
647
648
649def remove_redundant_junctions(junctions):
650    temp_junctions = []
651    for junction in junctions:
652        temp_junctions.append(to_junction(junction))
653    junctions = sorted(list(temp_junctions), cmp=junction_cmp)
654    temp_junctions = []
655    for can_junction in junctions:
656        if len(temp_junctions) <= 0:
657            temp_junctions.append(can_junction)
658        else:
659            chr, left, right = temp_junctions[-1]
660            chr2, left2, right2 = can_junction
661            if chr == chr2 and \
662                    abs(left - left2) == abs(right - right2) and \
663                    abs(left - left2) <= 10:
664                continue
665            temp_junctions.append(can_junction)
666    junctions = set()
667    for junction in temp_junctions:
668        junctions.add(to_junction_str(junction))
669
670    return junctions
671
672
673
674def read_stat(read_filename, gtf_junctions, chr_dic = None, debug = False):
675    read_stat = [[0, 0, 0] for i in range(MAX_EDIT)]
676    temp_junctions = [set() for i in range(MAX_EDIT)]
677    temp_gtf_junctions = [set() for i in range(MAX_EDIT)]
678
679    alignment = []
680    prev_read_id = ""
681    read_file = open(read_filename, "r")
682    for line in read_file:
683        read_id, flag, chr, pos, cigar_str, XM, NM = line[:-1].split()
684        flag, pos = int(flag), int(pos)
685        XM, NM = int(XM[5:]), int(NM[5:])
686
687        read_junctions, junction_read, gtf_junction_read = \
688            is_junction_read(chr_dic, gtf_junctions, chr, pos, cigar_str)
689
690        if junction_read:
691            for junction_str, is_gtf_junction in read_junctions:
692                if NM < len(temp_junctions):
693                    temp_junctions[NM].add(junction_str)
694
695                    if is_gtf_junction:
696                        temp_gtf_junctions[NM].add(junction_str)
697
698        if read_id != prev_read_id:
699            if prev_read_id != "":
700                NM2, junction_read2, gtf_junction_read2 = alignment
701                if NM2 < len(read_stat):
702                    read_stat[NM2][0] += 1
703
704                    if junction_read2:
705                        read_stat[NM2][1] += 1
706
707                        if gtf_junction_read2:
708                            read_stat[NM2][2] += 1
709
710            alignment = []
711
712        prev_read_id = read_id
713
714        if not alignment:
715            alignment = [NM, junction_read, gtf_junction_read]
716        elif alignment[0] > NM or \
717                (alignment[0] == NM and not alignment[2] and junction_read):
718            alignment = [NM, junction_read, gtf_junction_read]
719
720    read_file.close()
721
722    for i in range(len(read_stat)):
723        temp_junctions[i] = remove_redundant_junctions(temp_junctions[i])
724        temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i])
725
726    for i in range(len(read_stat)):
727        read_stat[i].append(len(temp_junctions[i]))
728        read_stat[i].append(len(temp_gtf_junctions[i]))
729
730    if alignment:
731        NM2, junction_read2, gtf_junction_read2 = alignment
732        if NM2 < len(read_stat):
733            read_stat[NM2][0] += 1
734
735            if junction_read2:
736                read_stat[NM2][1] += 1
737
738                if gtf_junction_read2:
739                    read_stat[NM2][2] += 1
740
741    return read_stat
742
743
744def cal_read_len(cigar_str):
745    length = 0
746    leftmost_softclip = 0
747    rightmost_softclip = 0
748    cigars = cigar_re.findall(cigar_str)
749
750    for i in range(len(cigars)):
751        cigar = cigars[i]
752        cigar_length = int(cigar[:-1])
753        cigar_op = cigar[-1]
754
755        if cigar_op in "MIS":
756            length += cigar_length
757
758        if (i == 0) and (cigar_op == "S"):
759            leftmost_softclip = cigar_length
760        if (i == (len(cigars) - 1)) and (cigar_op == "S"):
761            rightmost_softclip = cigar_length
762
763    return length, leftmost_softclip, rightmost_softclip
764
765def is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM):
766    concord_length = 1000
767    segment_length = sys.maxint
768
769    pairs = {}
770    pairs[0] = [flag, chr, pos, cigar_str, XM, NM]
771    pairs[1] = [mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM]
772
773    if chr != mate_chr:
774        return False, segment_length
775    if (flag & 0x10 == 0x10) or (mate_flag & 0x10 == 0):
776        return False, segment_length
777
778    assert pos <= mate_pos
779
780    left = pairs[0]
781    right = pairs[1]
782
783    left_start = left[2]
784    left_len, _, _ = cal_read_len(left[3]) # cigar
785
786    right_start = right[2]
787    right_len, _, right_soft = cal_read_len(right[3])
788
789    segment_length = (right_start + right_len) - left_start - right_soft
790    assert segment_length >= 0
791
792    if segment_length > concord_length:
793        return False, segment_length
794
795    return True, segment_length
796
797def pair_stat(pair_filename, gtf_junctions, chr_dic):
798    # pair_stat = NM, junction_pair, gtf_junction, concordant_alignment]
799    pair_stat = [[0, 0, 0, 0] for i in range(MAX_EDIT)]
800    dis_pair_stat = [0 for i in range(MAX_EDIT)]
801    temp_junctions = [set() for i in range(MAX_EDIT)]
802    temp_gtf_junctions = [set() for i in range(MAX_EDIT)]
803
804    alignment, dis_alignments = [], []
805    prev_read_id = ""
806    con_file = open(pair_filename + ".con", "w")
807    discon_file = open(pair_filename + ".discon", "w")
808    pair_file = open(pair_filename, "r")
809    for line in pair_file:
810        read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM = line[:-1].split()
811        flag, pos, XM, NM, mate_flag, mate_pos, mate_XM, mate_NM = \
812             int(flag), int(pos), int(XM[5:]), int(NM[5:]), int(mate_flag), int(mate_pos), int(mate_XM[5:]), int(mate_NM[5:])
813
814        pair_XM = XM + mate_XM
815        pair_NM = NM + mate_NM
816
817        pair_junctions, junction_pair, gtf_junction_pair = \
818            is_junction_pair(chr_dic, gtf_junctions, chr, pos, cigar_str, mate_chr, mate_pos, mate_cigar_str)
819
820        # check concordantly
821        concord_align, segment_len = is_concordantly(read_id, flag, chr, pos, cigar_str, XM, NM, mate_flag, mate_chr, mate_pos, mate_cigar_str, mate_XM, mate_NM)
822        print >> (con_file if concord_align else discon_file), line.strip(), ('none', 'first')[(flag & 0x40 == 0x40)], ('none', 'last')[(mate_flag & 0x80 == 0x80)], segment_len
823
824        if junction_pair:
825            for junction_str, is_gtf_junction in pair_junctions:
826                if pair_NM < len(temp_junctions):
827                    temp_junctions[pair_NM].add(junction_str)
828
829                    if is_gtf_junction:
830                        temp_gtf_junctions[pair_NM].add(junction_str)
831
832        if read_id != prev_read_id:
833            if prev_read_id != "":
834                NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment
835                if NM2 < len(pair_stat):
836                    pair_stat[NM2][0] += 1
837
838                    if junction_read2:
839                        pair_stat[NM2][1] += 1
840                        if gtf_junction_read2:
841                            pair_stat[NM2][2] += 1
842                    if concord_align2:
843                        pair_stat[NM2][3] += 1
844
845            for NM2 in dis_alignments:
846                if NM2 < len(dis_pair_stat):
847                    dis_pair_stat[NM2] += 1
848
849            alignment = []
850            dis_alignment = []
851
852        prev_read_id = read_id
853
854        if not alignment:
855            alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align]
856        elif alignment[0] > pair_NM or \
857                (alignment[0] == pair_NM and not alignment[2] and junction_pair):
858            alignment = [pair_NM, junction_pair, gtf_junction_pair, concord_align]
859
860        if mate_chr != chr or ((flag & 0x10) != 0 or (mate_flag & 0x10) == 0):
861            if len(dis_alignments) == 0:
862                dis_alignments = [pair_NM]
863            elif dis_alignments[0] > pair_NM:
864                dis_alignments = [pair_NM]
865
866    pair_file.close()
867    con_file.close()
868    discon_file.close()
869
870    # process last line
871    if alignment:
872        NM2, junction_read2, gtf_junction_read2, concord_align2 = alignment
873        if NM2 < len(pair_stat):
874            pair_stat[NM2][0] += 1
875
876            if junction_read2:
877                pair_stat[NM2][1] += 1
878                if gtf_junction_read2:
879                    pair_stat[NM2][2] += 1
880
881            if concord_align2:
882                pair_stat[NM2][3] += 1
883
884    assert len(dis_alignments) <= 1
885    for NM2 in dis_alignments:
886        if NM2 < len(dis_pair_stat):
887            dis_pair_stat[NM2] += 1
888
889    for i in range(len(pair_stat)):
890        temp_junctions[i] = remove_redundant_junctions(temp_junctions[i])
891        temp_gtf_junctions[i] = remove_redundant_junctions(temp_gtf_junctions[i])
892
893    for i in range(len(pair_stat)):
894        pair_stat[i].append(len(temp_junctions[i]))
895        pair_stat[i].append(len(temp_gtf_junctions[i]))
896
897    return pair_stat, dis_pair_stat
898
899
900def sql_execute(sql_db, sql_query):
901    sql_cmd = [
902        "sqlite3", sql_db,
903        "-separator", "\t",
904        "%s;" % sql_query
905        ]
906    # print >> sys.stderr, sql_cmd
907    sql_process = subprocess.Popen(sql_cmd, stdout=subprocess.PIPE)
908    output = sql_process.communicate()[0][:-1]
909    return output
910
911
912def create_sql_db(sql_db):
913    if os.path.exists(sql_db):
914        print >> sys.stderr, sql_db, "already exists!"
915        return
916
917    columns = [
918        ["id", "integer primary key autoincrement"],
919        ["reads", "text"],
920        ["genome", "text"],
921        ["end_type", "text"],
922        ["aligner", "text"],
923        ["version", "test"],
924        ["use_annotation", "text"],
925        ["edit_distance", "integer"],
926        ["mapped_reads", "integer"],
927        ["junction_reads", "integer"],
928        ["gtf_junction_reads", "integer"],
929        ["junctions", "integer"],
930        ["gtf_junctions", "integer"],
931        ["runtime", "real"],
932        ["host", "text"],
933        ["created", "text"],
934        ["cmd", "text"]
935        ]
936
937    sql_create_table = "CREATE TABLE Mappings ("
938    for i in range(len(columns)):
939        name, type = columns[i]
940        if i != 0:
941            sql_create_table += ", "
942        sql_create_table += ("%s %s" % (name, type))
943    sql_create_table += ");"
944    sql_execute(sql_db, sql_create_table)
945
946
947def write_analysis_data(sql_db, database_name, paired):
948    if not os.path.exists(sql_db):
949        return
950
951    if paired:
952        paired = "paired"
953    else:
954        paired = "single"
955
956    aligners = []
957    sql_aligners = "SELECT aligner FROM Mappings WHERE end_type = '%s' GROUP BY aligner" % (paired)
958    output = sql_execute(sql_db, sql_aligners)
959    aligners = output.split()
960
961    database_fname = database_name + "_" + paired + ".analysis"
962    database_file = open(database_fname, "w")
963
964    print >> database_file, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime"
965    for aligner in aligners:
966        for edit_distance in range(MAX_EDIT):
967            sql_row = "SELECT aligner, use_annotation, end_type, edit_distance, mapped_reads, junction_reads, gtf_junction_reads, junctions, gtf_junctions, runtime FROM Mappings"
968            sql_row += " WHERE reads = '%s' and aligner = '%s' and edit_distance = %d and end_type = '%s' ORDER BY created DESC LIMIT 1" % (database_name, aligner, edit_distance, paired)
969            output = sql_execute(sql_db, sql_row)
970            if output:
971                print >> database_file, output
972
973    database_file.close()
974
975
976def calculate_read_cost(single_end,
977                        paired_end,
978                        test_aligners,
979                        fresh,
980                        runtime_only,
981                        verbose):
982    sql_db_name = "analysis.db"
983    if not os.path.exists(sql_db_name):
984        create_sql_db(sql_db_name)
985
986    full_workdir = os.getcwd()
987    workdir = full_workdir.split("/")[-1]
988
989    num_cpus = multiprocessing.cpu_count()
990    if num_cpus > 8:
991        num_threads = min(8, num_cpus)
992        desktop = False
993    else:
994        num_threads = min(3, num_cpus)
995        desktop = True
996
997    verbose = False
998    sql_write = True
999    is_large_file = False
1000    gz_file = False
1001    if os.path.exists("1.fq.gz"):
1002        gz_file = True
1003        if os.path.getsize("1.fq.gz") > (1024 * 1024 * 1024):
1004            is_large_file = True
1005
1006    elif os.path.exists("1.fq"):
1007        gz_file = False
1008        if os.path.getsize("1.fq") > (2 * 1024 * 1024 * 1024):
1009            is_large_file = True
1010
1011    else:
1012        assert(False)
1013
1014
1015    aligners = [
1016        # ["hisat2", "", "", "", ""],
1017        # ["hisat2", "", "", "", "--sensitive"],
1018        # ["hisat2", "", "", "", "--very-sensitive"],
1019        # ["hisat2", "", "", "", "-k 50 --score-min C,-50,0"],
1020        # ["hisat2", "", "snp", "", ""],
1021        # ["hisat2", "", "snp", "", "--sensitive"],
1022        # ["hisat2", "", "snp", "", "-k 50"],
1023        # ["hisat2", "", "", "205", ""],
1024        # ["hisat2", "", "snp", "205", ""],
1025        # ["hisat2", "", "snp_tran", "205", ""],
1026        # ["hisat2", "", "tran", "", ""],
1027        # ["hisat2", "x1", "snp", "", ""],
1028        # ["hisat2", "x1", "", "", ""],
1029        # ["hisat2", "x2", "", "", ""],
1030        # ["hisat2", "", "tran", "", ""],
1031        # ["hisat2", "", "snp_tran", "204", ""],
1032        # ["hisat2", "", "snp_tran", "", ""],
1033        # ["hisat2", "", "", "210", ""],
1034        ["hisat2", "", "rep", "", ""],
1035        # ["hisat2", "", "rep", "", "--read-lengths 101"],
1036        # ["hisat2", "", "rep", "", "--sensitive"],
1037        # ["hisat2", "", "rep-100-300", "", ""],
1038        # ["hisat2", "", "rep-101-300", "", "--sensitive"],
1039        # ["hisat2", "", "rep-101-300", "", "-k 10 --score-min C,-50,0"],
1040        # ["hisat2", "", "rep-150-300", "", ""],
1041        # ["tophat2", "", "", "", ""],
1042        # ["bowtie", "", "", "", ""],
1043        ["bowtie2", "", "", "", ""],
1044        # ["bowtie2", "", "", "", "-k 10"],
1045        ["bwa", "mem", "", "", ""],
1046        # ["bwa", "mem", "", "", "-a"],
1047        # ["bwa", "sw", "", "", ""],
1048        # ["star", "", "", "", ""],
1049        # ["star", "x2", "", "", ""],
1050        # ["vg", "", "", "", ""],
1051        # ["vg", "", "", "", "-M 10"],
1052        # ["vg", "", "snp", "", ""],
1053        # ["vg", "", "snp", "", "-M 10"],
1054        # ["minimap2", "", "", "", ""],
1055        ]
1056
1057    # sql_write = False
1058    verbose = True
1059    debug = False
1060
1061    genome = "genome"
1062    cwd = os.getcwd()
1063    RNA = (cwd.find("RNA") != -1)
1064
1065    chr_dic = read_genome("../../../data/" + genome + ".fa")
1066    snp_dic = read_snp("../../../data/" + genome + ".snp")
1067    gtf_junction_strs = extract_splice_sites("../../../data/" + genome + ".gtf")
1068    gene = "no"
1069    gtf_junctions = []
1070    for junction_str in gtf_junction_strs:
1071        junction = to_junction(junction_str)
1072        gtf_junctions.append(junction)
1073    gtf_junctions = sorted(gtf_junctions, cmp=junction_cmp)
1074
1075    print >> sys.stderr, "aligner\tuse_annotation\tend_type\tedit_distance\tmapped_reads\tjunction_reads\tgtf_junction_reads\tjunctions\tgtf_junctions\truntime"
1076
1077    for paired in [False, True]:
1078        if not paired and not single_end:
1079            continue
1080        if paired and not paired_end:
1081            continue
1082
1083        type_read1_fname = "1.fq"
1084        if gz_file:
1085            type_read1_fname += ".gz"
1086
1087        if paired:
1088            type_read2_fname = "2.fq"
1089            if gz_file:
1090                type_read2_fname += ".gz"
1091
1092        else:
1093            type_read2_fname = ""
1094
1095        aligner_bin_base = "../../../../aligners/bin"
1096        def get_aligner_version(aligner):
1097            version = ""
1098            if aligner == "hisat2" or \
1099                    aligner == "hisat" or \
1100                    aligner == "bowtie" or \
1101                    aligner == "bowtie2":
1102                if version:
1103                    cmd = ["%s/%s_%s/%s" % (aligner_bin_base, aligner, version, aligner)]
1104                else:
1105                    cmd = ["%s/%s" % (aligner_bin_base, aligner)]
1106                cmd += ["--version"]
1107                cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
1108                version = cmd_process.communicate()[0][:-1].split("\n")[0]
1109                version = version.split()[-1]
1110            elif aligner == "tophat2":
1111                cmd = ["%s/tophat" % (aligner_bin_base)]
1112                cmd += ["--version"]
1113                cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
1114                version = cmd_process.communicate()[0][:-1].split()[-1]
1115            elif aligner in ["star", "starx2"]:
1116                version = "2.4.2a"
1117            elif aligner == "gsnap":
1118                cmd = ["%s/gsnap" % (aligner_bin_base)]
1119                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
1120                version = cmd_process.communicate()[1][:-1].split("\n")[0]
1121                version = version.split()[2]
1122            elif aligner == "bwa":
1123                if version:
1124                    cmd = ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
1125                else:
1126                    cmd = ["%s/bwa" % (aligner_bin_base)]
1127                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
1128                version = cmd_process.communicate()[1][:-1].split("\n")[2]
1129                version = version.split()[1]
1130            elif aligner == "vg":
1131                cmd = ["%s/vg" % (aligner_bin_base)]
1132                cmd_process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
1133                version = cmd_process.communicate()[1][:-1].split("\n")[0]
1134                version = version.split()[5]
1135            elif aligner == "minimap2":
1136                cmd = ["%s/minimap2" % (aligner_bin_base)]
1137                cmd += ["--version"]
1138                cmd_process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
1139                version = cmd_process.communicate()[0][:-1].split("\n")[0]
1140
1141            return version
1142
1143        index_base = "../../../../indexes"
1144        index_add = ""
1145        if genome != "genome":
1146            index_add = "_" + genome
1147        def get_aligner_cmd(RNA, aligner, type, index_type, version, options, read1_fname, read2_fname, out_fname, cmd_idx = 0):
1148            cmd = ["/usr/bin/time"]
1149            if osx_mode:
1150                cmd += ['-l']
1151            if aligner == "hisat2":
1152                if version:
1153                    cmd += ["%s/hisat2_%s/hisat2" % (aligner_bin_base, version)]
1154                else:
1155                    cmd += ["%s/hisat2" % (aligner_bin_base)]
1156                if num_threads > 1:
1157                    cmd += ["-p", str(num_threads)]
1158
1159                # cmd += ["-k", "5"]
1160                # cmd += ["--score-min", "C,-18"]
1161
1162                # daehwan - for debugging purposes
1163                # cmd += ["--score-min", "C,-50"]
1164                # cmd += ["--pen-cansplice", "0"]
1165                # cmd += ["--pen-noncansplice", "12"]
1166                # cmd += ["--pen-intronlen", "G,-8,1"]
1167                # cmd += ["--metrics", "1",
1168                #         "--metrics-file", "metrics.out"]
1169
1170                if version == "204":
1171                    cmd += ["--sp", "2,1"]
1172
1173                if not RNA:
1174                    cmd += ["--no-spliced-alignment"]
1175
1176                if type in ["x1", "x2"]:
1177                    cmd += ["--no-temp-splicesite"]
1178
1179                # DK - for debugging purposes
1180                # cmd += ["--dta"]
1181                """
1182                if index_type == "tran":
1183                    cmd += ["--no-anchorstop"]
1184                    cmd += ["-k", "100"]
1185                """
1186
1187                if options != "":
1188                    cmd += options.split(' ')
1189
1190                if type == "x2":
1191                    if cmd_idx == 0:
1192                        cmd += ["--novel-splicesite-outfile"]
1193                    else:
1194                        cmd += ["--novel-splicesite-infile"]
1195                    cmd += ["splicesites.txt"]
1196
1197                # "--novel-splicesite-infile",
1198                # "../splicesites.txt",
1199                # "--rna-strandness",
1200                # "FR",
1201                if version:
1202                    index_cmd = "%s/HISAT2_%s%s/" % (index_base, version, index_add) + genome
1203                else:
1204                    index_cmd = "%s/HISAT2%s/" % (index_base, index_add) + genome
1205                if index_type:
1206                    index_cmd += ("_" + index_type)
1207                cmd += [index_cmd]
1208                if paired:
1209                    cmd += ["-1", read1_fname,
1210                            "-2", read2_fname]
1211                else:
1212                    cmd += ["-U", read1_fname]
1213            elif aligner == "hisat":
1214                cmd += ["%s/hisat" % (aligner_bin_base)]
1215                if num_threads > 1:
1216                    cmd += ["-p", str(num_threads)]
1217                # cmd += ["-k", "5"]
1218                # cmd += ["--score-min", "C,-18"]
1219                if version != "":
1220                    version = int(version)
1221                else:
1222                    version = sys.maxint
1223
1224                if not RNA:
1225                    cmd += ["--no-spliced-alignment"]
1226
1227                if type in ["x1", "x2"] or not RNA:
1228                    cmd += ["--no-temp-splicesite"]
1229
1230                """
1231                cmd += ["--rdg", "100,100",
1232                        "--rfg", "100,100"]
1233                """
1234
1235                if type == "x2":
1236                    if cmd_idx == 0:
1237                        cmd += ["--novel-splicesite-outfile"]
1238                    else:
1239                        cmd += ["--novel-splicesite-infile"]
1240                    cmd += ["splicesites.txt"]
1241
1242                # "--novel-splicesite-infile",
1243                # "../splicesites.txt",
1244                # "--rna-strandness",
1245                # "FR",
1246                cmd += ["%s/HISAT%s/" % (index_base, index_add) + genome]
1247                if paired:
1248                    cmd += ["-1", read1_fname,
1249                            "-2", read2_fname]
1250                else:
1251                    cmd += [read1_fname]
1252            elif aligner == "tophat2":
1253                cmd += ["%s/tophat" % (aligner_bin_base)]
1254                if num_threads > 1:
1255                    cmd += ["-p", str(num_threads)]
1256                cmd += ["--read-edit-dist", "3"]
1257                cmd += ["--no-sort-bam"]
1258                cmd += ["--read-realign-edit-dist", "0"]
1259                cmd += ["--keep-tmp",
1260                        "%s/HISAT%s/" % (index_base, index_add) + genome,
1261                        read1_fname]
1262                if paired:
1263                    cmd += [read2_fname]
1264            elif aligner == "star":
1265                cmd += ["%s/STAR" % (aligner_bin_base)]
1266                if num_threads > 1:
1267                    cmd += ["--runThreadN", str(num_threads)]
1268                if type == "x2" and cmd_idx == 1:
1269                    cmd += ["--genomeDir", "."]
1270                else:
1271                    cmd += ["--genomeDir", "%s/STAR%s" % (index_base, index_add)]
1272                if desktop:
1273                    cmd += ["--genomeLoad", "NoSharedMemory"]
1274                else:
1275                    cmd += ["--genomeLoad", "LoadAndKeep"]
1276                if type == "x2":
1277                    if cmd_idx == 1:
1278                        cmd += ["--alignSJDBoverhangMin", "1"]
1279                cmd += ["--readFilesIn",
1280                        read1_fname]
1281                if paired:
1282                    cmd += [read2_fname]
1283                if paired:
1284                    cmd += ["--outFilterMismatchNmax", "6"]
1285                else:
1286                    cmd += ["--outFilterMismatchNmax", "3"]
1287            elif aligner == "bowtie":
1288                cmd += ["%s/bowtie" % (aligner_bin_base)]
1289                if num_threads > 1:
1290                    cmd += ["-p", str(num_threads)]
1291                cmd += ["--sam",
1292                        "-k", "10"]
1293                cmd += ["-n", "3"]
1294                if paired:
1295                    cmd += ["-X", "500"]
1296                cmd += ["%s/Bowtie%s/" % (index_base, index_add) + genome]
1297                if paired:
1298                    cmd += ["-1", read1_fname,
1299                            "-2", read2_fname]
1300                else:
1301                    cmd += [read1_fname]
1302            elif aligner == "bowtie2":
1303                if version:
1304                    cmd += ["%s/bowtie2_%s/bowtie2" % (aligner_bin_base, version)]
1305                else:
1306                    cmd += ["%s/bowtie2" % (aligner_bin_base)]
1307                if num_threads > 1:
1308                    cmd += ["-p", str(num_threads)]
1309                #cmd += ["-k", "10"]
1310                #cmd += ["--score-min", "C,-18"]
1311                cmd += ["-X", "1000"]
1312
1313                if options:
1314                    cmd += options.split(' ')
1315
1316                if version:
1317                    cmd += ["-x %s/Bowtie2_%s%s/" % (index_base, version, index_add) + genome]
1318                else:
1319                    cmd += ["-x %s/Bowtie2%s/" % (index_base, index_add) + genome]
1320                if paired:
1321                    cmd += ["-1", read1_fname,
1322                            "-2", read2_fname]
1323                else:
1324                    cmd += [read1_fname]
1325            elif aligner == "gsnap":
1326                cmd += ["%s/gsnap" % (aligner_bin_base),
1327                       "-A",
1328                       "sam"]
1329                if num_threads > 1:
1330                    cmd += ["-t", str(num_threads)]
1331                cmd += ["--max-mismatches=3",
1332                        "-D", "%s/GSNAP%s" % (index_base, index_add),
1333                        "-N", "1",
1334                        "-d", genome,
1335                        read1_fname]
1336                if paired:
1337                    cmd += [read2_fname]
1338            elif aligner == "bwa":
1339                if version:
1340                    cmd += ["%s/bwa_%s/bwa" % (aligner_bin_base, version)]
1341                else:
1342                    cmd += ["%s/bwa" % (aligner_bin_base)]
1343                if type in ["mem", "aln"]:
1344                    cmd += [type]
1345                elif type == "sw":
1346                    cmd += ["bwa" + type]
1347                if num_threads > 1:
1348                    cmd += ["-t", str(num_threads)]
1349                if options:
1350                    cmd += options.split(' ')
1351                if version:
1352                    cmd += ["%s/BWA_%s%s/%s.fa" % (index_base, version, index_add, genome)]
1353                else:
1354                    cmd += ["%s/BWA%s/%s.fa" % (index_base, index_add, genome)]
1355                cmd += [read1_fname]
1356                if paired:
1357                    cmd += [read2_fname]
1358            elif aligner == "vg":
1359                # vg map -d 22 -t 6 -M 10 -f ../sim-1.fa -f ../sim-2.fa --surject-to sam > result.sam
1360                cmd += ["%s/vg" % (aligner_bin_base)]
1361                cmd += ["map"]
1362                cmd += ["-t", str(num_threads)]
1363                cmd += ["--surject-to", "sam"]
1364                index_cmd = "%s/VG%s/" % (index_base, index_add) + genome
1365                if index_type:
1366                    index_cmd += ("_" + index_type)
1367
1368                if options:
1369                    cmd += options.split(' ')
1370
1371                cmd += ["-d", index_cmd]
1372
1373                cmd += ["-f", read1_fname]
1374                if paired:
1375                    cmd += ["-f", read2_fname]
1376
1377            elif aligner == "minimap2":
1378                # minimap2 -a -x sr 22.mmi sim_1.fa sim_2.fa > result.sam
1379                cmd += ["%s/minimap2" % (aligner_bin_base)]
1380                cmd += ["-a"]
1381                cmd += ["-x", "sr"]
1382                index_cmd = "%s/minimap2%s/" % (index_base, index_add) + genome
1383                if index_type:
1384                    index_cmd += ("_" + index_type)
1385                index_cmd += ".mmi"
1386                cmd += [index_cmd]
1387                cmd += [read1_fname]
1388                if paired:
1389                    cmd += [read2_fname]
1390            else:
1391                assert False
1392
1393            return cmd
1394
1395        for aligner, type, index_type, version, options in aligners:
1396            skip = False
1397            if len(test_aligners) > 0:
1398                skip = True
1399                for test_aligner in test_aligners:
1400                    if aligner == test_aligner:
1401                        skip = False
1402            if skip:
1403                continue
1404
1405            aligner_name = aligner + type + version
1406            if (aligner == "hisat2" or aligner == "vg") and index_type != "":
1407                aligner_name += ("_" + index_type)
1408
1409            if options != "":
1410                option_name = options.replace(' ', '').replace('-', '').replace(',', '')
1411                aligner_name = aligner_name + '_' + option_name
1412            if paired:
1413                aligner_dir = aligner_name + "_paired"
1414            else:
1415                aligner_dir = aligner_name + "_single"
1416
1417            if fresh and os.path.exists(aligner_dir):
1418                os.system("rm -rf %s" % aligner_dir)
1419
1420            if not os.path.exists(aligner_dir):
1421                os.mkdir(aligner_dir)
1422            os.chdir(aligner_dir)
1423
1424            out_fname = "accepted.sam"
1425            aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
1426            duration = 0.1
1427            mem_usage = ''
1428            if not os.path.exists(out_fname):
1429                if not os.path.exists("../one.fq") or not os.path.exists("../two.fq"):
1430                    if gz_file:
1431                        os.system("gzip -cd ../1.fq.gz | head -400 > ../one.fq")
1432                        os.system("gzip -cd ../2.fq.gz | head -400 > ../two.fq")
1433                    else:
1434                        os.system("head -400 ../1.fq > ../one.fq")
1435                        os.system("head -400 ../2.fq > ../two.fq")
1436
1437                # dummy commands for caching index
1438                loading_time = 0
1439                if aligner not in ["tophat2"]:
1440                    for i in range(3):
1441                        dummy_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../one.fq", "../two.fq", "/dev/null")
1442                        start_time = datetime.now()
1443                        if verbose:
1444                            print >> sys.stderr, start_time, "\t", " ".join(dummy_cmd)
1445                        if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa"]:
1446                            proc = subprocess.Popen(dummy_cmd, stdout=open("/dev/null", "w"), stderr=subprocess.PIPE)
1447                        else:
1448                            proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1449                        proc.communicate()
1450                        finish_time = datetime.now()
1451                        duration = finish_time - start_time
1452                        duration = duration.total_seconds()
1453                        if verbose:
1454                            print >> sys.stderr, finish_time, "duration:", duration
1455                        loading_time = duration
1456
1457                # align all reads
1458                if paired:
1459                    sweep_read_cmd = "cat ../%s ../%s > /dev/null" % (type_read1_fname, type_read2_fname)
1460                else:
1461                    sweep_read_cmd = "cat ../%s > /dev/null" % (type_read1_fname)
1462                print >> sys.stderr, datetime.now(), "\t", sweep_read_cmd
1463                os.system(sweep_read_cmd)
1464
1465                skip_alignment = False
1466                if paired and aligner == "olego" and os.path.exists(out_fname + "1"):
1467                    skip_alignment = True
1468
1469                if not skip_alignment:
1470                    aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname)
1471                    start_time = datetime.now()
1472                    if verbose:
1473                        print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd)
1474                    if aligner in ["hisat2", "hisat", "bowtie", "bowtie2", "gsnap", "bwa", "vg", "minimap2"]:
1475                        proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
1476                    else:
1477                        proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1478                    _, mem_usage = proc.communicate()
1479                    mem_usage = parse_mem_usage(mem_usage)
1480                    finish_time = datetime.now()
1481                    duration = finish_time - start_time
1482                    duration = duration.total_seconds() - loading_time
1483                    if duration < 0.1:
1484                        duration = 0.1
1485                    if verbose:
1486                        print >> sys.stderr, finish_time, "duration:", duration
1487
1488                    if verbose:
1489                        print >> sys.stderr, finish_time, "Memory Usage: %dMB" % (int(mem_usage) / 1024)
1490
1491                    if debug and aligner == "hisat" and type == "x1":
1492                        os.system("cat metrics.out")
1493                        print >> sys.stderr, "\ttime: %.4f" % (duration)
1494                        # break
1495
1496                if aligner == "star" and type in ["", "gtf"]:
1497                    os.system("mv Aligned.out.sam %s" % out_fname)
1498                elif aligner in ["hisat2", "hisat"] and type == "x2":
1499                    aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
1500                    if verbose:
1501                        print >> sys.stderr, start_time, "\t", " ".join(aligner_cmd)
1502                    start_time = datetime.now()
1503                    proc = subprocess.Popen(aligner_cmd, stdout=open(out_fname, "w"), stderr=subprocess.PIPE)
1504                    proc.communicate()
1505                    finish_time = datetime.now()
1506                    duration += (finish_time - start_time).total_seconds()
1507                    duration -= loading_time
1508                    if duration < 0.1:
1509                        duration = 0.1
1510                    if verbose:
1511                        print >> sys.stderr, finish_time, "duration:", duration
1512                elif aligner == "star" and type == "x2":
1513                    assert os.path.exists("SJ.out.tab")
1514                    os.system("awk 'BEGIN {OFS=\"\t\"; strChar[0]=\".\"; strChar[1]=\"+\"; strChar[2]=\"-\";} {if($5>0){print $1,$2,$3,strChar[$4]}}' SJ.out.tab > SJ.out.tab.Pass1.sjdb")
1515                    for file in os.listdir("."):
1516                        if file in ["SJ.out.tab.Pass1.sjdb", "genome.fa"]:
1517                            continue
1518                        os.remove(file)
1519                    star_index_cmd = "STAR --genomeDir ./ --runMode genomeGenerate --genomeFastaFiles ../../../../data/genome.fa --sjdbFileChrStartEnd SJ.out.tab.Pass1.sjdb --sjdbOverhang 100 --runThreadN %d" % (num_threads)
1520                    print >> sys.stderr, "\t", datetime.now(), star_index_cmd
1521                    os.system(star_index_cmd)
1522                    if verbose:
1523                        print >> sys.stderr, "\t", datetime.now(), " ".join(dummy_cmd)
1524                    proc = subprocess.Popen(dummy_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1525                    proc.communicate()
1526                    if verbose:
1527                        print >> sys.stderr, "\t", datetime.now(), "finished"
1528                    aligner_cmd = get_aligner_cmd(RNA, aligner, type, index_type, version, options, "../" + type_read1_fname, "../" + type_read2_fname, out_fname, 1)
1529                    start_time = datetime.now()
1530                    if verbose:
1531                        print >> sys.stderr, "\t", start_time, " ".join(aligner_cmd)
1532                    proc = subprocess.Popen(aligner_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1533                    proc.communicate()
1534                    finish_time = datetime.now()
1535                    duration += (finish_time - start_time).total_seconds()
1536                    duration -= loading_time
1537                    if duration < 0.1:
1538                        duration = 0.1
1539                    if verbose:
1540                        print >> sys.stderr, "\t", finish_time, "finished:", duration
1541                    os.system("mv Aligned.out.sam %s" % out_fname)
1542                elif aligner == "tophat2":
1543                    os.system("samtools sort -n tophat_out/accepted_hits.bam accepted_hits; samtools view -h accepted_hits.bam > %s" % out_fname)
1544                elif aligner == "vg":
1545                    index_name = '%s/VG%s/' % (index_base, index_add) + genome
1546                    if index_type:
1547                        index_name += ('_' + index_type)
1548
1549                os.system("echo %s %s %f >> runtime" % (str(datetime.now()), aligner, duration))
1550
1551                if aligner in ["star", "tophat2", "gsnap"]:
1552                    os.system("tar cvzf %s.tar.gz %s &> /dev/null" % (out_fname, out_fname))
1553
1554            if runtime_only:
1555                os.chdir("..")
1556                continue
1557
1558            suffix = aligner
1559            read_sam, pair_sam = suffix + ".read.sam", suffix + ".pair.sam"
1560            unmapped_read_1_fq, unmapped_read_2_fq = suffix + ".unmapped.1.fq", suffix + ".unmapped.2.fq"
1561            if not os.path.exists(read_sam) or not os.path.exists(pair_sam):
1562                if index_type == 'snp':
1563                    extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq, snp_dic)
1564                else:
1565                    extract_reads_and_pairs(chr_dic, out_fname, read_sam, pair_sam, unmapped_read_1_fq, unmapped_read_2_fq)
1566
1567
1568
1569            out = ''
1570            if gz_file:
1571                out = subprocess.check_output("gzip -cd ../%s | wc -l" % type_read1_fname, shell=True)
1572            else:
1573                out = subprocess.check_output("wc -l ../%s" % type_read1_fname, shell=True)
1574
1575            numreads = int(out.split()[0]) / 4
1576
1577            done_filename = suffix + ".done"
1578            if not os.path.exists(done_filename):
1579                done_file = open(done_filename, "w")
1580                if paired:
1581                    sum = [0, 0, 0, 0, 0, 0] # mappep_read, junction_read, gtf_junction_reads, concord_mapped_read, num_junctions, num_gtf_junctions
1582                    dis_sum = 0
1583                    stat, dis_stat = pair_stat(pair_sam, gtf_junctions, chr_dic)
1584                    output = ""
1585                    for i in range(len(stat)):
1586                        for j in range(len(sum)):
1587                            sum[j] += stat[i][j]
1588
1589                        dis_sum += dis_stat[i]
1590                        mapped_reads, junction_reads, gtf_junction_reads, concord_mapped_read, num_junctions, num_gtf_junctions = sum
1591                        output += "%s\t%s\tpaired\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\t%d\t%.2f%%\n" % \
1592                                  (aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration)), concord_mapped_read, float(concord_mapped_read) * 100.0 / numreads)
1593
1594                        if sql_write and os.path.exists("../" + sql_db_name):
1595                            sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \
1596                                    (workdir, genome, "paired", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd))
1597                            sql_execute("../" + sql_db_name, sql_insert)
1598
1599
1600                    print >> sys.stderr, output,
1601                    print >> done_file, output
1602                else:
1603                    sum = [0, 0, 0, 0, 0]
1604                    stat = read_stat(read_sam, gtf_junctions, chr_dic)
1605                    output = ""
1606                    for i in range(len(stat)):
1607                        for j in range(len(sum)):
1608                            sum[j] += stat[i][j]
1609
1610                        mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions = sum
1611                        output += "%s\t%s\tsingle\t%d\t%d\t%.2f%%\t%d\t%d\t%d\t%d\t%f\t%d\n" % \
1612                                  (aligner_name, gene, i, mapped_reads, float(mapped_reads) * 100.0 / numreads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, (numreads / max(1.0, duration)))
1613
1614                        if sql_write and os.path.exists("../" + sql_db_name):
1615                            sql_insert = "INSERT INTO \"Mappings\" VALUES(NULL, '%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, %d, %d, %f, '%s', datetime('now', 'localtime'), '%s');" % \
1616                                    (workdir, genome, "single", aligner_name, get_aligner_version(aligner), "no", i, mapped_reads, junction_reads, gtf_junction_reads, num_junctions, num_gtf_junctions, duration, platform.node(), " ".join(aligner_cmd))
1617                            sql_execute("../" + sql_db_name, sql_insert)
1618
1619                    print >> sys.stderr, output,
1620                    print >> done_file, output
1621
1622                done_file.close()
1623
1624
1625            os.chdir("..")
1626
1627        if os.path.exists(sql_db_name):
1628            write_analysis_data(sql_db_name, workdir, paired)
1629
1630
1631
1632if __name__ == "__main__":
1633    parser = ArgumentParser(
1634        description='test HISAT2, and compare HISAT2 with other popular aligners such as TopHat2, STAR, Bowtie1/2, GSNAP, BWA-mem, etc.')
1635    parser.add_argument('--single-end',
1636                        dest='paired_end',
1637                        action='store_false',
1638                        help='run single-end only')
1639    parser.add_argument('--paired-end',
1640                        dest='single_end',
1641                        action='store_false',
1642                        help='run paired_end only')
1643    parser.add_argument('--aligner-list',
1644                        dest='aligner_list',
1645                        type=str,
1646                        default="",
1647                        help='comma-separated list of aligners (e.g. hisat2,bowtie2,bwa')
1648    parser.add_argument('--fresh',
1649                        dest='fresh',
1650                        action='store_true',
1651                        help='delete existing alignment related directories (e.g. hisat2_single)')
1652    parser.add_argument('--runtime-only',
1653                        dest='runtime_only',
1654                        action='store_true',
1655                        help='run programs without evaluation')
1656    parser.add_argument('-v', '--verbose',
1657                        dest='verbose',
1658                        action='store_true',
1659                        help='also print some statistics to stderr')
1660
1661    args = parser.parse_args()
1662
1663    aligners = []
1664    for aligner in args.aligner_list.split(','):
1665        if aligner == "":
1666            continue
1667        aligners.append(aligner)
1668
1669    calculate_read_cost(args.single_end,
1670                        args.paired_end,
1671                        aligners,
1672                        args.fresh,
1673                        args.runtime_only,
1674                        args.verbose)
1675
1676