1 #include "samtools.pysam.h"
2 
3 /*  cut_target.c -- targetcut subcommand.
4 
5     Copyright (C) 2011 Broad Institute.
6     Copyright (C) 2012-2013, 2015, 2016, 2019 Genome Research Ltd.
7 
8     Author: Heng Li <lh3@sanger.ac.uk>
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.  */
27 
28 #include <config.h>
29 
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include "htslib/hts.h"
34 #include "htslib/sam.h"
35 #include "htslib/faidx.h"
36 #include "samtools.h"
37 #include "sam_opts.h"
38 
39 #define ERR_DEP 0.83
40 
41 typedef struct {
42     int e[2][3], p[2][2];
43 } score_param_t;
44 
45 /* Note that although the two matrics have 10 parameters in total, only 4
46  * (probably 3) are free.  Changing the scoring matrices in a sort of symmetric
47  * way will not change the result. */
48 static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} };
49 
50 typedef struct {
51     int min_baseQ, tid, max_bases;
52     uint16_t *bases;
53     samFile *fp;
54     sam_hdr_t *h;
55     char *ref;
56     hts_pos_t len;
57     faidx_t *fai;
58     errmod_t *em;
59 } ct_t;
60 
gencns(ct_t * g,int n,const bam_pileup1_t * plp)61 static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
62 {
63     int i, j, ret, tmp, k, sum[4], qual;
64     float q[16];
65     if (n > g->max_bases) { // enlarge g->bases
66         g->max_bases = n;
67         kroundup32(g->max_bases);
68         g->bases = realloc(g->bases, (size_t) g->max_bases * 2);
69     }
70     for (i = k = 0; i < n; ++i) {
71         const bam_pileup1_t *p = plp + i;
72         uint8_t *seq;
73         int q, baseQ, b;
74         if (p->is_refskip || p->is_del) continue;
75         baseQ = bam_get_qual(p->b)[p->qpos];
76         if (baseQ < g->min_baseQ) continue;
77         seq = bam_get_seq(p->b);
78         b = seq_nt16_int[bam_seqi(seq, p->qpos)];
79         if (b > 3) continue;
80         q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
81         if (q < 4) q = 4;
82         if (q > 63) q = 63;
83         g->bases[k++] = q<<5 | bam_is_rev(p->b)<<4 | b;
84     }
85     if (k == 0) return 0;
86     errmod_cal(g->em, k, 4, g->bases, q);
87     for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i;
88     for (i = 1; i < 4; ++i) // insertion sort
89         for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
90             tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
91     qual = (sum[1]>>2) - (sum[0]>>2);
92     k = k < 256? k : 255;
93     ret = (qual < 63? qual : 63) << 2 | (sum[0]&3);
94     return ret<<8|k;
95 }
96 
process_cns(sam_hdr_t * h,int tid,hts_pos_t l,uint16_t * cns)97 static void process_cns(sam_hdr_t *h, int tid, hts_pos_t l, uint16_t *cns)
98 {
99     int64_t i, s;
100     int f[2][2], *prev, *curr, *swap_tmp;
101     uint8_t *b; // backtrack array
102     b = calloc(l, 1);
103     f[0][0] = f[0][1] = 0;
104     prev = f[0]; curr = f[1];
105     // fill the backtrack matrix
106     for (i = 0; i < l; ++i) {
107         int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2;
108         int tmp0, tmp1;
109         // compute f[0]
110         tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0)
111         tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1)
112         if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0;
113         else curr[0] = tmp1, b[i] = 1;
114         // compute f[1]
115         tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0)
116         tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1)
117         if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1;
118         else curr[1] = tmp1, b[i] |= 1<<1;
119         // swap
120         swap_tmp = prev; prev = curr; curr = swap_tmp;
121     }
122     // backtrack
123     s = prev[0] > prev[1]? 0 : 1;
124     for (i = l - 1; i > 0; --i) {
125         b[i] |= s<<2;
126         s = b[i]>>s&1;
127     }
128     // print
129     for (i = 0, s = -1; i < INT64_MAX && i <= l; ++i) {
130         if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) {
131             if (s >= 0) {
132                 int64_t j;
133                 fprintf(samtools_stdout, "%s:%"PRId64"-%"PRId64"\t0\t%s\t%"PRId64"\t60\t%"PRId64"M\t*\t0\t0\t", sam_hdr_tid2name(h, tid), s+1, i, sam_hdr_tid2name(h, tid), s+1, i-s);
134                 for (j = s; j < i; ++j) {
135                     int c = cns[j]>>8;
136                     if (c == 0) fputc('N', samtools_stdout);
137                     else fputc("ACGT"[c&3], samtools_stdout);
138                 }
139                 fputc('\t', samtools_stdout);
140                 for (j = s; j < i; ++j)
141                     fputc(33 + (cns[j]>>8>>2), samtools_stdout);
142                 fputc('\n', samtools_stdout);
143             }
144             //if (s >= 0) fprintf(samtools_stdout, "%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s);
145             s = -1;
146         } else if ((b[i]>>2&3) && s < 0) s = i;
147     }
148     free(b);
149 }
150 
read_aln(void * data,bam1_t * b)151 static int read_aln(void *data, bam1_t *b)
152 {
153     ct_t *g = (ct_t*)data;
154     int ret;
155     while (1)
156     {
157         ret = sam_read1(g->fp, g->h, b);
158         if ( ret<0 ) break;
159         if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
160         if ( g->fai && b->core.tid >= 0 ) {
161             if (b->core.tid != g->tid) { // then load the sequence
162                 free(g->ref);
163                 g->ref = fai_fetch64(g->fai, sam_hdr_tid2name(g->h, b->core.tid), &g->len);
164                 g->tid = b->core.tid;
165             }
166             sam_prob_realn(b, g->ref, g->len, 1<<1|1);
167         }
168         break;
169     }
170     return ret;
171 }
172 
main_cut_target(int argc,char * argv[])173 int main_cut_target(int argc, char *argv[])
174 {
175     int c, tid, pos, n, lasttid = -1, usage = 0, status = EXIT_SUCCESS;
176     hts_pos_t l, max_l;
177     const bam_pileup1_t *p;
178     bam_plp_t plp;
179     uint16_t *cns;
180     ct_t g;
181 
182     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
183     static const struct option lopts[] = {
184         SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f', '-'),
185         { NULL, 0, NULL, 0 }
186     };
187 
188     memset(&g, 0, sizeof(ct_t));
189     g.min_baseQ = 13; g.tid = -1;
190     while ((c = getopt_long(argc, argv, "f:Q:i:o:0:1:2:", lopts, NULL)) >= 0) {
191         switch (c) {
192             case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff
193             case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY
194             case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE
195             case '1': g_param.e[1][1] = atoi(optarg); break;
196             case '2': g_param.e[1][2] = atoi(optarg); break;
197             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
198                       /* else fall-through */
199             case '?': usage=1; break;
200         }
201     }
202     if (ga.reference) {
203         g.fai = fai_load(ga.reference);
204         if (g.fai == 0) fprintf(samtools_stderr, "[%s] fail to load the fasta index.\n", __func__);
205     }
206     if (usage || argc == optind) {
207         fprintf(samtools_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
208         sam_global_opt_help(samtools_stderr, "-.--f--.");
209         return 1;
210     }
211     l = max_l = 0; cns = 0;
212     g.fp = sam_open_format(argv[optind], "r", &ga.in);
213     if (g.fp == NULL) {
214         print_error_errno("targetcut", "can't open \"%s\"", argv[optind]);
215         return 1;
216     }
217 
218     g.h = sam_hdr_read(g.fp);
219     if (g.h == NULL) {
220         print_error("targetcut", "couldn't read header for \"%s\"", argv[optind]);
221         sam_close(g.fp);
222         return 1;
223     }
224     g.em = errmod_init(1. - ERR_DEP);
225     plp = bam_plp_init(read_aln, &g);
226     while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) {
227         if (tid < 0) break;
228         if (tid != lasttid) { // change of chromosome
229             if (cns) process_cns(g.h, lasttid, l, cns);
230             if (max_l < sam_hdr_tid2len(g.h, tid)) {
231                 max_l = sam_hdr_tid2len(g.h, tid);
232                 kroundup32(max_l);
233                 cns = realloc(cns, max_l * 2);
234             }
235             l = sam_hdr_tid2len(g.h, tid);
236             memset(cns, 0, max_l * 2);
237             lasttid = tid;
238         }
239         cns[pos] = gencns(&g, n, p);
240     }
241     process_cns(g.h, lasttid, l, cns);
242 
243     if (n < 0) {
244         print_error("targetcut", "error reading from \"%s\"", argv[optind]);
245         status = EXIT_FAILURE;
246     }
247 
248     free(cns);
249     sam_hdr_destroy(g.h);
250     bam_plp_destroy(plp);
251     sam_close(g.fp);
252     if (g.fai) {
253         fai_destroy(g.fai); free(g.ref);
254     }
255     errmod_destroy(g.em);
256     free(g.bases);
257     sam_global_args_free(&ga);
258     return status;
259 }
260