1 #include "samtools.pysam.h"
2
3 /* cut_target.c -- targetcut subcommand.
4
5 Copyright (C) 2011 Broad Institute.
6 Copyright (C) 2012-2013, 2015, 2016, 2019 Genome Research Ltd.
7
8 Author: Heng Li <lh3@sanger.ac.uk>
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE. */
27
28 #include <config.h>
29
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include "htslib/hts.h"
34 #include "htslib/sam.h"
35 #include "htslib/faidx.h"
36 #include "samtools.h"
37 #include "sam_opts.h"
38
39 #define ERR_DEP 0.83
40
41 typedef struct {
42 int e[2][3], p[2][2];
43 } score_param_t;
44
45 /* Note that although the two matrics have 10 parameters in total, only 4
46 * (probably 3) are free. Changing the scoring matrices in a sort of symmetric
47 * way will not change the result. */
48 static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} };
49
50 typedef struct {
51 int min_baseQ, tid, max_bases;
52 uint16_t *bases;
53 samFile *fp;
54 sam_hdr_t *h;
55 char *ref;
56 hts_pos_t len;
57 faidx_t *fai;
58 errmod_t *em;
59 } ct_t;
60
gencns(ct_t * g,int n,const bam_pileup1_t * plp)61 static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
62 {
63 int i, j, ret, tmp, k, sum[4], qual;
64 float q[16];
65 if (n > g->max_bases) { // enlarge g->bases
66 g->max_bases = n;
67 kroundup32(g->max_bases);
68 g->bases = realloc(g->bases, (size_t) g->max_bases * 2);
69 }
70 for (i = k = 0; i < n; ++i) {
71 const bam_pileup1_t *p = plp + i;
72 uint8_t *seq;
73 int q, baseQ, b;
74 if (p->is_refskip || p->is_del) continue;
75 baseQ = bam_get_qual(p->b)[p->qpos];
76 if (baseQ < g->min_baseQ) continue;
77 seq = bam_get_seq(p->b);
78 b = seq_nt16_int[bam_seqi(seq, p->qpos)];
79 if (b > 3) continue;
80 q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
81 if (q < 4) q = 4;
82 if (q > 63) q = 63;
83 g->bases[k++] = q<<5 | bam_is_rev(p->b)<<4 | b;
84 }
85 if (k == 0) return 0;
86 errmod_cal(g->em, k, 4, g->bases, q);
87 for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i;
88 for (i = 1; i < 4; ++i) // insertion sort
89 for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
90 tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
91 qual = (sum[1]>>2) - (sum[0]>>2);
92 k = k < 256? k : 255;
93 ret = (qual < 63? qual : 63) << 2 | (sum[0]&3);
94 return ret<<8|k;
95 }
96
process_cns(sam_hdr_t * h,int tid,hts_pos_t l,uint16_t * cns)97 static void process_cns(sam_hdr_t *h, int tid, hts_pos_t l, uint16_t *cns)
98 {
99 int64_t i, s;
100 int f[2][2], *prev, *curr, *swap_tmp;
101 uint8_t *b; // backtrack array
102 b = calloc(l, 1);
103 f[0][0] = f[0][1] = 0;
104 prev = f[0]; curr = f[1];
105 // fill the backtrack matrix
106 for (i = 0; i < l; ++i) {
107 int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2;
108 int tmp0, tmp1;
109 // compute f[0]
110 tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0)
111 tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1)
112 if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0;
113 else curr[0] = tmp1, b[i] = 1;
114 // compute f[1]
115 tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0)
116 tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1)
117 if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1;
118 else curr[1] = tmp1, b[i] |= 1<<1;
119 // swap
120 swap_tmp = prev; prev = curr; curr = swap_tmp;
121 }
122 // backtrack
123 s = prev[0] > prev[1]? 0 : 1;
124 for (i = l - 1; i > 0; --i) {
125 b[i] |= s<<2;
126 s = b[i]>>s&1;
127 }
128 // print
129 for (i = 0, s = -1; i < INT64_MAX && i <= l; ++i) {
130 if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) {
131 if (s >= 0) {
132 int64_t j;
133 fprintf(samtools_stdout, "%s:%"PRId64"-%"PRId64"\t0\t%s\t%"PRId64"\t60\t%"PRId64"M\t*\t0\t0\t", sam_hdr_tid2name(h, tid), s+1, i, sam_hdr_tid2name(h, tid), s+1, i-s);
134 for (j = s; j < i; ++j) {
135 int c = cns[j]>>8;
136 if (c == 0) fputc('N', samtools_stdout);
137 else fputc("ACGT"[c&3], samtools_stdout);
138 }
139 fputc('\t', samtools_stdout);
140 for (j = s; j < i; ++j)
141 fputc(33 + (cns[j]>>8>>2), samtools_stdout);
142 fputc('\n', samtools_stdout);
143 }
144 //if (s >= 0) fprintf(samtools_stdout, "%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s);
145 s = -1;
146 } else if ((b[i]>>2&3) && s < 0) s = i;
147 }
148 free(b);
149 }
150
read_aln(void * data,bam1_t * b)151 static int read_aln(void *data, bam1_t *b)
152 {
153 ct_t *g = (ct_t*)data;
154 int ret;
155 while (1)
156 {
157 ret = sam_read1(g->fp, g->h, b);
158 if ( ret<0 ) break;
159 if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
160 if ( g->fai && b->core.tid >= 0 ) {
161 if (b->core.tid != g->tid) { // then load the sequence
162 free(g->ref);
163 g->ref = fai_fetch64(g->fai, sam_hdr_tid2name(g->h, b->core.tid), &g->len);
164 g->tid = b->core.tid;
165 }
166 sam_prob_realn(b, g->ref, g->len, 1<<1|1);
167 }
168 break;
169 }
170 return ret;
171 }
172
main_cut_target(int argc,char * argv[])173 int main_cut_target(int argc, char *argv[])
174 {
175 int c, tid, pos, n, lasttid = -1, usage = 0, status = EXIT_SUCCESS;
176 hts_pos_t l, max_l;
177 const bam_pileup1_t *p;
178 bam_plp_t plp;
179 uint16_t *cns;
180 ct_t g;
181
182 sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
183 static const struct option lopts[] = {
184 SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f', '-'),
185 { NULL, 0, NULL, 0 }
186 };
187
188 memset(&g, 0, sizeof(ct_t));
189 g.min_baseQ = 13; g.tid = -1;
190 while ((c = getopt_long(argc, argv, "f:Q:i:o:0:1:2:", lopts, NULL)) >= 0) {
191 switch (c) {
192 case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff
193 case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY
194 case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE
195 case '1': g_param.e[1][1] = atoi(optarg); break;
196 case '2': g_param.e[1][2] = atoi(optarg); break;
197 default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
198 /* else fall-through */
199 case '?': usage=1; break;
200 }
201 }
202 if (ga.reference) {
203 g.fai = fai_load(ga.reference);
204 if (g.fai == 0) fprintf(samtools_stderr, "[%s] fail to load the fasta index.\n", __func__);
205 }
206 if (usage || argc == optind) {
207 fprintf(samtools_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
208 sam_global_opt_help(samtools_stderr, "-.--f--.");
209 return 1;
210 }
211 l = max_l = 0; cns = 0;
212 g.fp = sam_open_format(argv[optind], "r", &ga.in);
213 if (g.fp == NULL) {
214 print_error_errno("targetcut", "can't open \"%s\"", argv[optind]);
215 return 1;
216 }
217
218 g.h = sam_hdr_read(g.fp);
219 if (g.h == NULL) {
220 print_error("targetcut", "couldn't read header for \"%s\"", argv[optind]);
221 sam_close(g.fp);
222 return 1;
223 }
224 g.em = errmod_init(1. - ERR_DEP);
225 plp = bam_plp_init(read_aln, &g);
226 while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) {
227 if (tid < 0) break;
228 if (tid != lasttid) { // change of chromosome
229 if (cns) process_cns(g.h, lasttid, l, cns);
230 if (max_l < sam_hdr_tid2len(g.h, tid)) {
231 max_l = sam_hdr_tid2len(g.h, tid);
232 kroundup32(max_l);
233 cns = realloc(cns, max_l * 2);
234 }
235 l = sam_hdr_tid2len(g.h, tid);
236 memset(cns, 0, max_l * 2);
237 lasttid = tid;
238 }
239 cns[pos] = gencns(&g, n, p);
240 }
241 process_cns(g.h, lasttid, l, cns);
242
243 if (n < 0) {
244 print_error("targetcut", "error reading from \"%s\"", argv[optind]);
245 status = EXIT_FAILURE;
246 }
247
248 free(cns);
249 sam_hdr_destroy(g.h);
250 bam_plp_destroy(plp);
251 sam_close(g.fp);
252 if (g.fai) {
253 fai_destroy(g.fai); free(g.ref);
254 }
255 errmod_destroy(g.em);
256 free(g.bases);
257 sam_global_args_free(&ga);
258 return status;
259 }
260