1 #include <unistd.h>
2 #include <math.h>
3 #include <stdlib.h>
4 #include <time.h>
5 #include <stdio.h>
6 #include <string.h>
7 #include "bwtaln.h"
8 #include "kvec.h"
9 #include "bntseq.h"
10 #include "utils.h"
11 #include "bwase.h"
12 #include "bwa.h"
13 #include "ksw.h"
14 
15 #ifdef USE_MALLOC_WRAPPERS
16 #  include "malloc_wrap.h"
17 #endif
18 
19 typedef struct {
20 	int n;
21 	bwtint_t *a;
22 } poslist_t;
23 
24 typedef struct {
25 	double avg, std, ap_prior;
26 	bwtint_t low, high, high_bayesian;
27 } isize_info_t;
28 
29 #define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
30 #define b128_hash(a) ((uint32_t)(a).x)
31 
32 #include "khash.h"
33 KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
34 
35 typedef struct {
36 	pair64_v arr;
37 	pair64_v pos[2];
38 	kvec_t(bwt_aln1_t) aln[2];
39 } pe_data_t;
40 
41 #define MIN_HASH_WIDTH 1000
42 
43 extern int g_log_n[256]; // in bwase.c
44 static kh_b128_t *g_hash;
45 
46 void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
47 void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
48 int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
49 void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
50 bntseq_t *bwa_open_nt(const char *prefix);
51 void bwa_print_sam_SQ(const bntseq_t *bns);
52 
bwa_init_pe_opt()53 pe_opt_t *bwa_init_pe_opt()
54 {
55 	pe_opt_t *po;
56 	po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t));
57 	po->max_isize = 500;
58 	po->force_isize = 0;
59 	po->max_occ = 100000;
60 	po->n_multi = 3;
61 	po->N_multi = 10;
62 	po->type = BWA_PET_STD;
63 	po->is_sw = 1;
64 	po->ap_prior = 1e-5;
65 	return po;
66 }
67 /*
68 static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
69 {
70 	const double a = 0.140012;
71 	double b, c;
72 	b = log(x * (2 - x));
73 	c = 2./M_PI/a + b / 2.;
74 	return sqrt(sqrt(c * c - b / a) - c);
75 }
76 */
77 
78 // for normal distribution, this is about 3std
79 #define OUTLIER_BOUND 2.0
80 
infer_isize(int n_seqs,bwa_seq_t * seqs[2],isize_info_t * ii,double ap_prior,int64_t L)81 static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L)
82 {
83 	uint64_t x, *isizes, n_ap = 0;
84 	int n, i, tot, p25, p75, p50, max_len = 1, tmp;
85 	double skewness = 0.0, kurtosis = 0.0, y;
86 
87 	ii->avg = ii->std = -1.0;
88 	ii->low = ii->high = ii->high_bayesian = 0;
89 	isizes = (uint64_t*)calloc(n_seqs, 8);
90 	for (i = 0, tot = 0; i != n_seqs; ++i) {
91 		bwa_seq_t *p[2];
92 		p[0] = seqs[0] + i; p[1] = seqs[1] + i;
93 		if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) {
94 			x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos;
95 			if (x < 100000) isizes[tot++] = x;
96 		}
97 		if (p[0]->len > max_len) max_len = p[0]->len;
98 		if (p[1]->len > max_len) max_len = p[1]->len;
99 	}
100 	if (tot < 20) {
101 		fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n");
102 		free(isizes);
103 		return -1;
104 	}
105 	ks_introsort_64(tot, isizes);
106 	p25 = isizes[(int)(tot*0.25 + 0.5)];
107 	p50 = isizes[(int)(tot*0.50 + 0.5)];
108 	p75 = isizes[(int)(tot*0.75 + 0.5)];
109 	tmp  = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
110 	ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned
111 	ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
112 	if (ii->low > ii->high) {
113 		fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n");
114 		free(isizes);
115 		return -1;
116 	}
117 	for (i = 0, x = n = 0; i < tot; ++i)
118 		if (isizes[i] >= ii->low && isizes[i] <= ii->high)
119 			++n, x += isizes[i];
120 	ii->avg = (double)x / n;
121 	for (i = 0; i < tot; ++i) {
122 		if (isizes[i] >= ii->low && isizes[i] <= ii->high) {
123 			double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg);
124 			ii->std += tmp;
125 			skewness += tmp * (isizes[i] - ii->avg);
126 			kurtosis += tmp * tmp;
127 		}
128 	}
129 	kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3;
130 	ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large
131 	skewness = skewness / n / (ii->std * ii->std * ii->std);
132 	for (y = 1.0; y < 10.0; y += 0.01)
133 		if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
134 	ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
135 	for (i = 0; i < tot; ++i)
136 		if (isizes[i] > ii->high_bayesian) ++n_ap;
137 	ii->ap_prior = .01 * (n_ap + .01) / tot;
138 	if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior;
139 	free(isizes);
140 	fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
141 	if (isnan(ii->std) || p75 > 100000) {
142 		ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0;
143 		fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n");
144 		return -1;
145 	}
146 	for (y = 1.0; y < 10.0; y += 0.01)
147 		if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
148 	ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
149 	fprintf(stderr, "[infer_isize] low and high boundaries: %ld and %ld for estimating avg and std\n", (long)ii->low, (long)ii->high);
150 	fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std);
151 	fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior);
152 	fprintf(stderr, "[infer_isize] inferred maximum insert size: %ld (%.2lf sigma)\n", (long)ii->high_bayesian, y);
153 	return 0;
154 }
155 
pairing(bwa_seq_t * p[2],pe_data_t * d,const pe_opt_t * opt,int s_mm,const isize_info_t * ii)156 static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii)
157 {
158 	int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
159 	uint64_t o_score, subo_score;
160 	pair64_t last_pos[2][2], o_pos[2];
161 	max_len = p[0]->full_len;
162 	if (max_len < p[1]->full_len) max_len = p[1]->full_len;
163 	if (low_bound < max_len) low_bound = max_len;
164 
165 	// here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize
166 #define __pairing_aux(u,v) do { \
167 		bwtint_t l = (v).x + p[(v).y&1]->len - ((u).x); \
168 		if ((u).x != (uint64_t)-1 && (v).x > (u).x && l >= max_len \
169 			&& ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \
170 		{ \
171 			uint64_t s = d->aln[(v).y&1].a[(v).y>>2].score + d->aln[(u).y&1].a[(u).y>>2].score; \
172 			s *= 10; \
173 			if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \
174 			s = s<<32 | (uint32_t)hash_64((u).x<<32 | (v).x); \
175 			if (s>>32 == o_score>>32) ++o_n; \
176 			else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \
177 			else ++subo_n; \
178 			if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u).y&1] = (u), o_pos[(v).y&1] = (v); \
179 			else if (s < subo_score) subo_score = s; \
180 		} \
181 	} while (0)
182 
183 #define __pairing_aux2(q, w) do { \
184 		const bwt_aln1_t *r = d->aln[(w).y&1].a + ((w).y>>2); \
185 		(q)->extra_flag |= SAM_FPP; \
186 		if ((q)->pos != (w).x || (q)->strand != ((w).y>>1&1)) { \
187 			(q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = (w).y>>1&1; \
188 			(q)->score = r->score; \
189 			(q)->pos = (w).x; \
190 			if ((q)->mapQ > 0) ++cnt_chg; \
191 		} \
192 	} while (0)
193 
194 	o_score = subo_score = (uint64_t)-1;
195 	o_n = subo_n = 0;
196 	ks_introsort_128(d->arr.n, d->arr.a);
197 	for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
198 	if (opt->type == BWA_PET_STD) {
199 		for (i = 0; i < d->arr.n; ++i) {
200 			pair64_t x = d->arr.a[i];
201 			int strand = x.y>>1&1;
202 			if (strand == 1) { // reverse strand, then check
203 				int y = 1 - (x.y&1);
204 				__pairing_aux(last_pos[y][1], x);
205 				__pairing_aux(last_pos[y][0], x);
206 			} else { // forward strand, then push
207 				last_pos[x.y&1][0] = last_pos[x.y&1][1];
208 				last_pos[x.y&1][1] = x;
209 			}
210 		}
211 	} else {
212 		fprintf(stderr, "[paring] not implemented yet!\n");
213 		exit(1);
214 	}
215 	// set pairing
216 	//fprintf(stderr, "[%ld, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n);
217 	if (o_score != (uint64_t)-1) {
218 		int mapQ_p = 0; // this is the maximum mapping quality when one end is moved
219 		//fprintf(stderr, "%d, %d\n", o_n, subo_n);
220 		if (o_n == 1) {
221 			if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair
222 			else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair
223 			else {
224 				int n = subo_n > 255? 255 : subo_n;
225 				mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n];
226 				if (mapQ_p < 0) mapQ_p = 0;
227 			}
228 		}
229 		if ((p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) && (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1))) { // both ends not moved
230 			if (p[0]->mapQ > 0 && p[1]->mapQ > 0) {
231 				int mapQ = p[0]->mapQ + p[1]->mapQ;
232 				if (mapQ > 60) mapQ = 60;
233 				p[0]->mapQ = p[1]->mapQ = mapQ;
234 			} else {
235 				if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ;
236 				if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ;
237 			}
238 		} else if (p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) { // [1] moved
239 			p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ;
240 			if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p;
241 		} else if (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1)) { // [0] moved
242 			p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ;
243 			if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p;
244 		} else { // both ends moved
245 			p[0]->seQ = p[1]->seQ = 0;
246 			mapQ_p -= 20;
247 			if (mapQ_p < 0) mapQ_p = 0;
248 			p[0]->mapQ = p[1]->mapQ = mapQ_p;
249 		}
250 		__pairing_aux2(p[0], o_pos[0]);
251 		__pairing_aux2(p[1], o_pos[1]);
252 	}
253 	return cnt_chg;
254 }
255 
256 typedef struct {
257 	kvec_t(bwt_aln1_t) aln;
258 } aln_buf_t;
259 
bwa_cal_pac_pos_pe(const bntseq_t * bns,const char * prefix,bwt_t * const _bwt,int n_seqs,bwa_seq_t * seqs[2],FILE * fp_sa[2],isize_info_t * ii,const pe_opt_t * opt,const gap_opt_t * gopt,const isize_info_t * last_ii)260 int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
261 					   const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
262 {
263 	int i, j, cnt_chg = 0;
264 	char str[1024];
265 	bwt_t *bwt;
266 	pe_data_t *d;
267 	aln_buf_t *buf[2];
268 
269 	d = (pe_data_t*)calloc(1, sizeof(pe_data_t));
270 	buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
271 	buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
272 
273 	if (_bwt == 0) { // load forward SA
274 		strcpy(str, prefix); strcat(str, ".bwt");  bwt = bwt_restore_bwt(str);
275 		strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
276 	} else bwt = _bwt;
277 
278 	// SE
279 	for (i = 0; i != n_seqs; ++i) {
280 		bwa_seq_t *p[2];
281 		for (j = 0; j < 2; ++j) {
282 			int n_aln;
283 			p[j] = seqs[j] + i;
284 			p[j]->n_multi = 0;
285 			p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2);
286 			err_fread_noeof(&n_aln, 4, 1, fp_sa[j]);
287 			if (n_aln > kv_max(d->aln[j]))
288 				kv_resize(bwt_aln1_t, d->aln[j], n_aln);
289 			d->aln[j].n = n_aln;
290 			err_fread_noeof(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]);
291 			kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j]
292 			// generate SE alignment and mapping quality
293 			bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
294 			if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
295 				int strand;
296 				int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
297 				p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
298 				p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len + p[j]->ref_shift, &strand);
299 				p[j]->strand = strand;
300 				if (p[j]->pos == (bwtint_t)-1) p[j]->type = BWA_TYPE_NO_MATCH;
301 			}
302 		}
303 	}
304 
305 	// infer isize
306 	infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2);
307 	if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
308 	if (opt->force_isize) {
309 		fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
310 		ii->low = ii->high = 0; ii->avg = ii->std = -1.0;
311 	}
312 
313 	// PE
314 	for (i = 0; i != n_seqs; ++i) {
315 		bwa_seq_t *p[2];
316 		for (j = 0; j < 2; ++j) {
317 			p[j] = seqs[j] + i;
318 			kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln);
319 		}
320 		if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
321 			&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
322 		{ // only when both ends mapped
323 			pair64_t x;
324 			int j, k;
325 			long long n_occ[2];
326 			for (j = 0; j < 2; ++j) {
327 				n_occ[j] = 0;
328 				for (k = 0; k < d->aln[j].n; ++k)
329 					n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1;
330 			}
331 			if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue;
332 			d->arr.n = 0;
333 			for (j = 0; j < 2; ++j) {
334 				for (k = 0; k < d->aln[j].n; ++k) {
335 					bwt_aln1_t *r = d->aln[j].a + k;
336 					bwtint_t l;
337 					if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
338 						pair64_t key;
339 						int ret;
340 						key.x = r->k; key.y = r->l;
341 						khint_t iter = kh_put(b128, g_hash, key, &ret);
342 						if (ret) { // not in the hash table; ret must equal 1 as we never remove elements
343 							poslist_t *z = &kh_val(g_hash, iter);
344 							z->n = r->l - r->k + 1;
345 							z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
346 							for (l = r->k; l <= r->l; ++l) {
347 								int strand;
348 								z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand)<<1;
349 								z->a[l - r->k] |= strand;
350 							}
351 						}
352 						for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
353 							x.x = kh_val(g_hash, iter).a[l]>>1;
354 							x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
355 							kv_push(pair64_t, d->arr, x);
356 						}
357 					} else { // then calculate on the fly
358 						for (l = r->k; l <= r->l; ++l) {
359 							int strand;
360 							x.x = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand);
361 							x.y = k<<2 | strand<<1 | j;
362 							kv_push(pair64_t, d->arr, x);
363 						}
364 					}
365 				}
366 			}
367 			cnt_chg += pairing(p, d, opt, gopt->s_mm, ii);
368 		}
369 
370 		if (opt->N_multi || opt->n_multi) {
371 			for (j = 0; j < 2; ++j) {
372 				if (p[j]->type != BWA_TYPE_NO_MATCH) {
373 					int k, n_multi;
374 					if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) {
375 						bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
376 					} else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
377 					for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) {
378 						int strand;
379 						bwt_multi1_t *q = p[j]->multi + k;
380 						q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand);
381 						q->strand = strand;
382 						if (q->pos != p[j]->pos && q->pos != (bwtint_t)-1)
383 							p[j]->multi[n_multi++] = *q;
384 					}
385 					p[j]->n_multi = n_multi;
386 				}
387 			}
388 		}
389 	}
390 
391 	// free
392 	for (i = 0; i < n_seqs; ++i) {
393 		kv_destroy(buf[0][i].aln);
394 		kv_destroy(buf[1][i].aln);
395 	}
396 	free(buf[0]); free(buf[1]);
397 	if (_bwt == 0) bwt_destroy(bwt);
398 	kv_destroy(d->arr);
399 	kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
400 	kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
401 	free(d);
402 	return cnt_chg;
403 }
404 
405 #define SW_MIN_MATCH_LEN 20
406 #define SW_MIN_MAPQ 17
407 
408 // cnt = n_mm<<16 | n_gapo<<8 | n_gape
bwa_sw_core(bwtint_t l_pac,const ubyte_t * pacseq,int len,const ubyte_t * seq,int64_t * beg,int reglen,int * n_cigar,uint32_t * _cnt)409 bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt)
410 {
411 	kswr_t r;
412 	uint32_t *cigar32 = 0;
413 	bwa_cigar_t *cigar = 0;
414 	ubyte_t *ref_seq;
415 	bwtint_t k, x, y, l;
416 	int xtra, gscore;
417 	int8_t mat[25];
418 
419 	bwa_fill_scmat(1, 3, mat);
420 	// check whether there are too many N's
421 	if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
422 	for (k = 0, x = 0; k < len; ++k)
423 		if (seq[k] >= 4) ++x;
424 	if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0;
425 
426 	// get reference subsequence
427 	ref_seq = (ubyte_t*)calloc(reglen, 1);
428 	for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
429 		ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
430 
431 	// do alignment
432 	xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0);
433 	r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0);
434 	gscore = ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32);
435 	cigar = (bwa_cigar_t*)cigar32;
436 	for (k = 0; k < *n_cigar; ++k)
437 		cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
438 
439 	if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score || gscore != r.score) { // poor hit or tandem hits or weird alignment
440 		free(cigar); free(ref_seq); *n_cigar = 0;
441 		return 0;
442 	}
443 
444 	// check whether the alignment is good enough
445 	for (k = 0, x = y = 0; k < *n_cigar; ++k) {
446 		bwa_cigar_t c = cigar[k];
447 		if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c);
448 		else if (__cigar_op(c) == FROM_D) x += __cigar_len(c);
449 		else y += __cigar_len(c);
450 	}
451 	if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
452 		free(cigar); free(ref_seq);
453 		*n_cigar = 0;
454 		return 0;
455 	}
456 
457 	{ // update cigar and coordinate;
458 		int start = r.qb, end = r.qe + 1;
459 		*beg += r.tb;
460 		cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
461 		if (start) {
462 			memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
463 			cigar[0] = __cigar_create(3, start);
464 			++(*n_cigar);
465 		}
466 		if (end < len) {
467 			/*cigar[*n_cigar] = 3<<14 | (len - end);*/
468 			cigar[*n_cigar] = __cigar_create(3, (len - end));
469 			++(*n_cigar);
470 		}
471 	}
472 
473 	{ // set *cnt
474 		int n_mm, n_gapo, n_gape;
475 		n_mm = n_gapo = n_gape = 0;
476 		x = r.tb; y = r.qb;
477 		for (k = 0; k < *n_cigar; ++k) {
478 			bwa_cigar_t c = cigar[k];
479 			if (__cigar_op(c) == FROM_M) {
480 				for (l = 0; l < (__cigar_len(c)); ++l)
481 					if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm;
482 				x += __cigar_len(c), y += __cigar_len(c);
483 			} else if (__cigar_op(c) == FROM_D) {
484 				x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
485 			} else if (__cigar_op(c) == FROM_I) {
486 				y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
487 			}
488 		}
489 		*_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
490 	}
491 
492 	free(ref_seq);
493 	return cigar;
494 }
495 
bwa_paired_sw(const bntseq_t * bns,const ubyte_t * _pacseq,int n_seqs,bwa_seq_t * seqs[2],const pe_opt_t * popt,const isize_info_t * ii)496 ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
497 {
498 	ubyte_t *pacseq;
499 	int i;
500 	uint64_t n_tot[2], n_mapped[2];
501 
502 	// load reference sequence
503 	if (_pacseq == 0) {
504 		pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
505 		err_rewind(bns->fp_pac);
506 		err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
507 	} else pacseq = (ubyte_t*)_pacseq;
508 	if (!popt->is_sw || ii->avg < 0.0) return pacseq;
509 
510 	// perform mate alignment
511 	n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0;
512 	for (i = 0; i != n_seqs; ++i) {
513 		bwa_seq_t *p[2];
514 		p[0] = seqs[0] + i; p[1] = seqs[1] + i;
515 		if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ
516 			int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2];
517 			int64_t beg[2], end[2];
518 			bwa_cigar_t *cigar[2];
519 			uint32_t cnt[2];
520 
521 			/* In the following, _pref points to the reference read
522 			 * which must be aligned; _pmate points to its mate which is
523 			 * considered to be modified. */
524 
525 #define __set_rght_coor(_a, _b, _pref, _pmate) do {						\
526 				(_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \
527 				(_b) = (_a) + 6 * ii->std + 2 * _pmate->len;			\
528 				if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \
529 				if ((_b) > bns->l_pac) (_b) = bns->l_pac;				\
530 			} while (0)
531 
532 #define __set_left_coor(_a, _b, _pref, _pmate) do {						\
533 				(_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \
534 				(_b) = (_a) + 6 * ii->std + 2 * _pmate->len;			\
535 				if ((_a) < 0) (_a) = 0;									\
536 				if ((_b) > _pref->pos) (_b) = _pref->pos;				\
537 			} while (0)
538 
539 #define __set_fixed(_pref, _pmate, _beg, _cnt) do {						\
540 				_pmate->type = BWA_TYPE_MATESW;							\
541 				_pmate->pos = _beg;										\
542 				_pmate->seQ = _pref->seQ;								\
543 				_pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \
544 				_pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \
545 				_pmate->extra_flag |= SAM_FPP;							\
546 				_pref->extra_flag |= SAM_FPP;							\
547 			} while (0)
548 
549 			mq_adjust[0] = mq_adjust[1] = 255; // not effective
550 			is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0;
551 
552 			++n_tot[is_singleton];
553 			cigar[0] = cigar[1] = 0;
554 			n_cigar[0] = n_cigar[1] = 0;
555 			if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
556 			for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
557 				ubyte_t *seq;
558 				if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
559 				{ // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads
560 					if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
561 						__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
562 						seq = p[k]->rseq;
563 					} else { // then the mate is on forward stand and has smaller coordinate
564 						__set_left_coor(beg[k], end[k], p[1-k], p[k]);
565 						seq = p[k]->seq;
566 						seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
567 					}
568 				}
569 				// perform SW alignment
570 				cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
571 				if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k]
572 					int s_old, clip = 0, s_new;
573 					if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]);
574 					if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]);
575 					s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499);
576 					s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. + .499);
577 					s_old += -4.343 * log(ii->ap_prior / bns->l_pac);
578 					s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5) + .499)); // assume the mapped isize is 1.5\sigma
579 					if (s_old < s_new) { // reject SW alignment
580 						mq_adjust[k] = s_new - s_old;
581 						free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0;
582 					} else mq_adjust[k] = s_old - s_new;
583 				}
584 				// now revserse sequence back such that p[*]->seq looks untouched
585 				if (popt->type == BWA_PET_STD) {
586 					if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0);
587 				} else {
588 					if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0);
589 				}
590 			}
591 			k = -1; // no read to be changed
592 			if (cigar[0] && cigar[1]) {
593 				k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed
594 				mapQ = abs(p[1]->mapQ - p[0]->mapQ);
595 			} else if (cigar[0]) k = 0, mapQ = p[1]->mapQ;
596 			else if (cigar[1]) k = 1, mapQ = p[0]->mapQ;
597 			if (k >= 0 && p[k]->pos != beg[k]) {
598 				++n_mapped[is_singleton];
599 				{ // recalculate mapping quality
600 					int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8;
601 					if (tmp <= 0) tmp = 1;
602 					if (mapQ > tmp) mapQ = tmp;
603 					p[k]->mapQ = p[1-k]->mapQ = mapQ;
604 					p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? p[1-k]->seQ : mapQ;
605 					if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k];
606 					if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k];
607 				}
608 				// update CIGAR
609 				free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0;
610 				p[k]->n_cigar = n_cigar[k];
611 				// update the rest of information
612 				__set_fixed(p[1-k], p[k], beg[k], cnt[k]);
613 			}
614 			free(cigar[0]); free(cigar[1]);
615 		}
616 	}
617 	fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n",
618 			(long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ);
619 	fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n",
620 			(long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ);
621 	return pacseq;
622 }
623 
bwa_sai2sam_pe_core(const char * prefix,char * const fn_sa[2],char * const fn_fa[2],pe_opt_t * popt,const char * rg_line)624 void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
625 {
626 	extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
627 	int i, j, n_seqs;
628 	long long tot_seqs = 0;
629 	bwa_seq_t *seqs[2];
630 	bwa_seqio_t *ks[2];
631 	clock_t t;
632 	bntseq_t *bns;
633 	FILE *fp_sa[2];
634 	gap_opt_t opt, opt0;
635 	khint_t iter;
636 	isize_info_t last_ii; // this is for the last batch of reads
637 	char str[1024], magic[2][4];
638 	bwt_t *bwt;
639 	uint8_t *pac;
640 
641 	// initialization
642 	bwase_initialize(); // initialize g_log_n[] in bwase.c
643 	pac = 0; bwt = 0;
644 	for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
645 	bns = bns_restore(prefix);
646 	srand48(bns->seed);
647 	fp_sa[0] = xopen(fn_sa[0], "r");
648 	fp_sa[1] = xopen(fn_sa[1], "r");
649 	g_hash = kh_init(b128);
650 	last_ii.avg = -1.0;
651 
652 	err_fread_noeof(magic[0], 1, 4, fp_sa[0]);
653 	err_fread_noeof(magic[1], 1, 4, fp_sa[1]);
654 	if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) {
655 		fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__);
656 		exit(1);
657 	}
658 	err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
659 	ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
660 	opt0 = opt;
661 	err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
662 	ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
663 	{ // for Illumina alignment only
664 		if (popt->is_preload) {
665 			strcpy(str, prefix); strcat(str, ".bwt");  bwt = bwt_restore_bwt(str);
666 			strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
667 			pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
668 			err_rewind(bns->fp_pac);
669 			err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
670 		}
671 	}
672 
673 	// core loop
674 	bwa_print_sam_hdr(bns, rg_line);
675 	while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
676 		int cnt_chg;
677 		isize_info_t ii;
678 		ubyte_t *pacseq;
679 
680 		seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual);
681 		tot_seqs += n_seqs;
682 		t = clock();
683 
684 		fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
685 		cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
686 		fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
687 		fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);
688 
689 		fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n");
690 		pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii);
691 		fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
692 
693 		fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
694 		for (j = 0; j < 2; ++j)
695 			bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
696 		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
697 		if (pac == 0) free(pacseq);
698 
699 		fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... ");
700 		for (i = 0; i < n_seqs; ++i) {
701 			bwa_seq_t *p[2];
702 			p[0] = seqs[0] + i; p[1] = seqs[1] + i;
703 			if (p[0]->bc[0] || p[1]->bc[0]) {
704 				strcat(p[0]->bc, p[1]->bc);
705 				strcpy(p[1]->bc, p[0]->bc);
706 			}
707 			bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2);
708 			bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2);
709 			if (strcmp(p[0]->name, p[1]->name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", p[0]->name, p[1]->name);
710 		}
711 		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
712 
713 		for (j = 0; j < 2; ++j)
714 			bwa_free_read_seq(n_seqs, seqs[j]);
715 		fprintf(stderr, "[bwa_sai2sam_pe_core] %lld sequences have been processed.\n", tot_seqs);
716 		last_ii = ii;
717 	}
718 
719 	// destroy
720 	bns_destroy(bns);
721 	for (i = 0; i < 2; ++i) {
722 		bwa_seq_close(ks[i]);
723 		err_fclose(fp_sa[i]);
724 	}
725 	for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter)
726 		if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
727 	kh_destroy(b128, g_hash);
728 	if (pac) {
729 		free(pac); bwt_destroy(bwt);
730 	}
731 }
732 
bwa_sai2sam_pe(int argc,char * argv[])733 int bwa_sai2sam_pe(int argc, char *argv[])
734 {
735 	int c;
736 	pe_opt_t *popt;
737 	char *prefix, *rg_line = 0;
738 
739 	popt = bwa_init_pe_opt();
740 	while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
741 		switch (c) {
742 		case 'r':
743 			if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
744 			break;
745 		case 'a': popt->max_isize = atoi(optarg); break;
746 		case 'o': popt->max_occ = atoi(optarg); break;
747 		case 's': popt->is_sw = 0; break;
748 		case 'P': popt->is_preload = 1; break;
749 		case 'n': popt->n_multi = atoi(optarg); break;
750 		case 'N': popt->N_multi = atoi(optarg); break;
751 		case 'c': popt->ap_prior = atof(optarg); break;
752 		case 'f': xreopen(optarg, "w", stdout); break;
753 		case 'A': popt->force_isize = 1; break;
754 		default: return 1;
755 		}
756 	}
757 
758 	if (optind + 5 > argc) {
759 		fprintf(stderr, "\n");
760 		fprintf(stderr, "Usage:   bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
761 		fprintf(stderr, "Options: -a INT   maximum insert size [%d]\n", popt->max_isize);
762 		fprintf(stderr, "         -o INT   maximum occurrences for one end [%d]\n", popt->max_occ);
763 		fprintf(stderr, "         -n INT   maximum hits to output for paired reads [%d]\n", popt->n_multi);
764 		fprintf(stderr, "         -N INT   maximum hits to output for discordant pairs [%d]\n", popt->N_multi);
765 		fprintf(stderr, "         -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior);
766         fprintf(stderr, "         -f FILE  sam file to output results to [stdout]\n");
767 		fprintf(stderr, "         -r STR   read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n");
768 		fprintf(stderr, "         -P       preload index into memory (for base-space reads only)\n");
769 		fprintf(stderr, "         -s       disable Smith-Waterman for the unmapped mate\n");
770 		fprintf(stderr, "         -A       disable insert size estimate (force -s)\n\n");
771 		fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
772 		fprintf(stderr, "       2. For reads shorter than 30bp, applying a smaller -o is recommended to\n");
773 		fprintf(stderr, "          to get a sensible speed at the cost of pairing accuracy.\n");
774 		fprintf(stderr, "\n");
775 		return 1;
776 	}
777 	if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
778 		fprintf(stderr, "[%s] fail to locate the index\n", __func__);
779 		return 1;
780 	}
781 	bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
782 	free(prefix); free(popt);
783 	return 0;
784 }
785