1 /* The MIT License
2
3 Copyright (c) 2008, 2018 Genome Research Ltd (GRL).
4 2011 Heng Li <lh3@live.co.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice shall be
15 included in all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 SOFTWARE.
25 */
26
27 /* This program is separated from maq's read simulator with Colin
28 * Hercus' modification to allow longer indels. */
29
30 #include <config.h>
31
32 #include <stdlib.h>
33 #include <math.h>
34 #include <time.h>
35 #include <assert.h>
36 #include <stdio.h>
37 #include <unistd.h>
38 #include <stdint.h>
39 #include <ctype.h>
40 #include <string.h>
41 #include <errno.h>
42 #include <zlib.h>
43 #include "../version.h"
44 #include "htslib/kseq.h"
45 #include "htslib/hts_os.h"
46 KSEQ_INIT(gzFile, gzread)
47
48 const uint8_t nst_nt4_table[256] = {
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
52 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
61 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
62 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
63 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
64 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
65 };
66
67 /* Simple normal random number generator, copied from genran.c */
68
ran_normal(void)69 double ran_normal(void)
70 {
71 static int iset = 0;
72 static double gset;
73 double fac, rsq, v1, v2;
74 if (iset == 0) {
75 do {
76 v1 = 2.0 * drand48() - 1.0;
77 v2 = 2.0 * drand48() - 1.0;
78 rsq = v1 * v1 + v2 * v2;
79 } while (rsq >= 1.0 || rsq == 0.0);
80 fac = sqrt(-2.0 * log(rsq) / rsq);
81 gset = v1 * fac;
82 iset = 1;
83 return v2 * fac;
84 } else {
85 iset = 0;
86 return gset;
87 }
88 }
89
90 /* wgsim */
91
92 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
93 typedef unsigned short mut_t;
94 static mut_t mutmsk = (mut_t)0xf000;
95
96 typedef struct {
97 int l, m; /* length and maximum buffer size */
98 mut_t *s; /* sequence */
99 } mutseq_t;
100
101 static double ERR_RATE = 0.02;
102 static double MUT_RATE = 0.001;
103 static double INDEL_FRAC = 0.15;
104 static double INDEL_EXTEND = 0.3;
105 static double MAX_N_RATIO = 0.05;
106
wgsim_mut_diref(const kseq_t * ks,int is_hap,mutseq_t * hap1,mutseq_t * hap2)107 void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
108 {
109 int i, deleting = 0;
110 mutseq_t *ret[2];
111
112 ret[0] = hap1; ret[1] = hap2;
113 ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
114 ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
115 ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
116 ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
117 for (i = 0; i != ks->seq.l; ++i) {
118 int c;
119 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
120 if (deleting) {
121 if (drand48() < INDEL_EXTEND) {
122 if (deleting & 1) ret[0]->s[i] |= DELETE;
123 if (deleting & 2) ret[1]->s[i] |= DELETE;
124 continue;
125 } else deleting = 0;
126 }
127 if (c < 4 && drand48() < MUT_RATE) { // mutation
128 if (drand48() >= INDEL_FRAC) { // substitution
129 double r = drand48();
130 c = (c + (int)(r * 3.0 + 1)) & 3;
131 if (is_hap || drand48() < 0.333333) { // hom
132 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
133 } else { // het
134 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
135 }
136 } else { // indel
137 if (drand48() < 0.5) { // deletion
138 if (is_hap || drand48() < 0.333333) { // hom-del
139 ret[0]->s[i] = ret[1]->s[i] = DELETE;
140 deleting = 3;
141 } else { // het-del
142 deleting = drand48()<0.5?1:2;
143 ret[deleting-1]->s[i] = DELETE;
144 }
145 } else { // insertion
146 int num_ins = 0, ins = 0;
147 do {
148 num_ins++;
149 ins = (ins << 2) | (int)(drand48() * 4.0);
150 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
151
152 if (is_hap || drand48() < 0.333333) { // hom-ins
153 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
154 } else { // het-ins
155 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
156 }
157 }
158 }
159 }
160 }
161 }
wgsim_print_mutref(const char * name,const kseq_t * ks,mutseq_t * hap1,mutseq_t * hap2)162 void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
163 {
164 int i, j = 0; // j keeps the end of the last deletion
165 for (i = 0; i != ks->seq.l; ++i) {
166 int c[3];
167 c[0] = nst_nt4_table[(int)ks->seq.s[i]];
168 c[1] = hap1->s[i]; c[2] = hap2->s[i];
169 if (c[0] >= 4) continue;
170 if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
171 if (c[1] == c[2]) { // hom
172 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
173 printf("%s\t%d\t%c\t%c\t-\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
174 } else if ((c[1]&mutmsk) == DELETE) { // del
175 if (i >= j) {
176 printf("%s\t%d\t", name, i+1);
177 for (j = i; j < ks->seq.l && hap1->s[j] == hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j)
178 putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
179 printf("\t-\t-\n");
180 }
181 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins
182 printf("%s\t%d\t-\t", name, i+1);
183 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
184 while (n > 0) {
185 putchar("ACGTN"[ins & 0x3]);
186 ins >>= 2;
187 n--;
188 }
189 printf("\t-\n");
190 } // else: deleted base in a long deletion
191 } else { // het
192 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
193 printf("%s\t%d\t%c\t%c\t+\n", name, i+1, "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
194 } else if ((c[1]&mutmsk) == DELETE) {
195 if (i >= j) {
196 printf("%s\t%d\t", name, i+1);
197 for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j)
198 putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
199 printf("\t-\t-\n");
200 }
201 } else if ((c[2]&mutmsk) == DELETE) {
202 if (i >= j) {
203 printf("%s\t%d\t", name, i+1);
204 for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap2->s[j]&mutmsk) == DELETE; ++j)
205 putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
206 printf("\t-\t-\n");
207 }
208 } else if (((c[1] & mutmsk) >> 12) <= 4 && ((c[1] & mutmsk) >> 12) > 0) { // ins1
209 printf("%s\t%d\t-\t", name, i+1);
210 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
211 while (n > 0) {
212 putchar("ACGTN"[ins & 0x3]);
213 ins >>= 2;
214 n--;
215 }
216 printf("\t+\n");
217 } else if (((c[2] & mutmsk) >> 12) <= 4 || ((c[2] & mutmsk) >> 12) > 0) { // ins2
218 printf("%s\t%d\t-\t", name, i+1);
219 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
220 while (n > 0) {
221 putchar("ACGTN"[ins & 0x3]);
222 ins >>= 2;
223 n--;
224 }
225 printf("\t+\n");
226 } // else: deleted base in a long deletion
227 }
228 }
229 }
230 }
231
wgsim_core(FILE * fpout1,FILE * fpout2,const char * fn,int is_hap,uint64_t N,int dist,int std_dev,int size_l,int size_r)232 void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
233 {
234 kseq_t *ks;
235 mutseq_t rseq[2];
236 gzFile fp_fa;
237 uint64_t tot_len, ii;
238 int i, l, n_ref;
239 char *qstr;
240 int size[2], Q, max_size;
241 uint8_t *tmp_seq[2];
242 mut_t *target;
243 int max_loop, max_loop_err = 0;
244
245 fp_fa = gzopen(fn, "r");
246 if (fp_fa == NULL) {
247 fprintf(stderr, "[wgsim] can't open '%s': %s\n", fn, strerror(errno));
248 return;
249 }
250
251 l = size_l > size_r? size_l : size_r;
252 qstr = (char*)calloc(l+1, 1);
253 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
254 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
255 size[0] = size_l; size[1] = size_r;
256 max_size = size_l > size_r? size_l : size_r;
257
258 Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
259
260 ks = kseq_init(fp_fa);
261 tot_len = n_ref = 0;
262 fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
263 while ((l = kseq_read(ks)) >= 0) {
264 tot_len += l;
265 ++n_ref;
266 }
267 fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
268 kseq_destroy(ks);
269 gzclose(fp_fa);
270
271 fp_fa = gzopen(fn, "r");
272 ks = kseq_init(fp_fa);
273 while ((l = kseq_read(ks)) >= 0) {
274 uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
275 if (l < dist + 3 * std_dev) {
276 fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
277 continue;
278 }
279
280 // generate mutations and print them out
281 wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
282 wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
283
284 for (ii = 0; max_loop = 1000, ii != n_pairs; ++ii) { // the core loop
285 double ran;
286 int d, pos, s[2], is_flip;
287 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
288 FILE *fpo[2];
289
290 try_again:
291 is_flip = 0;
292
293 do { // avoid boundary failure
294 ran = ran_normal();
295 ran = ran * std_dev + dist;
296 d = (int)(ran + 0.5);
297 d = d > max_size? d : max_size;
298 pos = (int)((l - d + 1) * drand48());
299 } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
300
301 // flip or not
302 if (drand48() < 0.5) {
303 fpo[0] = fpout1; fpo[1] = fpout2;
304 s[0] = size[0]; s[1] = size[1];
305 } else {
306 fpo[1] = fpout1; fpo[0] = fpout2;
307 s[1] = size[0]; s[0] = size[1];
308 is_flip = 1;
309 }
310
311 // generate the read sequences
312 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
313 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
314
315 // forward read
316 for (i = pos, k = 0, ext_coor[0] = -10; i >= 0 && i < ks->seq.l && k < s[0]; ++i) {
317 int c = target[i], mut_type = c & mutmsk;
318 if (ext_coor[0] < 0) {
319 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue;
320 ext_coor[0] = i;
321 }
322 if (mut_type == DELETE) ++n_indel[0];
323 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) {
324 tmp_seq[0][k++] = c & 0xf;
325 if (mut_type == SUBSTITUTE) ++n_sub[0];
326 } else {
327 int n, ins;
328 ++n_indel[0];
329 tmp_seq[0][k++] = c & 0xf;
330 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[0]; --n, ins >>= 2)
331 tmp_seq[0][k++] = ins & 0x3;
332 }
333 }
334 if (k != s[0]) ext_coor[0] = -10;
335
336 // reverse read
337 for (i = pos + d - 1, k = 0, ext_coor[1] = -10; i >= 0 && i < ks->seq.l && k < s[1]; --i) {
338 int c = target[i], mut_type = c & mutmsk;
339 if (ext_coor[1] < 0) {
340 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue;
341 ext_coor[1] = i;
342 }
343 if (mut_type == DELETE) ++n_indel[1];
344 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) {
345 tmp_seq[1][k++] = c & 0xf;
346 if (mut_type == SUBSTITUTE) ++n_sub[1];
347 } else {
348 int n, ins;
349 ++n_indel[1];
350 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[1];)
351 tmp_seq[1][k++] = (ins>>(2*--n)) & 0x3;
352 tmp_seq[1][k++] = c & 0xf;
353 }
354 }
355 if (k != s[1]) ext_coor[1] = -10;
356
357
358
359 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
360 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
361 --ii;
362 continue;
363 }
364
365 // generate sequencing errors
366 for (j = 0; j < 2; ++j) {
367 int n_n = 0;
368 for (i = 0; i < s[j]; ++i) {
369 int c = tmp_seq[j][i];
370 if (c >= 4) { // actually c should be never larger than 4 if everything is correct
371 c = 4;
372 ++n_n;
373 } else if (drand48() < ERR_RATE) {
374 // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
375 c = (c + 1) & 3; // recurrent sequencing errors
376 ++n_err[j];
377 }
378 tmp_seq[j][i] = c;
379 }
380 if ((double)n_n / s[j] > MAX_N_RATIO) break;
381 }
382 if (j < 2) { // too many ambiguous bases on one of the reads
383 if (max_loop--) goto try_again;
384 if (!max_loop_err) {
385 fprintf(stderr, "Failed to produce a sequence with insufficient Ns. "
386 "Omitting some sequence-pairs\n");
387 max_loop_err = 1;
388 }
389 continue;
390 }
391
392 // print
393 for (j = 0; j < 2; ++j) {
394 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
395 qstr[i] = 0;
396 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
397 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
398 (long long)ii, j==0? is_flip+1 : 2-is_flip);
399 for (i = 0; i < s[j]; ++i)
400 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
401 fprintf(fpo[j], "\n+\n%s\n", qstr);
402 }
403 }
404 free(rseq[0].s); free(rseq[1].s);
405 }
406 kseq_destroy(ks);
407 gzclose(fp_fa);
408 free(qstr);
409 free(tmp_seq[0]); free(tmp_seq[1]);
410 }
411
simu_usage(void)412 static int simu_usage(void)
413 {
414 fprintf(stderr, "\n");
415 fprintf(stderr, "Program: wgsim (short read simulator)\n");
416 fprintf(stderr, "Version: %s\n", SAMTOOLS_VERSION);
417 fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
418 fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
419 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
420 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
421 fprintf(stderr, " -s INT standard deviation [50]\n");
422 fprintf(stderr, " -N INT number of read pairs [1000000]\n");
423 fprintf(stderr, " -1 INT length of the first read [70]\n");
424 fprintf(stderr, " -2 INT length of the second read [70]\n");
425 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
426 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
427 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
428 fprintf(stderr, " -S INT seed for random generator [0, use the current time]\n");
429 fprintf(stderr, " -A FLOAT discard if the fraction of ambiguous bases higher than FLOAT [%.2f]\n", MAX_N_RATIO);
430 fprintf(stderr, " -h haplotype mode\n");
431 fprintf(stderr, "\n");
432 return 1;
433 }
434
main(int argc,char * argv[])435 int main(int argc, char *argv[])
436 {
437 int64_t N;
438 int dist, std_dev, c, size_l, size_r, is_hap = 0;
439 FILE *fpout1, *fpout2;
440 int seed = 0;
441
442 N = 1000000; dist = 500; std_dev = 50;
443 size_l = size_r = 70;
444 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:A:")) >= 0) {
445 switch (c) {
446 case 'd': dist = atoi(optarg); break;
447 case 's': std_dev = atoi(optarg); break;
448 case 'N': N = atoi(optarg); break;
449 case '1': size_l = atoi(optarg); break;
450 case '2': size_r = atoi(optarg); break;
451 case 'e': ERR_RATE = atof(optarg); break;
452 case 'r': MUT_RATE = atof(optarg); break;
453 case 'R': INDEL_FRAC = atof(optarg); break;
454 case 'X': INDEL_EXTEND = atof(optarg); break;
455 case 'A': MAX_N_RATIO = atof(optarg); break;
456 case 'S': seed = atoi(optarg); break;
457 case 'h': is_hap = 1; break;
458 }
459 }
460 if (argc - optind < 3) return simu_usage();
461 fpout1 = fopen(argv[optind+1], "w");
462 fpout2 = fopen(argv[optind+2], "w");
463 if (!fpout1 || !fpout2) {
464 fprintf(stderr, "[wgsim] file open error\n");
465 return 1;
466 }
467 if (seed <= 0) seed = time(0)&0x7fffffff;
468 fprintf(stderr, "[wgsim] seed = %d\n", seed);
469 hts_srand48(seed);
470 wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r);
471
472 fclose(fpout1); fclose(fpout2);
473 return 0;
474 }
475