1 /* The MIT License
2 
3    Copyright (c) 2008, 2018 Genome Research Ltd (GRL).
4                  2011 Heng Li <lh3@live.co.uk>
5 
6    Permission is hereby granted, free of charge, to any person obtaining
7    a copy of this software and associated documentation files (the
8    "Software"), to deal in the Software without restriction, including
9    without limitation the rights to use, copy, modify, merge, publish,
10    distribute, sublicense, and/or sell copies of the Software, and to
11    permit persons to whom the Software is furnished to do so, subject to
12    the following conditions:
13 
14    The above copyright notice and this permission notice shall be
15    included in all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24    SOFTWARE.
25 */
26 
27 /* This program is separated from maq's read simulator with Colin
28  * Hercus' modification to allow longer indels. */
29 
30 #include <config.h>
31 
32 #include <stdlib.h>
33 #include <math.h>
34 #include <time.h>
35 #include <assert.h>
36 #include <stdio.h>
37 #include <unistd.h>
38 #include <stdint.h>
39 #include <ctype.h>
40 #include <string.h>
41 #include <errno.h>
42 #include <zlib.h>
43 #include "../version.h"
44 #include "htslib/kseq.h"
45 #include "htslib/hts_os.h"
46 KSEQ_INIT(gzFile, gzread)
47 
48 const uint8_t nst_nt4_table[256] = {
49     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
50     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
51     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /*'-'*/, 4, 4,
52     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
53     4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
54     4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
55     4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
56     4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
57     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
58     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
59     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
60     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
61     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
62     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
63     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
64     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
65 };
66 
67 /* Simple normal random number generator, copied from genran.c */
68 
ran_normal(void)69 double ran_normal(void)
70 {
71     static int iset = 0;
72     static double gset;
73     double fac, rsq, v1, v2;
74     if (iset == 0) {
75         do {
76             v1 = 2.0 * drand48() - 1.0;
77             v2 = 2.0 * drand48() - 1.0;
78             rsq = v1 * v1 + v2 * v2;
79         } while (rsq >= 1.0 || rsq == 0.0);
80         fac = sqrt(-2.0 * log(rsq) / rsq);
81         gset = v1 * fac;
82         iset = 1;
83         return v2 * fac;
84     } else {
85         iset = 0;
86         return gset;
87     }
88 }
89 
90 /* wgsim */
91 
92 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
93 typedef unsigned short mut_t;
94 static mut_t mutmsk = (mut_t)0xf000;
95 
96 typedef struct {
97     int l, m; /* length and maximum buffer size */
98     mut_t *s; /* sequence */
99 } mutseq_t;
100 
101 static double ERR_RATE = 0.02;
102 static double MUT_RATE = 0.001;
103 static double INDEL_FRAC = 0.15;
104 static double INDEL_EXTEND = 0.3;
105 static double MAX_N_RATIO = 0.05;
106 
wgsim_mut_diref(const kseq_t * ks,int is_hap,mutseq_t * hap1,mutseq_t * hap2)107 void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
108 {
109     int i, deleting = 0;
110     mutseq_t *ret[2];
111 
112     ret[0] = hap1; ret[1] = hap2;
113     ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
114     ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
115     ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
116     ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
117     for (i = 0; i != ks->seq.l; ++i) {
118         int c;
119         c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
120         if (deleting) {
121             if (drand48() < INDEL_EXTEND) {
122                 if (deleting & 1) ret[0]->s[i] |= DELETE;
123                 if (deleting & 2) ret[1]->s[i] |= DELETE;
124                 continue;
125             } else deleting = 0;
126         }
127         if (c < 4 && drand48() < MUT_RATE) { // mutation
128             if (drand48() >= INDEL_FRAC) { // substitution
129                 double r = drand48();
130                 c = (c + (int)(r * 3.0 + 1)) & 3;
131                 if (is_hap || drand48() < 0.333333) { // hom
132                     ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
133                 } else { // het
134                     ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
135                 }
136             } else { // indel
137                 if (drand48() < 0.5) { // deletion
138                     if (is_hap || drand48() < 0.333333) { // hom-del
139                         ret[0]->s[i] = ret[1]->s[i] = DELETE;
140                         deleting = 3;
141                     } else { // het-del
142                         deleting = drand48()<0.5?1:2;
143                         ret[deleting-1]->s[i] = DELETE;
144                     }
145                 } else { // insertion
146                     int num_ins = 0, ins = 0;
147                     do {
148                         num_ins++;
149                         ins = (ins << 2) | (int)(drand48() * 4.0);
150                     } while (num_ins < 4 && drand48() < INDEL_EXTEND);
151 
152                     if (is_hap || drand48() < 0.333333) { // hom-ins
153                         ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
154                     } else { // het-ins
155                         ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
156                     }
157                 }
158             }
159         }
160     }
161 }
wgsim_print_mutref(const char * name,const kseq_t * ks,mutseq_t * hap1,mutseq_t * hap2)162 void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
163 {
164     int i, j = 0; // j keeps the end of the last deletion
165     for (i = 0; i != ks->seq.l; ++i) {
166         int c[3];
167         c[0] = nst_nt4_table[(int)ks->seq.s[i]];
168         c[1] = hap1->s[i]; c[2] = hap2->s[i];
169         if (c[0] >= 4) continue;
170         if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
171             if (c[1] == c[2]) { // hom
172                 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
173                     printf("%s\t%d\t%c\t%c\t-\n", name, i+1, "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
174                 } else if ((c[1]&mutmsk) == DELETE) { // del
175                     if (i >= j) {
176                         printf("%s\t%d\t", name, i+1);
177                         for (j = i; j < ks->seq.l && hap1->s[j] == hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j)
178                             putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
179                         printf("\t-\t-\n");
180                     }
181                 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins
182                     printf("%s\t%d\t-\t", name, i+1);
183                     int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
184                     while (n > 0) {
185                         putchar("ACGTN"[ins & 0x3]);
186                         ins >>= 2;
187                         n--;
188                     }
189                     printf("\t-\n");
190                 } // else: deleted base in a long deletion
191             } else { // het
192                 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
193                     printf("%s\t%d\t%c\t%c\t+\n", name, i+1, "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
194                 } else if ((c[1]&mutmsk) == DELETE) {
195                     if (i >= j) {
196                         printf("%s\t%d\t", name, i+1);
197                         for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j)
198                             putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
199                         printf("\t-\t-\n");
200                     }
201                 } else if ((c[2]&mutmsk) == DELETE) {
202                     if (i >= j) {
203                         printf("%s\t%d\t", name, i+1);
204                         for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap2->s[j]&mutmsk) == DELETE; ++j)
205                             putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
206                         printf("\t-\t-\n");
207                     }
208                 } else if (((c[1] & mutmsk) >> 12) <= 4 && ((c[1] & mutmsk) >> 12) > 0) { // ins1
209                     printf("%s\t%d\t-\t", name, i+1);
210                     int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
211                     while (n > 0) {
212                         putchar("ACGTN"[ins & 0x3]);
213                         ins >>= 2;
214                         n--;
215                     }
216                     printf("\t+\n");
217                 } else if (((c[2] & mutmsk) >> 12) <= 4 || ((c[2] & mutmsk) >> 12) > 0) { // ins2
218                     printf("%s\t%d\t-\t", name, i+1);
219                     int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
220                     while (n > 0) {
221                         putchar("ACGTN"[ins & 0x3]);
222                         ins >>= 2;
223                         n--;
224                     }
225                     printf("\t+\n");
226                 } // else: deleted base in a long deletion
227             }
228         }
229     }
230 }
231 
wgsim_core(FILE * fpout1,FILE * fpout2,const char * fn,int is_hap,uint64_t N,int dist,int std_dev,int size_l,int size_r)232 void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r)
233 {
234     kseq_t *ks;
235     mutseq_t rseq[2];
236     gzFile fp_fa;
237     uint64_t tot_len, ii;
238     int i, l, n_ref;
239     char *qstr;
240     int size[2], Q, max_size;
241     uint8_t *tmp_seq[2];
242     mut_t *target;
243     int max_loop, max_loop_err = 0;
244 
245     fp_fa = gzopen(fn, "r");
246     if (fp_fa == NULL) {
247         fprintf(stderr, "[wgsim] can't open '%s': %s\n", fn, strerror(errno));
248         return;
249     }
250 
251     l = size_l > size_r? size_l : size_r;
252     qstr = (char*)calloc(l+1, 1);
253     tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
254     tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
255     size[0] = size_l; size[1] = size_r;
256     max_size = size_l > size_r? size_l : size_r;
257 
258     Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
259 
260     ks = kseq_init(fp_fa);
261     tot_len = n_ref = 0;
262     fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
263     while ((l = kseq_read(ks)) >= 0) {
264         tot_len += l;
265         ++n_ref;
266     }
267     fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
268     kseq_destroy(ks);
269     gzclose(fp_fa);
270 
271     fp_fa = gzopen(fn, "r");
272     ks = kseq_init(fp_fa);
273     while ((l = kseq_read(ks)) >= 0) {
274         uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
275         if (l < dist + 3 * std_dev) {
276             fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
277             continue;
278         }
279 
280         // generate mutations and print them out
281         wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
282         wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
283 
284         for (ii = 0; max_loop = 1000, ii != n_pairs; ++ii) { // the core loop
285             double ran;
286             int d, pos, s[2], is_flip;
287             int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
288             FILE *fpo[2];
289 
290         try_again:
291             is_flip = 0;
292 
293             do { // avoid boundary failure
294                 ran = ran_normal();
295                 ran = ran * std_dev + dist;
296                 d = (int)(ran + 0.5);
297                 d = d > max_size? d : max_size;
298                 pos = (int)((l - d + 1) * drand48());
299             } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
300 
301             // flip or not
302             if (drand48() < 0.5) {
303                 fpo[0] = fpout1; fpo[1] = fpout2;
304                 s[0] = size[0]; s[1] = size[1];
305             } else {
306                 fpo[1] = fpout1; fpo[0] = fpout2;
307                 s[1] = size[0]; s[0] = size[1];
308                 is_flip = 1;
309             }
310 
311             // generate the read sequences
312             target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
313             n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
314 
315             // forward read
316             for (i = pos, k = 0, ext_coor[0] = -10; i >= 0 && i < ks->seq.l && k < s[0]; ++i) {
317                 int c = target[i], mut_type = c & mutmsk;
318                 if (ext_coor[0] < 0) {
319                     if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue;
320                     ext_coor[0] = i;
321                 }
322                 if (mut_type == DELETE) ++n_indel[0];
323                 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) {
324                     tmp_seq[0][k++] = c & 0xf;
325                     if (mut_type == SUBSTITUTE) ++n_sub[0];
326                 } else {
327                     int n, ins;
328                     ++n_indel[0];
329                     tmp_seq[0][k++] = c & 0xf;
330                     for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[0]; --n, ins >>= 2)
331                         tmp_seq[0][k++] = ins & 0x3;
332                 }
333             }
334             if (k != s[0]) ext_coor[0] = -10;
335 
336             // reverse read
337             for (i = pos + d - 1, k = 0, ext_coor[1] = -10; i >= 0 && i < ks->seq.l && k < s[1]; --i) {
338                 int c = target[i], mut_type = c & mutmsk;
339                 if (ext_coor[1] < 0) {
340                     if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue;
341                     ext_coor[1] = i;
342                 }
343                 if (mut_type == DELETE) ++n_indel[1];
344                 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) {
345                     tmp_seq[1][k++] = c & 0xf;
346                     if (mut_type == SUBSTITUTE) ++n_sub[1];
347                 } else {
348                     int n, ins;
349                     ++n_indel[1];
350                     for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[1];)
351                         tmp_seq[1][k++] = (ins>>(2*--n)) & 0x3;
352                     tmp_seq[1][k++] = c & 0xf;
353                 }
354             }
355             if (k != s[1]) ext_coor[1] = -10;
356 
357 
358 
359             for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
360             if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
361                 --ii;
362                 continue;
363             }
364 
365             // generate sequencing errors
366             for (j = 0; j < 2; ++j) {
367                 int n_n = 0;
368                 for (i = 0; i < s[j]; ++i) {
369                     int c = tmp_seq[j][i];
370                     if (c >= 4) { // actually c should be never larger than 4 if everything is correct
371                         c = 4;
372                         ++n_n;
373                     } else if (drand48() < ERR_RATE) {
374                         // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
375                         c = (c + 1) & 3; // recurrent sequencing errors
376                         ++n_err[j];
377                     }
378                     tmp_seq[j][i] = c;
379                 }
380                 if ((double)n_n / s[j] > MAX_N_RATIO) break;
381             }
382             if (j < 2) { // too many ambiguous bases on one of the reads
383                 if (max_loop--) goto try_again;
384                 if (!max_loop_err) {
385                     fprintf(stderr, "Failed to produce a sequence with insufficient Ns. "
386                             "Omitting some sequence-pairs\n");
387                     max_loop_err = 1;
388                 }
389                 continue;
390             }
391 
392             // print
393             for (j = 0; j < 2; ++j) {
394                 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
395                 qstr[i] = 0;
396                 fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
397                         n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
398                         (long long)ii, j==0? is_flip+1 : 2-is_flip);
399                 for (i = 0; i < s[j]; ++i)
400                     fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
401                 fprintf(fpo[j], "\n+\n%s\n", qstr);
402             }
403         }
404         free(rseq[0].s); free(rseq[1].s);
405     }
406     kseq_destroy(ks);
407     gzclose(fp_fa);
408     free(qstr);
409     free(tmp_seq[0]); free(tmp_seq[1]);
410 }
411 
simu_usage(void)412 static int simu_usage(void)
413 {
414     fprintf(stderr, "\n");
415     fprintf(stderr, "Program: wgsim (short read simulator)\n");
416     fprintf(stderr, "Version: %s\n", SAMTOOLS_VERSION);
417     fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
418     fprintf(stderr, "Usage:   wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
419     fprintf(stderr, "Options: -e FLOAT      base error rate [%.3f]\n", ERR_RATE);
420     fprintf(stderr, "         -d INT        outer distance between the two ends [500]\n");
421     fprintf(stderr, "         -s INT        standard deviation [50]\n");
422     fprintf(stderr, "         -N INT        number of read pairs [1000000]\n");
423     fprintf(stderr, "         -1 INT        length of the first read [70]\n");
424     fprintf(stderr, "         -2 INT        length of the second read [70]\n");
425     fprintf(stderr, "         -r FLOAT      rate of mutations [%.4f]\n", MUT_RATE);
426     fprintf(stderr, "         -R FLOAT      fraction of indels [%.2f]\n", INDEL_FRAC);
427     fprintf(stderr, "         -X FLOAT      probability an indel is extended [%.2f]\n", INDEL_EXTEND);
428     fprintf(stderr, "         -S INT        seed for random generator [0, use the current time]\n");
429     fprintf(stderr, "         -A FLOAT      discard if the fraction of ambiguous bases higher than FLOAT [%.2f]\n", MAX_N_RATIO);
430     fprintf(stderr, "         -h            haplotype mode\n");
431     fprintf(stderr, "\n");
432     return 1;
433 }
434 
main(int argc,char * argv[])435 int main(int argc, char *argv[])
436 {
437     int64_t N;
438     int dist, std_dev, c, size_l, size_r, is_hap = 0;
439     FILE *fpout1, *fpout2;
440     int seed = 0;
441 
442     N = 1000000; dist = 500; std_dev = 50;
443     size_l = size_r = 70;
444     while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:A:")) >= 0) {
445         switch (c) {
446         case 'd': dist = atoi(optarg); break;
447         case 's': std_dev = atoi(optarg); break;
448         case 'N': N = atoi(optarg); break;
449         case '1': size_l = atoi(optarg); break;
450         case '2': size_r = atoi(optarg); break;
451         case 'e': ERR_RATE = atof(optarg); break;
452         case 'r': MUT_RATE = atof(optarg); break;
453         case 'R': INDEL_FRAC = atof(optarg); break;
454         case 'X': INDEL_EXTEND = atof(optarg); break;
455         case 'A': MAX_N_RATIO = atof(optarg); break;
456         case 'S': seed = atoi(optarg); break;
457         case 'h': is_hap = 1; break;
458         }
459     }
460     if (argc - optind < 3) return simu_usage();
461     fpout1 = fopen(argv[optind+1], "w");
462     fpout2 = fopen(argv[optind+2], "w");
463     if (!fpout1 || !fpout2) {
464         fprintf(stderr, "[wgsim] file open error\n");
465         return 1;
466     }
467     if (seed <= 0) seed = time(0)&0x7fffffff;
468     fprintf(stderr, "[wgsim] seed = %d\n", seed);
469     hts_srand48(seed);
470     wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r);
471 
472     fclose(fpout1); fclose(fpout2);
473     return 0;
474 }
475