1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4 2011 Heng Li <lh3@live.co.uk>
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice shall be
15 included in all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 SOFTWARE.
25 */
26
27 /* This program is separated from maq's read simulator with Colin
28 * Hercus' modification to allow longer indels. */
29
30 #include <stdlib.h>
31 #include <math.h>
32 #include <time.h>
33 #include <assert.h>
34 #include <stdio.h>
35 #include <unistd.h>
36 #include <stdint.h>
37 #include <ctype.h>
38 #include <string.h>
39 #include <zlib.h>
40 #include "kseq.h"
41 KSEQ_INIT(gzFile, gzread)
42
43 #define PACKAGE_VERSION "0.3.1.mod1"
44
45 const uint8_t nst_nt4_table[256] = {
46 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
61 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
62 };
63
64 /* Simple normal random number generator, copied from genran.c */
65
ran_normal()66 double ran_normal()
67 {
68 static int iset = 0;
69 static double gset;
70 double fac, rsq, v1, v2;
71 if (iset == 0) {
72 do {
73 v1 = 2.0 * drand48() - 1.0;
74 v2 = 2.0 * drand48() - 1.0;
75 rsq = v1 * v1 + v2 * v2;
76 } while (rsq >= 1.0 || rsq == 0.0);
77 fac = sqrt(-2.0 * log(rsq) / rsq);
78 gset = v1 * fac;
79 iset = 1;
80 return v2 * fac;
81 } else {
82 iset = 0;
83 return gset;
84 }
85 }
86
87 /* wgsim */
88
89 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
90 typedef unsigned short mut_t;
91 static mut_t mutmsk = (mut_t)0xf000;
92
93 typedef struct {
94 int l, m; /* length and maximum buffer size */
95 mut_t *s; /* sequence */
96 } mutseq_t;
97
98 static double ERR_RATE = 0.02;
99 static double MUT_RATE = 0.001;
100 static double INDEL_FRAC = 0.15;
101 static double INDEL_EXTEND = 0.3;
102 static double MAX_N_RATIO = 0.05;
103
wgsim_mut_diref(const kseq_t * ks,int is_hap,mutseq_t * hap1,mutseq_t * hap2)104 void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
105 {
106 int i, deleting = 0;
107 mutseq_t *ret[2];
108
109 ret[0] = hap1; ret[1] = hap2;
110 ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
111 ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
112 ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
113 ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
114 for (i = 0; i != ks->seq.l; ++i) {
115 int c;
116 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
117 if (deleting) {
118 if (drand48() < INDEL_EXTEND) {
119 if (deleting & 1) ret[0]->s[i] |= DELETE;
120 if (deleting & 2) ret[1]->s[i] |= DELETE;
121 continue;
122 } else deleting = 0;
123 }
124 if (c < 4 && drand48() < MUT_RATE) { // mutation
125 if (drand48() >= INDEL_FRAC) { // substitution
126 double r = drand48();
127 c = (c + (int)(r * 3.0 + 1)) & 3;
128 if (is_hap || drand48() < 0.333333) { // hom
129 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
130 } else { // het
131 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
132 }
133 } else { // indel
134 if (drand48() < 0.5) { // deletion
135 if (is_hap || drand48() < 0.333333) { // hom-del
136 ret[0]->s[i] = ret[1]->s[i] = DELETE;
137 deleting = 3;
138 } else { // het-del
139 deleting = drand48()<0.5?1:2;
140 ret[deleting-1]->s[i] = DELETE;
141 }
142 } else { // insertion
143 int num_ins = 0, ins = 0;
144 do {
145 num_ins++;
146 ins = (ins << 2) | (int)(drand48() * 4.0);
147 } while (num_ins < 4 && drand48() < INDEL_EXTEND);
148
149 if (is_hap || drand48() < 0.333333) { // hom-ins
150 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
151 } else { // het-ins
152 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
153 }
154 }
155 }
156 }
157 }
158 }
wgsim_print_mutref(const char * name,const kseq_t * ks,mutseq_t * hap1,mutseq_t * hap2)159 void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
160 {
161 int i, j = 0; // j keeps the end of the last deletion
162 char ref[50]="", alt[50]="";
163 char *source="wgsim";
164 char *type="indel";
165 char *strand="+";
166 char *zygo="hom"; //zygosity
167 int start=0, end=0;
168 char attr[500];
169
170 for (i = 0; i != ks->seq.l; ++i) {
171 int c[3];
172
173 zygo="hom";
174 // GFF coordinates
175 start = end = i + 1;
176 c[0] = nst_nt4_table[(int)ks->seq.s[i]];
177 c[1] = hap1->s[i]; c[2] = hap2->s[i];
178 if (c[0] >= 4) continue;
179 alt[0] = ref[0]='\0';
180
181 if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
182
183 if (c[1] == c[2]) { // hom
184 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
185 strncat(ref, &"ACGTN"[c[0]], 1);
186 strncat(alt, &"ACGTN"[c[1]&0xf], 1);
187 type= "snp";
188 //printf("%s\t%s\t%s\t%d\t%d\t.\t.\t.\tName=%s/%s;Ref=%s;Alt=%s\n", name, source, type, i+1, i+2, ref, alt, ref, alt);
189 } else if ((c[1]&mutmsk) == DELETE) { // del
190 if (i >= j) {
191 //printf("%s\tsim\tdel\t%d\t%d\t", name, i+1, i+2);
192 for (j = i; j < ks->seq.l && hap1->s[j] == hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j)
193 strncat(ref, &"ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]], 1);
194 //putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
195 strncat(alt, &"-"[0], 1);
196 //printf("\t-\t-\n");
197 }
198 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins
199 //printf("%s\twgsim\tins\t%d\t%d\t-\t", name, i+1, i+2);
200 strncat(ref, &"-"[0], 1);
201 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
202 while (n > 0) {
203 strncat(alt, &"ACGTN"[ins & 0x3], 1);
204 //putchar("ACGTN"[ins & 0x3]);
205 ins >>= 2;
206 n--;
207 }
208 //printf("\t-\n");
209 } // else: deleted base in a long deletion
210 } else { // het
211 zygo="het";
212 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
213 type="snp";
214 strncat(ref, &"ACGTN"[c[0]], 1);
215 strncat(alt, &"XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)], 1);
216
217 //printf("%s\twgsim\tsnp\t%d\t%c\t%c\t+\n", name, i+1, "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
218 } else if ((c[1]&mutmsk) == DELETE) {
219 if (i >= j) {
220 //printf("%s\t%d\t", name, i+1);
221 for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j)
222 strncat(ref, &"ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]], 1);
223 //putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
224 strncat(alt, &"-"[0], 1);
225 //printf("\t-\t-\n");
226 }
227 } else if ((c[2]&mutmsk) == DELETE) {
228 if (i >= j) {
229 //printf("%s\t%d\t", name, i+1);
230 strncat(ref, &"-"[0], 1);
231 for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap2->s[j]&mutmsk) == DELETE; ++j)
232 strncat(alt, &"ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]], 1);
233 //putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
234 //printf("\t-\t-\n");
235 }
236 } else if (((c[1] & mutmsk) >> 12) <= 4 && ((c[1] & mutmsk) >> 12) > 0) { // ins1
237 strncat(ref, &"-"[0], 1);
238
239 //printf("%s\t%d\t-\t", name, i+1);
240 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
241
242 while (n > 0) {
243 strncat(alt, &"ACGTN"[ins & 0x3], 1);
244 //putchar("ACGTN"[ins & 0x3]);
245 ins >>= 2;
246 n--;
247 }
248 //printf("\t+\n");
249 } else if (((c[2] & mutmsk) >> 12) <= 4 || ((c[2] & mutmsk) >> 12) > 0) { // ins2
250 //printf("%s\t%d\t-\t", name, i+1);
251 strncat(ref, &"-"[0], 1);
252 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
253 while (n > 0) {
254 strncat(alt, &"ACGTN"[ins & 0x3], 1);
255 //putchar("ACGTN"[ins & 0x3]);
256 ins >>= 2;
257 n--;
258 }
259 //printf("\t+\n");
260 } // else: deleted base in a long deletion
261 }
262 snprintf(attr, 500, "Name=%s/%s;Ref=%s;Alt=%s;Type=%s",ref, alt, ref, alt, zygo);
263 printf("%s\t%s\t%s\t%d\t%d\t.\t%s\t.\t%s\n",name, source, type, start, end, strand, attr);
264 }
265 }
266 }
267
wgsim_core(FILE * fpout1,FILE * fpout2,const char * fn,int is_hap,uint64_t N,int dist,int std_dev,int size_l,int size_r,int is_fixed)268 void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r, int is_fixed)
269 {
270 kseq_t *ks;
271 mutseq_t rseq[2];
272 gzFile fp_fa;
273 uint64_t tot_len, ii;
274 int i, l, n_ref;
275 char *qstr;
276 int size[2], Q, max_size;
277 uint8_t *tmp_seq[2];
278 mut_t *target;
279 uint64_t n_pairs;
280
281 l = size_l > size_r? size_l : size_r;
282 qstr = (char*)calloc(l+1, 1);
283 tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
284 tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
285 size[0] = size_l; size[1] = size_r;
286 max_size = size_l > size_r? size_l : size_r;
287
288 Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
289
290 fp_fa = gzopen(fn, "r");
291 ks = kseq_init(fp_fa);
292 tot_len = n_ref = 0;
293 //fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
294 while ((l = kseq_read(ks)) >= 0) {
295 tot_len += l;
296 ++n_ref;
297 }
298 //fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
299 kseq_destroy(ks);
300 gzclose(fp_fa);
301
302 fp_fa = gzopen(fn, "r");
303 ks = kseq_init(fp_fa);
304
305 if (is_fixed) {
306 //fprintf(stderr, "[%s] fixed mode, generating exactly %llu reads for each contig\n", __func__, N);
307 } else {
308 //float cov = N * (size_l + size_r) / tot_len;
309 //fprintf(stderr, "[%s] generating %llu reads in total for an average coverage %0.1fx per contig\n", __func__, N, cov);
310 }
311
312 while ((l = kseq_read(ks)) >= 0) {
313
314 if (is_fixed) {
315 n_pairs = N;
316 } else {
317 n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
318 }
319
320 if (l < dist + 3 * std_dev) {
321 fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
322 continue;
323 }
324
325 // generate mutations and print them out
326 wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
327 wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
328
329 for (ii = 0; ii != n_pairs; ++ii) { // the core loop
330 double ran;
331 int d, pos, s[2], is_flip = 0;
332 int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
333 FILE *fpo[2];
334
335 do { // avoid boundary failure
336 ran = ran_normal();
337 ran = ran * std_dev + dist;
338 d = (int)(ran + 0.5);
339 d = d > max_size? d : max_size;
340 pos = (int)((l - d + 1) * drand48());
341 } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
342
343 // flip or not
344 if (drand48() < 0.5) {
345 fpo[0] = fpout1; fpo[1] = fpout2;
346 s[0] = size[0]; s[1] = size[1];
347 } else {
348 fpo[1] = fpout1; fpo[0] = fpout2;
349 s[1] = size[0]; s[0] = size[1];
350 is_flip = 1;
351 }
352
353 // generate the read sequences
354 target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
355 n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
356
357 #define __gen_read(x, start, iter) do { \
358 for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \
359 int c = target[i], mut_type = c & mutmsk; \
360 if (ext_coor[x] < 0) { \
361 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
362 ext_coor[x] = i; \
363 } \
364 if (mut_type == DELETE) ++n_indel[x]; \
365 else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
366 tmp_seq[x][k++] = c & 0xf; \
367 if (mut_type == SUBSTITUTE) ++n_sub[x]; \
368 } else { \
369 int n, ins; \
370 ++n_indel[x]; \
371 tmp_seq[x][k++] = c & 0xf; \
372 for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
373 tmp_seq[x][k++] = ins & 0x3; \
374 } \
375 } \
376 if (k != s[x]) ext_coor[x] = -10; \
377 } while (0)
378
379 __gen_read(0, pos, ++i);
380 __gen_read(1, pos + d - 1, --i);
381 for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
382 if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
383 --ii;
384 continue;
385 }
386
387 // generate sequencing errors
388 for (j = 0; j < 2; ++j) {
389 int n_n = 0;
390 for (i = 0; i < s[j]; ++i) {
391 int c = tmp_seq[j][i];
392 if (c >= 4) { // actually c should be never larger than 4 if everything is correct
393 c = 4;
394 ++n_n;
395 } else if (drand48() < ERR_RATE) {
396 // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
397 c = (c + 1) & 3; // recurrent sequencing errors
398 ++n_err[j];
399 }
400 tmp_seq[j][i] = c;
401 }
402 if ((double)n_n / s[j] > MAX_N_RATIO) break;
403 }
404 if (j < 2) { // too many ambiguous bases on one of the reads
405 --ii;
406 continue;
407 }
408
409 // print
410 for (j = 0; j < 2; ++j) {
411 for (i = 0; i < s[j]; ++i) qstr[i] = Q;
412 qstr[i] = 0;
413 fprintf(fpo[j], "@%s|%u|%u|%d:%d:%d|%d:%d:%d|%llx\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
414 n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
415 (long long)ii);
416 for (i = 0; i < s[j]; ++i)
417 fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
418 fprintf(fpo[j], "\n+\n%s\n", qstr);
419 }
420 }
421 free(rseq[0].s); free(rseq[1].s);
422 }
423 kseq_destroy(ks);
424 gzclose(fp_fa);
425 free(qstr);
426 free(tmp_seq[0]); free(tmp_seq[1]);
427 }
428
simu_usage()429 static int simu_usage()
430 {
431 fprintf(stderr, "\n");
432 fprintf(stderr, "Program: pywgsim (short read simulator)\n");
433 fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
434 fprintf(stderr, "Original: https://github.com/lh3/wgsim\n");
435 fprintf(stderr, "Modified: https://github.com/ialbert/pywgsim\n\n");
436 fprintf(stderr, "Usage: pywgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
437 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
438 fprintf(stderr, " -d INT outer distance between the two ends [500]\n");
439 fprintf(stderr, " -s INT standard deviation [50]\n");
440 fprintf(stderr, " -N INT number of read pairs [10000]\n");
441 fprintf(stderr, " -1 INT length of the first read [100]\n");
442 fprintf(stderr, " -2 INT length of the second read [100]\n");
443 fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE);
444 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
445 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
446 fprintf(stderr, " -S INT seed for random generator [-1]\n");
447 fprintf(stderr, " -A FLOAT disgard if the fraction of ambiguous bases higher than FLOAT [%.2f]\n", MAX_N_RATIO);
448 fprintf(stderr, " -h haploid mode\n");
449 fprintf(stderr, " -f fix the number of read pairs to N per contig\n");
450
451 fprintf(stderr, "\n");
452 return 1;
453 }
454
455 /*
456 The Python wrapped function exposes all parameters as Python primitives.
457 */
wgsim_wrap(const char * r1,const char * r2,const char * ref,float err_rate,float mut_rate,float indel_frac,float indel_extend,float max_n,int is_hap,long N,int dist,int std_dev,int size_l,int size_r,int is_fixed,int seed)458 int wgsim_wrap(const char *r1, const char *r2, const char *ref, float err_rate, float mut_rate, float indel_frac, float indel_extend, float max_n, int is_hap, long N, int dist, int std_dev, int size_l, int size_r, int is_fixed, int seed)
459 {
460 FILE *fpout1, *fpout2;
461
462 // Set the global variables.
463 ERR_RATE = err_rate;
464 MUT_RATE = mut_rate;
465 INDEL_FRAC = indel_frac;
466 INDEL_EXTEND = indel_extend;
467 MAX_N_RATIO = max_n;
468
469 if (seed <= 0) seed = time(0)&0x7fffffff;
470
471 srand48(seed);
472
473 fpout1 = fopen(r1, "w");
474 fpout2 = fopen(r2, "w");
475
476 if (!fpout1 || !fpout2) {
477 fprintf(stderr, "[wgsim] file open error\n");
478 return 1;
479 }
480
481 // Print GFF header and simulation paramteres
482 printf("##gff-version 3\n");
483 printf("#\n# N=%ld err=%0.4g mut=%0.4g frac=%0.4g ext=%0.4g dist=%d stdev=%d L1=%d L2=%d seed=%d\n#\n",\
484 N, err_rate, mut_rate, indel_frac, indel_extend, dist, std_dev, size_l, size_r, seed);
485
486 // Call main function
487 wgsim_core(fpout1, fpout2, ref, is_hap, (uint64_t) N, dist, std_dev, size_l, size_r, is_fixed);
488
489 fclose(fpout1); fclose(fpout2);
490
491 return 0;
492 }
493
main(int argc,char * argv[])494 int main(int argc, char *argv[])
495 {
496 int64_t N;
497 int dist, std_dev, c, size_l, size_r, is_hap = 0, is_fixed=0;
498 FILE *fpout1, *fpout2;
499 int seed = -1;
500
501
502 N = 10000; dist = 500; std_dev = 50;
503 size_l = size_r = 100;
504
505 while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:fhX:S:A:")) >= 0) {
506 switch (c) {
507 case 'd': dist = atoi(optarg); break;
508 case 's': std_dev = atoi(optarg); break;
509 case 'N': N = atoi(optarg); break;
510 case '1': size_l = atoi(optarg); break;
511 case '2': size_r = atoi(optarg); break;
512 case 'e': ERR_RATE = atof(optarg); break;
513 case 'r': MUT_RATE = atof(optarg); break;
514 case 'R': INDEL_FRAC = atof(optarg); break;
515 case 'X': INDEL_EXTEND = atof(optarg); break;
516 case 'A': MAX_N_RATIO = atof(optarg); break;
517 case 'S': seed = atoi(optarg); break;
518 case 'h': is_hap = 1; break;
519 case 'f': is_fixed = 1; break;
520 }
521 }
522 if (argc - optind < 3) return simu_usage();
523 fpout1 = fopen(argv[optind+1], "w");
524 fpout2 = fopen(argv[optind+2], "w");
525 if (!fpout1 || !fpout2) {
526 fprintf(stderr, "[wgsim] file open error\n");
527 return 1;
528 }
529 if (seed <= 0) seed = time(0)&0x7fffffff;
530 //fprintf(stderr, "[wgsim] seed = %d\n", seed);
531 srand48(seed);
532
533 printf("##gff-version 3\n");
534
535 wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r, is_fixed);
536
537 fclose(fpout1); fclose(fpout2);
538 return 0;
539 }
540