1 /* The MIT License
2 
3    Copyright (c) 2008 Genome Research Ltd (GRL).
4                  2011 Heng Li <lh3@live.co.uk>
5 
6    Permission is hereby granted, free of charge, to any person obtaining
7    a copy of this software and associated documentation files (the
8    "Software"), to deal in the Software without restriction, including
9    without limitation the rights to use, copy, modify, merge, publish,
10    distribute, sublicense, and/or sell copies of the Software, and to
11    permit persons to whom the Software is furnished to do so, subject to
12    the following conditions:
13 
14    The above copyright notice and this permission notice shall be
15    included in all copies or substantial portions of the Software.
16 
17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24    SOFTWARE.
25 */
26 
27 /* This program is separated from maq's read simulator with Colin
28  * Hercus' modification to allow longer indels. */
29 
30 #include <stdlib.h>
31 #include <math.h>
32 #include <time.h>
33 #include <assert.h>
34 #include <stdio.h>
35 #include <unistd.h>
36 #include <stdint.h>
37 #include <ctype.h>
38 #include <string.h>
39 #include <zlib.h>
40 #include "kseq.h"
41 KSEQ_INIT(gzFile, gzread)
42 
43 #define PACKAGE_VERSION "0.3.1.mod1"
44 
45 const uint8_t nst_nt4_table[256] = {
46 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
47 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
48 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /*'-'*/, 4, 4,
49 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
50 	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
51 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
52 	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
53 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
54 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
55 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
56 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
57 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
58 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
59 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
60 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
61 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
62 };
63 
64 /* Simple normal random number generator, copied from genran.c */
65 
ran_normal()66 double ran_normal()
67 {
68 	static int iset = 0;
69 	static double gset;
70 	double fac, rsq, v1, v2;
71 	if (iset == 0) {
72 		do {
73 			v1 = 2.0 * drand48() - 1.0;
74 			v2 = 2.0 * drand48() - 1.0;
75 			rsq = v1 * v1 + v2 * v2;
76 		} while (rsq >= 1.0 || rsq == 0.0);
77 		fac = sqrt(-2.0 * log(rsq) / rsq);
78 		gset = v1 * fac;
79 		iset = 1;
80 		return v2 * fac;
81 	} else {
82 		iset = 0;
83 		return gset;
84 	}
85 }
86 
87 /* wgsim */
88 
89 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
90 typedef unsigned short mut_t;
91 static mut_t mutmsk = (mut_t)0xf000;
92 
93 typedef struct {
94 	int l, m; /* length and maximum buffer size */
95 	mut_t *s; /* sequence */
96 } mutseq_t;
97 
98 static double ERR_RATE = 0.02;
99 static double MUT_RATE = 0.001;
100 static double INDEL_FRAC = 0.15;
101 static double INDEL_EXTEND = 0.3;
102 static double MAX_N_RATIO = 0.05;
103 
wgsim_mut_diref(const kseq_t * ks,int is_hap,mutseq_t * hap1,mutseq_t * hap2)104 void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
105 {
106 	int i, deleting = 0;
107 	mutseq_t *ret[2];
108 
109 	ret[0] = hap1; ret[1] = hap2;
110 	ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l;
111 	ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m;
112 	ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
113 	ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t));
114 	for (i = 0; i != ks->seq.l; ++i) {
115 		int c;
116 		c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]];
117         if (deleting) {
118             if (drand48() < INDEL_EXTEND) {
119                 if (deleting & 1) ret[0]->s[i] |= DELETE;
120                 if (deleting & 2) ret[1]->s[i] |= DELETE;
121                 continue;
122             } else deleting = 0;
123         }
124 		if (c < 4 && drand48() < MUT_RATE) { // mutation
125 			if (drand48() >= INDEL_FRAC) { // substitution
126 				double r = drand48();
127 				c = (c + (int)(r * 3.0 + 1)) & 3;
128 				if (is_hap || drand48() < 0.333333) { // hom
129 					ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
130 				} else { // het
131 					ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
132 				}
133 			} else { // indel
134 				if (drand48() < 0.5) { // deletion
135 					if (is_hap || drand48() < 0.333333) { // hom-del
136 						ret[0]->s[i] = ret[1]->s[i] = DELETE;
137                         deleting = 3;
138 					} else { // het-del
139                         deleting = drand48()<0.5?1:2;
140 						ret[deleting-1]->s[i] = DELETE;
141 					}
142 				} else { // insertion
143                     int num_ins = 0, ins = 0;
144                     do {
145                         num_ins++;
146                         ins = (ins << 2) | (int)(drand48() * 4.0);
147                     } while (num_ins < 4 && drand48() < INDEL_EXTEND);
148 
149 					if (is_hap || drand48() < 0.333333) { // hom-ins
150 						ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
151 					} else { // het-ins
152 						ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
153 					}
154 				}
155 			}
156 		}
157 	}
158 }
wgsim_print_mutref(const char * name,const kseq_t * ks,mutseq_t * hap1,mutseq_t * hap2)159 void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2)
160 {
161 	int i, j = 0; // j keeps the end of the last deletion
162 	char ref[50]="", alt[50]="";
163     char *source="wgsim";
164     char *type="indel";
165     char *strand="+";
166     char *zygo="hom"; //zygosity
167     int start=0, end=0;
168     char attr[500];
169 
170 	for (i = 0; i != ks->seq.l; ++i) {
171 		int c[3];
172 
173         zygo="hom";
174 		// GFF coordinates
175         start = end = i + 1;
176 		c[0] = nst_nt4_table[(int)ks->seq.s[i]];
177 		c[1] = hap1->s[i]; c[2] = hap2->s[i];
178 		if (c[0] >= 4) continue;
179 		alt[0] = ref[0]='\0';
180 
181 		if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
182 
183 			if (c[1] == c[2]) { // hom
184 				if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
185 				    strncat(ref, &"ACGTN"[c[0]], 1);
186 				    strncat(alt, &"ACGTN"[c[1]&0xf], 1);
187 				    type= "snp";
188 					//printf("%s\t%s\t%s\t%d\t%d\t.\t.\t.\tName=%s/%s;Ref=%s;Alt=%s\n", name, source, type, i+1, i+2, ref, alt, ref, alt);
189 				} else if ((c[1]&mutmsk) == DELETE) { // del
190 					if (i >= j) {
191 						//printf("%s\tsim\tdel\t%d\t%d\t", name, i+1, i+2);
192 						for (j = i; j < ks->seq.l && hap1->s[j] == hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j)
193 						    strncat(ref, &"ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]], 1);
194 							//putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
195 						strncat(alt, &"-"[0], 1);
196 						//printf("\t-\t-\n");
197 					}
198 				} else if (((c[1] & mutmsk) >> 12) <= 4) { // ins
199 					//printf("%s\twgsim\tins\t%d\t%d\t-\t", name, i+1, i+2);
200                     strncat(ref, &"-"[0], 1);
201                     int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
202                     while (n > 0) {
203                         strncat(alt, &"ACGTN"[ins & 0x3], 1);
204                         //putchar("ACGTN"[ins & 0x3]);
205 						ins >>= 2;
206                         n--;
207                     }
208                     //printf("\t-\n");
209 				} // else: deleted base in a long deletion
210 			} else { // het
211 			    zygo="het";
212 				if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
213 				    type="snp";
214 				    strncat(ref, &"ACGTN"[c[0]], 1);
215 				    strncat(alt, &"XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)], 1);
216 
217 					//printf("%s\twgsim\tsnp\t%d\t%c\t%c\t+\n", name, i+1, "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
218 				} else if ((c[1]&mutmsk) == DELETE) {
219 					if (i >= j) {
220 						//printf("%s\t%d\t", name, i+1);
221 						for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap1->s[j]&mutmsk) == DELETE; ++j)
222 							strncat(ref, &"ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]], 1);
223 							//putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
224 						strncat(alt, &"-"[0], 1);
225 						//printf("\t-\t-\n");
226 					}
227 				} else if ((c[2]&mutmsk) == DELETE) {
228 					if (i >= j) {
229 						//printf("%s\t%d\t", name, i+1);
230 						strncat(ref, &"-"[0], 1);
231 						for (j = i; j < ks->seq.l && hap1->s[j] != hap2->s[j] && (hap2->s[j]&mutmsk) == DELETE; ++j)
232 							strncat(alt, &"ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]], 1);
233 							//putchar("ACGTN"[nst_nt4_table[(int)ks->seq.s[j]]]);
234 						//printf("\t-\t-\n");
235 					}
236 				} else if (((c[1] & mutmsk) >> 12) <= 4 && ((c[1] & mutmsk) >> 12) > 0) { // ins1
237 					strncat(ref, &"-"[0], 1);
238 
239 					//printf("%s\t%d\t-\t", name, i+1);
240                     int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
241 
242                     while (n > 0) {
243                         strncat(alt, &"ACGTN"[ins & 0x3], 1);
244                         //putchar("ACGTN"[ins & 0x3]);
245 						ins >>= 2;
246                         n--;
247                     }
248                     //printf("\t+\n");
249 				} else if (((c[2] & mutmsk) >> 12) <= 4 || ((c[2] & mutmsk) >> 12) > 0) { // ins2
250 					//printf("%s\t%d\t-\t", name, i+1);
251 					strncat(ref, &"-"[0], 1);
252                     int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
253                     while (n > 0) {
254                         strncat(alt, &"ACGTN"[ins & 0x3], 1);
255                         //putchar("ACGTN"[ins & 0x3]);
256                         ins >>= 2;
257                         n--;
258                     }
259                     //printf("\t+\n");
260 				} // else: deleted base in a long deletion
261 			}
262 			snprintf(attr, 500, "Name=%s/%s;Ref=%s;Alt=%s;Type=%s",ref, alt, ref, alt, zygo);
263 			printf("%s\t%s\t%s\t%d\t%d\t.\t%s\t.\t%s\n",name, source, type, start, end, strand, attr);
264 		}
265 	}
266 }
267 
wgsim_core(FILE * fpout1,FILE * fpout2,const char * fn,int is_hap,uint64_t N,int dist,int std_dev,int size_l,int size_r,int is_fixed)268 void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r, int is_fixed)
269 {
270 	kseq_t *ks;
271     mutseq_t rseq[2];
272 	gzFile fp_fa;
273 	uint64_t tot_len, ii;
274 	int i, l, n_ref;
275 	char *qstr;
276 	int size[2], Q, max_size;
277 	uint8_t *tmp_seq[2];
278     mut_t *target;
279     uint64_t n_pairs;
280 
281 	l = size_l > size_r? size_l : size_r;
282 	qstr = (char*)calloc(l+1, 1);
283 	tmp_seq[0] = (uint8_t*)calloc(l+2, 1);
284 	tmp_seq[1] = (uint8_t*)calloc(l+2, 1);
285 	size[0] = size_l; size[1] = size_r;
286 	max_size = size_l > size_r? size_l : size_r;
287 
288 	Q = (ERR_RATE == 0.0)? 'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
289 
290 	fp_fa = gzopen(fn, "r");
291 	ks = kseq_init(fp_fa);
292 	tot_len = n_ref = 0;
293 	//fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__);
294 	while ((l = kseq_read(ks)) >= 0) {
295 		tot_len += l;
296 		++n_ref;
297 	}
298 	//fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len);
299 	kseq_destroy(ks);
300 	gzclose(fp_fa);
301 
302 	fp_fa = gzopen(fn, "r");
303 	ks = kseq_init(fp_fa);
304 
305 	if (is_fixed) {
306 	    //fprintf(stderr, "[%s] fixed mode, generating exactly %llu reads for each contig\n", __func__, N);
307 	} else {
308 	    //float cov = N * (size_l + size_r) / tot_len;
309 	    //fprintf(stderr, "[%s] generating %llu reads in total for an average coverage %0.1fx per contig\n",  __func__, N, cov);
310 	}
311 
312 	while ((l = kseq_read(ks)) >= 0) {
313 
314 		if (is_fixed) {
315 		    n_pairs = N;
316 		} else {
317 		    n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5);
318 		}
319 
320 		if (l < dist + 3 * std_dev) {
321 			fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev);
322 			continue;
323 		}
324 
325 		// generate mutations and print them out
326 		wgsim_mut_diref(ks, is_hap, rseq, rseq+1);
327 		wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1);
328 
329 		for (ii = 0; ii != n_pairs; ++ii) { // the core loop
330 			double ran;
331 			int d, pos, s[2], is_flip = 0;
332 			int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k;
333 			FILE *fpo[2];
334 
335 			do { // avoid boundary failure
336 				ran = ran_normal();
337 				ran = ran * std_dev + dist;
338 				d = (int)(ran + 0.5);
339 				d = d > max_size? d : max_size;
340 				pos = (int)((l - d + 1) * drand48());
341 			} while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l);
342 
343 			// flip or not
344 			if (drand48() < 0.5) {
345 				fpo[0] = fpout1; fpo[1] = fpout2;
346 				s[0] = size[0]; s[1] = size[1];
347 			} else {
348 				fpo[1] = fpout1; fpo[0] = fpout2;
349 				s[1] = size[0]; s[0] = size[1];
350 				is_flip = 1;
351 			}
352 
353 			// generate the read sequences
354 			target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated
355 			n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0;
356 
357 #define __gen_read(x, start, iter) do {									\
358 				for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) {	\
359 					int c = target[i], mut_type = c & mutmsk;			\
360 					if (ext_coor[x] < 0) {								\
361 						if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \
362 						ext_coor[x] = i;								\
363 					}													\
364 					if (mut_type == DELETE) ++n_indel[x];				\
365 					else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \
366 						tmp_seq[x][k++] = c & 0xf;						\
367 						if (mut_type == SUBSTITUTE) ++n_sub[x];			\
368 					} else {											\
369 						int n, ins;										\
370 						++n_indel[x];									\
371 						tmp_seq[x][k++] = c & 0xf;						\
372 						for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \
373 							tmp_seq[x][k++] = ins & 0x3;				\
374 					}													\
375 				}														\
376 				if (k != s[x]) ext_coor[x] = -10;						\
377 			} while (0)
378 
379 			__gen_read(0, pos, ++i);
380 			__gen_read(1, pos + d - 1, --i);
381 			for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement
382 			if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s)
383 				--ii;
384 				continue;
385 			}
386 
387 			// generate sequencing errors
388 			for (j = 0; j < 2; ++j) {
389 				int n_n = 0;
390 				for (i = 0; i < s[j]; ++i) {
391 					int c = tmp_seq[j][i];
392 					if (c >= 4) { // actually c should be never larger than 4 if everything is correct
393 						c = 4;
394 						++n_n;
395 					} else if (drand48() < ERR_RATE) {
396 						// c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors
397 						c = (c + 1) & 3; // recurrent sequencing errors
398 						++n_err[j];
399 					}
400 					tmp_seq[j][i] = c;
401 				}
402 				if ((double)n_n / s[j] > MAX_N_RATIO) break;
403 			}
404 			if (j < 2) { // too many ambiguous bases on one of the reads
405 				--ii;
406 				continue;
407 			}
408 
409 			// print
410 			for (j = 0; j < 2; ++j) {
411 				for (i = 0; i < s[j]; ++i) qstr[i] = Q;
412 				qstr[i] = 0;
413 				fprintf(fpo[j], "@%s|%u|%u|%d:%d:%d|%d:%d:%d|%llx\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1,
414 						n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1],
415 						(long long)ii);
416 				for (i = 0; i < s[j]; ++i)
417 					fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]);
418 				fprintf(fpo[j], "\n+\n%s\n", qstr);
419 			}
420 		}
421 		free(rseq[0].s); free(rseq[1].s);
422 	}
423 	kseq_destroy(ks);
424 	gzclose(fp_fa);
425 	free(qstr);
426 	free(tmp_seq[0]); free(tmp_seq[1]);
427 }
428 
simu_usage()429 static int simu_usage()
430 {
431 	fprintf(stderr, "\n");
432 	fprintf(stderr, "Program: pywgsim (short read simulator)\n");
433 	fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
434 	fprintf(stderr, "Original: https://github.com/lh3/wgsim\n");
435 	fprintf(stderr, "Modified: https://github.com/ialbert/pywgsim\n\n");
436 	fprintf(stderr, "Usage:   pywgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
437 	fprintf(stderr, "Options: -e FLOAT      base error rate [%.3f]\n", ERR_RATE);
438 	fprintf(stderr, "         -d INT        outer distance between the two ends [500]\n");
439 	fprintf(stderr, "         -s INT        standard deviation [50]\n");
440 	fprintf(stderr, "         -N INT        number of read pairs [10000]\n");
441 	fprintf(stderr, "         -1 INT        length of the first read [100]\n");
442 	fprintf(stderr, "         -2 INT        length of the second read [100]\n");
443 	fprintf(stderr, "         -r FLOAT      rate of mutations [%.4f]\n", MUT_RATE);
444 	fprintf(stderr, "         -R FLOAT      fraction of indels [%.2f]\n", INDEL_FRAC);
445 	fprintf(stderr, "         -X FLOAT      probability an indel is extended [%.2f]\n", INDEL_EXTEND);
446 	fprintf(stderr, "         -S INT        seed for random generator [-1]\n");
447 	fprintf(stderr, "         -A FLOAT      disgard if the fraction of ambiguous bases higher than FLOAT [%.2f]\n", MAX_N_RATIO);
448 	fprintf(stderr, "         -h            haploid mode\n");
449 	fprintf(stderr, "         -f            fix the number of read pairs to N per contig\n");
450 
451 	fprintf(stderr, "\n");
452 	return 1;
453 }
454 
455 /*
456 The Python wrapped function exposes all parameters as Python primitives.
457 */
wgsim_wrap(const char * r1,const char * r2,const char * ref,float err_rate,float mut_rate,float indel_frac,float indel_extend,float max_n,int is_hap,long N,int dist,int std_dev,int size_l,int size_r,int is_fixed,int seed)458 int wgsim_wrap(const char *r1, const char *r2, const char *ref, float err_rate, float mut_rate, float indel_frac, float indel_extend, float max_n, int is_hap, long N, int dist, int std_dev, int size_l, int size_r, int is_fixed, int seed)
459 {
460 	FILE *fpout1, *fpout2;
461 
462     // Set the global variables.
463     ERR_RATE = err_rate;
464     MUT_RATE = mut_rate;
465     INDEL_FRAC = indel_frac;
466     INDEL_EXTEND = indel_extend;
467     MAX_N_RATIO = max_n;
468 
469 	if (seed <= 0) seed = time(0)&0x7fffffff;
470 
471 	srand48(seed);
472 
473 	fpout1 = fopen(r1, "w");
474 	fpout2 = fopen(r2, "w");
475 
476 	if (!fpout1 || !fpout2) {
477 		fprintf(stderr, "[wgsim] file open error\n");
478 		return 1;
479 	}
480 
481     // Print GFF header and simulation paramteres
482     printf("##gff-version 3\n");
483     printf("#\n# N=%ld err=%0.4g mut=%0.4g frac=%0.4g ext=%0.4g dist=%d stdev=%d L1=%d L2=%d seed=%d\n#\n",\
484         N, err_rate, mut_rate, indel_frac, indel_extend, dist, std_dev, size_l, size_r, seed);
485 
486     // Call main function
487     wgsim_core(fpout1, fpout2, ref, is_hap, (uint64_t) N, dist, std_dev, size_l, size_r, is_fixed);
488 
489 	fclose(fpout1); fclose(fpout2);
490 
491     return 0;
492 }
493 
main(int argc,char * argv[])494 int main(int argc, char *argv[])
495 {
496 	int64_t N;
497 	int dist, std_dev, c, size_l, size_r, is_hap = 0, is_fixed=0;
498 	FILE *fpout1, *fpout2;
499 	int seed = -1;
500 
501 
502 	N = 10000; dist = 500; std_dev = 50;
503 	size_l = size_r = 100;
504 
505 	while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:fhX:S:A:")) >= 0) {
506 		switch (c) {
507 		case 'd': dist = atoi(optarg); break;
508 		case 's': std_dev = atoi(optarg); break;
509 		case 'N': N = atoi(optarg); break;
510 		case '1': size_l = atoi(optarg); break;
511 		case '2': size_r = atoi(optarg); break;
512 		case 'e': ERR_RATE = atof(optarg); break;
513 		case 'r': MUT_RATE = atof(optarg); break;
514 		case 'R': INDEL_FRAC = atof(optarg); break;
515 		case 'X': INDEL_EXTEND = atof(optarg); break;
516 		case 'A': MAX_N_RATIO = atof(optarg); break;
517 		case 'S': seed = atoi(optarg); break;
518 		case 'h': is_hap = 1; break;
519 		case 'f': is_fixed = 1; break;
520 		}
521 	}
522 	if (argc - optind < 3) return simu_usage();
523 	fpout1 = fopen(argv[optind+1], "w");
524 	fpout2 = fopen(argv[optind+2], "w");
525 	if (!fpout1 || !fpout2) {
526 		fprintf(stderr, "[wgsim] file open error\n");
527 		return 1;
528 	}
529 	if (seed <= 0) seed = time(0)&0x7fffffff;
530 	//fprintf(stderr, "[wgsim] seed = %d\n", seed);
531 	srand48(seed);
532 
533     printf("##gff-version 3\n");
534 
535 	wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r, is_fixed);
536 
537 	fclose(fpout1); fclose(fpout2);
538 	return 0;
539 }
540