1 #include <stdio.h>
2 #include <stdint.h>
3 #include <string.h>
4 #include <time.h>
5 #include <math.h>
6 #include <stdlib.h>
7 #include <unistd.h>
8 #include <ctype.h>
9 #include <assert.h>
10 
11 #define PACKAGE_VERSION "0.1.1"
12 
13 uint8_t nst_nt4_table[256] = {
14 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
15 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
16 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /*'-'*/, 4, 4,
17 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
18 	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
19 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
20 	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
21 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
22 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
23 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
24 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
25 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
26 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
27 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
28 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
29 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
30 };
31 
32 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
33 typedef unsigned short mut_t;
34 static mut_t mutmsk = (mut_t)0xf000;
35 
36 typedef struct {
37 	int l, m; /* length and maximum buffer size */
38 	unsigned char *s; /* sequence */
39 } seq_t;
40 
41 typedef struct {
42 	int l, m; /* length and maximum buffer size */
43 	mut_t *s; /* sequence */
44 } mutseq_t;
45 
46 typedef struct {
47 	uint64_t l, m;
48 	uint64_t *idx;
49 } idx_t;
50 
51 #define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
52 #define INIT_IDX(index) (index).idx = 0; (index).l = (index).m = 0
53 
54 static int SEQ_BLOCK_SIZE = 512;
55 
seq_set_block_size(int size)56 void seq_set_block_size(int size)
57 {
58 	SEQ_BLOCK_SIZE = size;
59 }
60 
seq_read_fasta(FILE * fp,seq_t * seq,char * locus,char * comment)61 int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
62 {
63 	int c, l, max;
64 	char *p;
65 
66 	c = 0;
67 	while (!feof(fp) && fgetc(fp) != '>');
68 	if (feof(fp)) return -1;
69 	p = locus;
70 	while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
71 		if (c != '\r') *p++ = c;
72 	*p = '\0';
73 	if (comment) {
74 		p = comment;
75 		if (c != '\n') {
76 			while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
77 			if (c != '\n') {
78 				*p++ = c;
79 				while (!feof(fp) && (c = fgetc(fp)) != '\n')
80 					if (c != '\r') *p++ = c;
81 			}
82 		}
83 		*p = '\0';
84 	} else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
85 	l = 0; max = seq->m;
86 	while (!feof(fp) && (c = fgetc(fp)) != '>') {
87 		if (isalpha(c) || c == '-' || c == '.') {
88 			if (l + 1 >= max) {
89 				max += SEQ_BLOCK_SIZE;
90 				seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
91 			}
92 			seq->s[l++] = (unsigned char)c;
93 		}
94 	}
95 	if (c == '>') ungetc(c,fp);
96 	seq->s[l] = 0;
97 	seq->m = max; seq->l = l;
98 	return l;
99 }
100 
101 /* Error-checking open, copied from utils.c */
102 
103 #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
104 
err_xopen_core(const char * func,const char * fn,const char * mode)105 FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
106 {
107 	FILE *fp = 0;
108 	if (strcmp(fn, "-") == 0)
109 		return (strstr(mode, "r"))? stdin : stdout;
110 	if ((fp = fopen(fn, mode)) == 0) {
111 		fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
112 		abort();
113 	}
114 	return fp;
115 }
116 
117 static double ERR_RATE = 0.02;
118 static double DEPTH = 10.0;
119 static double MUT_RATE = 0.001;
120 static double HOM_RATE = 0.0;
121 static double INDEL_FRAC = 0.1;
122 static double INDEL_EXTEND = 0.3;
123 
uc(unsigned char * s)124 void uc(unsigned char *s)
125 {
126 	while (*s) {
127 		*s = toupper(*s);
128 		s++;
129 	}
130 }
131 
132 
strindex(idx_t * index,unsigned char * s,unsigned char * t)133 int strindex(idx_t *index, unsigned char *s, unsigned char *t)
134 {
135 	uint64_t i, j, k;
136 	uint64_t l, max;
137 
138 	i = 0;
139 	while (t[i]) {
140 		if (nst_nt4_table[(int)t[i]] == 4) {
141 			//printf("here%c\n", *t);
142 			return -1;
143 		}
144 		i++;
145 	}
146 
147 	l = max = 0;
148 	for (i = 0; s[i] != '\0'; i++) {
149 		for (j=i, k=0; t[k]!='\0' && s[j]==t[k]; j++, k++)
150 			;
151 		if (k > 0 && t[k] == '\0') {
152 			if (l + 1 >= max) {
153 		  		max += SEQ_BLOCK_SIZE;
154 				index->idx = (uint64_t*)realloc(index->idx, sizeof(uint64_t) * max);
155 			}
156 			index->idx[l++] = i;
157 		}
158 	}
159 
160 	if (l) {
161 		index->l = l;
162 		index->m = max;
163 		index->idx[l] = -1;
164 		return l;
165 	}
166 	else
167 		return -1;
168 }
169 
170 /* Simple normal random number generator, copied from genran.c */
171 
ran_normal()172 double ran_normal()
173 {
174 	static int iset = 0;
175 	static double gset;
176 	double fac, rsq, v1, v2;
177 	if (iset == 0) {
178 		do {
179 			v1 = 2.0 * drand48() - 1.0;
180 			v2 = 2.0 * drand48() - 1.0;
181 			rsq = v1 * v1 + v2 * v2;
182 		} while (rsq >= 1.0 || rsq == 0.0);
183 		fac = sqrt(-2.0 * log(rsq) / rsq);
184 		gset = v1 * fac;
185 		iset = 1;
186 		return v2 * fac;
187 	} else {
188 		iset = 0;
189 		return gset;
190 	}
191 }
maq_mut_diref(const seq_t * seq,int is_hap,mutseq_t * hap1,mutseq_t * hap2)192 void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
193 {
194 	int i, deleting = 0;
195 	mutseq_t *ret[2];
196 
197 	ret[0] = hap1; ret[1] = hap2;
198 	ret[0]->l = seq->l; ret[1]->l = seq->l;
199 	ret[0]->m = seq->m; ret[1]->m = seq->m;
200 	ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
201 	ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
202 	for (i = 0; i != seq->l; ++i) {
203 		int c;
204 		c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
205         if (deleting) {
206             if (drand48() < INDEL_EXTEND) {
207                 if (deleting & 1) ret[0]->s[i] |= DELETE;
208                 if (deleting & 2) ret[1]->s[i] |= DELETE;
209                 continue;
210             } else deleting = 0;
211         }
212 		if (c < 4 && drand48() < MUT_RATE) { // mutation
213 			if (drand48() >= INDEL_FRAC) { // substitution
214 				double r = drand48();
215 				c = (c + (int)(r * 3.0 + 1)) & 3;
216 				if (is_hap || drand48() < HOM_RATE) { // hom
217 					ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
218 				} else { // het
219 					ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
220 				}
221 			} else { // indel
222 				if (drand48() < 0.5) { // deletion
223 					if (is_hap || drand48() < HOM_RATE) { // hom-del
224 						ret[0]->s[i] = ret[1]->s[i] = DELETE;
225                         deleting = 3;
226 					} else { // het-del
227                         deleting = drand48()<0.5?1:2;
228 						ret[deleting-1]->s[i] = DELETE;
229 					}
230 				} else { // insertion
231                     int num_ins = 0, ins = 0;
232                     do {
233                         num_ins++;
234                         ins = (ins << 2) | (int)(drand48() * 4.0);
235                     } while(num_ins < 4 && drand48() < INDEL_EXTEND);
236 
237 					if (is_hap || drand48() < HOM_RATE) { // hom-ins
238 						ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
239 					} else { // het-ins
240 						ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
241 					}
242 				}
243 			}
244 		}
245 	}
246 }
maq_print_mutref(const char * name,const seq_t * seq,mutseq_t * hap1,mutseq_t * hap2)247 void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
248 {
249 	int i;
250 	for (i = 0; i != seq->l; ++i) {
251 		int c[3];
252 		c[0] = nst_nt4_table[(int)seq->s[i]];
253 		c[1] = hap1->s[i]; c[2] = hap2->s[i];
254 		if (c[0] >= 4) continue;
255 		if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
256 			printf("%s\t%d\t", name, i+1);
257 			if (c[1] == c[2]) { // hom
258 				if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
259 					printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
260 				} else if ((c[1]&mutmsk) == DELETE) { // del
261 					printf("%c\t-\t-\n", "ACGTN"[c[0]]);
262 				} else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
263 					printf("-\t");
264                     int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
265                     while(n > 0) {
266                         putchar("ACGTN"[ins & 0x3]);
267                         n--;
268                     }
269                     printf("\t-\n");
270 				}  else assert(0);
271 			} else { // het
272 				if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
273 					printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
274 				} else if ((c[1]&mutmsk) == DELETE) {
275 					printf("%c\t-\t+\n", "ACGTN"[c[0]]);
276 				} else if ((c[2]&mutmsk) == DELETE) {
277 					printf("%c\t-\t+\n", "ACGTN"[c[0]]);
278 				} else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
279 					printf("-\t");
280                     int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
281                     while (n > 0) {
282                         putchar("ACGTN"[ins & 0x3]);
283                         n--;
284                     }
285                     printf("\t+\n");
286 				} else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
287 					printf("-\t");
288                     int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
289                     while (n > 0) {
290                         putchar("ACGTN"[ins & 0x3]);
291                         ins >>= 2;
292                         n--;
293                     }
294                     printf("\t+\n");
295 				} else assert(0);
296 			}
297 		}
298 	}
299 }
300 
301 
302 //wiki knuth method
poisson_num_gen(double lamda)303 int poisson_num_gen(double lamda)
304 {
305 	int k = 0;
306 	double L = exp(-lamda);
307 	double p = 1.0;
308 
309 	do {
310 		k++;
311 		p = p * drand48();
312 	} while (p > L);
313 
314 	return k-1;
315 }
316 
ezmsim_LR_core(FILE * fpout1,FILE * fpout2,FILE * fp_fa,int size_l,int size_r,unsigned char * cut,int pos)317 void ezmsim_LR_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int size_l, int size_r, unsigned char *cut, int pos)
318 {
319 	idx_t index;
320 	seq_t seq;
321 	uint64_t tot_len, dep;
322 	unsigned int i, k;
323 	int len, n_ref, j, size[2], Q, m;
324 	char name[256], *qstr;
325 	uint8_t *tmp_seq[2], c;
326 	uint64_t id;
327 
328 	INIT_SEQ(seq);
329 	INIT_IDX(index);
330 
331 	Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
332 
333 	srand48(time(0));
334 	seq_set_block_size(0x1000000);
335 	len = size_l > size_r?size_l:size_r;
336 	qstr = (char*)calloc(len+1, 1);
337 	tmp_seq[0] = (uint8_t*)calloc(len+2, 1);
338 	tmp_seq[1] = (uint8_t*)calloc(len+2, 1);
339 	size[0] = size_l; size[1] = size_r;
340 	tot_len = n_ref = 0; id = 0;
341 	while ((len = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
342 		uc(seq.s);
343 		uc(cut);
344 		if (strindex(&index, seq.s, cut) != -1)
345 			printf("chromsome %s has %llu digest sites\n", name, (unsigned long long)index.l);
346 
347 		for (i = 0; i < index.l; i++) {
348 			if (i - size_l <= 0 || i + size_r > (unsigned int)len)
349 				continue;
350 			dep = poisson_num_gen(DEPTH);
351 			for (k = 0; k < dep; k++) {
352 				FILE *fpo[2];
353 				int is_flip = 0, s[2];
354 				id++;
355 
356 				if (drand48() < 0.5) {
357 					fpo[0] = fpout1; fpo[1] = fpout2;
358 					s[0] = size[0]; s[1] = size[1];
359 				} else {
360 					fpo[1] = fpout1; fpo[0] = fpout2;
361 					s[1] = size[0]; s[0] = size[1];
362 					is_flip = 1;
363 				}
364 
365 				for (j = 0; j < s[0]; j++) {
366 					c = nst_nt4_table[(int)seq.s[index.idx[i]+pos-j-1]];
367 					if (c >= 4) c = 4;
368 					else if (drand48() < ERR_RATE) {
369 						c = (c + (int)(drand48()*3.0 + 1)) & 3;
370 					}
371 					tmp_seq[0][j] = c;
372 				}
373 
374 				for (j = 0; j < s[1]; j++) {
375 					c = nst_nt4_table[(int)seq.s[index.idx[i]+pos+j]];
376 					if (c >= 4) c = 4;
377 					else if (drand48() < ERR_RATE) {
378 						c = (c + (int)(drand48()*3.0 + 1)) & 3;
379 					}
380 					tmp_seq[1][j] = c < 4?3-c:4;
381 				}
382 
383 				for (m = 0; m < 2; m++) {
384 					fprintf(fpo[m], "@%s_%s_%llu_%llu/%d\n", name, cut, (unsigned long long)index.idx[i], (unsigned long long)id, m==0?is_flip+1:2-is_flip);
385 					for (j = 0; j < s[m]; j++) {
386 						qstr[j] = Q;
387 						fputc("ACGTN"[(int)tmp_seq[m][j]], fpo[m]);
388 					}
389 					qstr[j] = 0;
390 					fprintf(fpo[m], "\n+\n%s\n", qstr);
391 				}
392 			}
393 			//fprintf(stderr, "%llu ", index.idx[i]);
394 		}
395 		printf("\n");
396 
397 		tot_len += len;
398 		++n_ref;
399 	}
400 	fprintf(stderr, "-- %d sequences, total length: %llu\n", n_ref, (unsigned long long)tot_len);
401 	rewind(fp_fa);
402 
403 	free(seq.s);
404 	free(index.idx);
405 }
406 
LR_usage()407 int LR_usage() {
408 	fprintf(stderr, "\n");
409 	fprintf(stderr, "Program: ezmsim (simulate enzyme cut assembly sequences)\n");
410 	fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
411 	fprintf(stderr, "Contact: Zechen Chong <chongzechen@gmail.com>\n\n");
412 	fprintf(stderr, "Usage: ezmsim LR [options] <-z enzyme> <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
413 	fprintf(stderr, "Options: -e FLOAT    base error rate [%.3f]\n", ERR_RATE);
414 	fprintf(stderr, "         -D FLOAT    read depth [%.1f]\n", DEPTH);
415 	fprintf(stderr, "         -1 INT      length of the first read [100]\n");
416 	fprintf(stderr, "         -2 INT      length of the second read [100]\n");
417 	fprintf(stderr, "         -z STRING   enzyme sequence (must be specified)\n");
418 	fprintf(stderr, "         -p INT      enzyme cut position [length(enzyme)/2]\n");
419 	fprintf(stderr, "\n");
420 	return 1;
421 }
422 
LR_main(int argc,char * argv[])423 int LR_main(int argc, char *argv[])
424 {
425 	unsigned int size_l, size_r;
426 	FILE *fpout1, *fpout2, *fp_fa;
427 	char *cut = "";
428 
429 	int c, pos;
430 	pos = -1;
431 	//dist = 250;
432 	//std_dev = 20;
433 	size_l = size_r = 100;
434 	while ((c = getopt(argc, argv, "e:D:1:2:d:s:z:p:")) != -1) {
435 	switch (c) {
436 		case 'e': ERR_RATE = atof(optarg); break;
437 		case 'D': DEPTH = atof(optarg); break;
438 		case '1': size_l = atoi(optarg); break;
439 		case '2': size_r = atoi(optarg); break;
440 		case 'z': cut = strdup(optarg);
441 		case 'p': pos = atoi(optarg);
442 				  uc((unsigned char*)cut); break;
443 		//case 'd': dist = atoi(optarg); break;
444 		//case 's': std_dev = atoi(optarg); break;
445 		default:
446 				  //printf("unknown option: %c\n", optopt);
447 				  //return 1;
448 				  return LR_usage();
449 		}
450 	}
451 	if (strlen(cut)==0) {
452 		//fprintf(stderr, "parameter z (enzyme cut) must be specified\n");
453 		return LR_usage();
454 	}
455 	if (argc - optind < 3) {
456 		fprintf(stderr, "files must be specified\n");
457 		return LR_usage();
458 	}
459 	fp_fa = (strcmp(argv[optind+0], "-") == 0)?stdin:xopen(argv[optind+0], "r");
460 	fpout1 = xopen(argv[optind+1], "w");
461 	fpout2 = xopen(argv[optind+2], "w");
462 
463 	if (pos == -1) {
464 		pos = strlen(cut)/2;
465 	}
466 	ezmsim_LR_core(fpout1, fpout2, fp_fa, size_l, size_r, (unsigned char*)cut, pos);
467 
468 	fclose(fp_fa); fclose(fpout1); fclose(fpout2);
469 	return 0;
470 }
471 
ezmsim_EF_core(FILE * fpout1,FILE * fpout2,FILE * fp_fa,unsigned int size_l,unsigned int size_r,unsigned char * cut,int pos,int distance,int ovlp,int stp,int reverse,int is_hap)472 void ezmsim_EF_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, unsigned int size_l, unsigned int size_r, unsigned char *cut, int pos, int distance, int ovlp, int stp, int reverse, int is_hap)
473 {
474 	idx_t index;
475 	seq_t seq;
476 	uint64_t tot_len, dep, i, k;
477 	int len, n_ref, j, size[2], Q, m, n;
478 	char name[256], *qstr;
479 	uint8_t *tmp_seq[2], c;
480 	uint64_t id;
481 	int dist, overlap, step, rev;
482 	mutseq_t rseq[2];
483 	mut_t *target;
484 
485 	INIT_SEQ(seq);
486 	INIT_IDX(index);
487 
488 	Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
489 
490 	srand48(time(0));
491 	seq_set_block_size(0x1000000);
492 	len = size_l > size_r?size_l:size_r;
493 	qstr = (char*)calloc(len+1, 1);
494 	tmp_seq[0] = (uint8_t*)calloc(len+2, 1);
495 	tmp_seq[1] = (uint8_t*)calloc(len+2, 1);
496 	size[0] = size_l; size[1] = size_r;
497 	tot_len = n_ref = 0; id = 0;
498 	dist = distance; overlap = ovlp; step = stp; rev = reverse;
499 
500 	while ((len = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
501 		uc(seq.s);
502 		uc(cut);
503 		if (strindex(&index, seq.s, cut) != -1)
504 			printf("chromsome %s has %llu digest sites\n", name, (unsigned long long)index.l);
505 
506 		maq_mut_diref(&seq, is_hap, rseq, rseq+1);
507 		maq_print_mutref(name, &seq, rseq, rseq+1);
508 
509 		for (i = 0; i < index.l; i++) {
510 			if (len < (dist + overlap*step) * 2) {
511 				fprintf(stderr, "[ezmsim_core] skip sequence '%s' as it is shorter than %d!\n", name, (dist + overlap*step)*2);
512 				continue;
513 			}
514 
515 			for (n = 0; n < step; n++) {
516 				dep = poisson_num_gen(DEPTH);
517 				for (k = 0; k < dep; k++) {
518 					FILE *fpo[2];
519 					int is_flip = 0, s[2], d;
520 					id++;
521 
522 					d = dist + (int)(drand48()*overlap);
523 
524 					//if (drand48() < 0.5) {
525 					if (!rev) {
526 						fpo[0] = fpout1; fpo[1] = fpout2;
527 						s[0] = size[0]; s[1] = size[1];
528 					} else {
529 						fpo[1] = fpout1; fpo[0] = fpout2;
530 						s[1] = size[0]; s[0] = size[1];
531 						is_flip = 1;
532 					}
533 					//generate the read sequences
534 					target = rseq[drand48()<0.5?0:1].s;
535 					int ii, begin, end;
536 					for (ii = index.idx[i]+pos, j = 0, begin = 0; ii < seq.l && j < s[0]; ++ii) {
537 						int c = target[ii];
538 						int mut_type = c & mutmsk;
539 						if (mut_type == DELETE) continue; // deletion
540 						if (begin == 0) {
541 							begin = ii;
542 							if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) mut_type = NOCHANGE; // skip ins at the first base
543 						}
544 						if(mut_type == NOCHANGE || mut_type == SUBSTITUTE) {
545 							tmp_seq[0][j++] = c&0xf;
546 							continue;
547 						}
548 						int n = mut_type >> 12, ins = c >> 4;
549 						while (n > 0) {
550 							tmp_seq[0][j++] = ins & 0x3;
551 							ins >>= 2;
552 							n--;
553 							if ((int)k == s[0]) break;
554 						}
555 						tmp_seq[0][j++] = c&0xf;
556 					}
557 					for (ii = index.idx[i]+pos+d-1, j = 0, end = 0; ii>=0 && j < s[1];--ii) {
558 						int c = target[ii];
559 						if ((c&mutmsk) == DELETE) continue; // deletion
560 						if (end == 0) end = i;
561 						tmp_seq[1][j++] = c&0xf;
562 						if((c&mutmsk) == NOCHANGE || (c&mutmsk) == SUBSTITUTE) continue;
563 						int n = (c&mutmsk) >> 12, ins = c >> 4;
564 						while (n > 0) {
565 							if (j == s[1]) break;
566 							tmp_seq[1][j++] = ins & 0x3;
567 							ins >>= 2;
568 							n--;
569 						}
570 					}
571 					for (j = 0; j < s[0]; j++) {
572 						c = tmp_seq[0][j];
573 						//c = nst_nt4_table[(int)seq.s[index.idx[i]+pos+j]];
574 						if (c >= 4) c = 4;
575 						else if (drand48() < ERR_RATE) {
576 							c = (c + (int)(drand48()*3.0+1)) & 3;
577 						}
578 						tmp_seq[0][j] = c;
579 					}
580 
581 					for (j = 0; j < s[1]; j++) {
582 						c = tmp_seq[1][j];
583 						//c = nst_nt4_table[(int)seq.s[index.idx[i]+pos+d-j]];
584 						if (c >= 4) c = 4;
585 						else if (drand48() < ERR_RATE) {
586 							c = (c + (int)(drand48()*3.0 + 1)) & 3;
587 						}
588 						tmp_seq[1][j] = c<4?3-c:4;
589 					}
590 
591 					for (m = 0; m < 2; m++) {
592 						fprintf(fpo[m], "@%s_%s_%llu_%llu_%d/%d\n", name, cut, (unsigned long long)index.idx[i], (unsigned long long)id, d, m==0?is_flip+1:2-is_flip);
593 						for (j = 0; j < s[m]; j++) {
594 							qstr[j] = Q;
595 							fputc("ACGTN"[(int)tmp_seq[m][j]], fpo[m]);
596 						}
597 						qstr[j] = 0;
598 						fprintf(fpo[m], "\n+\n%s\n", qstr);
599 					}
600 
601 				}
602 				/*
603 				dep = poisson_num_gen(DEPTH);
604 				for (k = 0; k < dep; k++) {
605 					FILE *fpo[2];
606 					int is_flip = 0, s[2], d;
607 					id++;
608 
609 					d = dist + (int)(drand48()*step);
610 
611 					//if (drand48() < 0.5) {
612 					if (!rev) {
613 						fpo[0] = fpout1; fpo[1] = fpout2;
614 						s[0] = size[0]; s[1] = size[1];
615 					} else {
616 						fpo[1] = fpout1; fpo[0] = fpout2;
617 						s[1] = size[0]; s[0] = size[1];
618 						is_flip = 1;
619 					}
620 
621 					for (j = 0; j < s[0]; j++) {
622 						c = nst_nt4_table[(int)seq.s[index.idx[i]+pos-j-1]];
623 						if (c >= 4) c = 4;
624 						else if (drand48() < ERR_RATE) {
625 							c = (c + (int)(drand48()*3.0+1)) & 3;
626 						}
627 						tmp_seq[0][j] = c;
628 					}
629 
630 					for (j = 0; j < s[1]; j++) {
631 						c = nst_nt4_table[(int)seq.s[index.idx[i]+pos-d+j]];
632 						if (c >= 4) c = 4;
633 						else if (drand48() < ERR_RATE) {
634 							c = (c + (int)(drand48()*3.0 + 1)) & 3;
635 						}
636 						tmp_seq[1][j] = c<4?3-c:4;
637 					}
638 
639 					for (m = 0; m < 2; m++) {
640 						fprintf(fpo[m], "@%s_%s_%lld_%lld_%d/%d\n", name, cut, index.idx[i], id, d, m==0?is_flip+1:2-is_flip);
641 						for (j = 0; j < s[m]; j++) {
642 							qstr[j] = Q;
643 							fputc("ACGTN"[(int)tmp_seq[m][j]], fpo[m]);
644 						}
645 						qstr[j] = 0;
646 						fprintf(fpo[m], "\n+\n%s\n", qstr);
647 					}
648 				}*/
649 				dist += overlap;
650 			}
651 			dist = distance;
652 		}
653 		tot_len += len;
654 		++n_ref;
655 	}
656 	fprintf(stderr, "-- %d sequences, total length: %llu\n", n_ref, (unsigned long long)tot_len);
657 
658 	free(seq.s);
659 	free(index.idx);
660 }
661 
EF_usage()662 int EF_usage()
663 {
664 	fprintf(stderr, "\n");
665 	fprintf(stderr, "Program: ezmsim (simulate enzyme cut assembly sequences)\n");
666 	fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
667 	fprintf(stderr, "Contact: Zechen Chong <chongzechen@gmail.com>\n\n");
668 	fprintf(stderr, "Usage: ezmsim RAD [options] <-z enzyme> <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
669 	fprintf(stderr, "Options: -e FLOAT    base error rate [%.3f]\n", ERR_RATE);
670 	fprintf(stderr, "         -D FLOAT    read depth [%.1f]\n", DEPTH);
671 	fprintf(stderr, "         -1 INT      length of the first read [100]\n");
672 	fprintf(stderr, "         -2 INT      length of the second read [100]\n");
673 	fprintf(stderr, "         -z STRING   enzyme sequence (must be specified)\n");
674 	fprintf(stderr, "         -p INT      enzyme cut position [length(enzyme)/2]\n");
675 	fprintf(stderr, "         -d INT      initial insert size distance [120]\n");
676 	//fprintf(stderr, "         -s INT      standard deviation of insert size [20]\n");
677 	fprintf(stderr, "         -o INT      insert size overlap distance [50]\n");
678 	fprintf(stderr, "         -t INT      elongation steps of insert size [10]\n");
679 	fprintf(stderr, "         -h FLOAT    rate of homozygosity[%.4f]\n", HOM_RATE);
680 	fprintf(stderr, "         -m FLOAT    rate of mutation[%.4f]\n", MUT_RATE);
681 	fprintf(stderr, "         -R FLOAT    fraction of indels [%.2f]\n", INDEL_FRAC);
682 	fprintf(stderr, "         -X FLOAT    probability an indel is extended [%.2f]\n", INDEL_EXTEND);
683 	fprintf(stderr, "         -r          reverse or not [forward only]\n");
684 	fprintf(stderr, "         -H          haploid mode\n");
685 	fprintf(stderr, "\n");
686 	return 1;
687 }
688 
EF_main(int argc,char * argv[])689 int EF_main(int argc, char *argv[])
690 {
691 	int c, size_l, size_r, pos, dist, overlap, step, rev, is_hap = 0;
692 	FILE *fpout1, *fpout2, *fp_fa;
693 	char *cut = "";
694 
695 	pos = -1;
696 	dist = 120;  //initial distance
697 	//std_dev = 20;
698 	overlap = 50;
699 	size_l = size_r = 100;
700 	step = 10; rev = 0;
701 	while ((c = getopt(argc, argv, "e:D:1:2:d:s:z:p:o:t:R:rh:Hm:")) != -1) {
702 	switch (c) {
703 		case 'e': ERR_RATE = atof(optarg); break;
704 		case 'D': DEPTH = atof(optarg); break;
705 		case '1': size_l = atoi(optarg); break;
706 		case '2': size_r = atoi(optarg); break;
707 		case 'z': cut = strdup(optarg);
708 		case 'p': pos = atoi(optarg);
709 				  uc((unsigned char*)cut); break;
710 		case 'd': dist = atoi(optarg); break;
711 		//case 's': std_dev = atoi(optarg); break;
712 		case 'o': overlap = atoi(optarg); break;
713 		case 't': step = atoi(optarg); break;
714 		case 'r': rev = 1; break;
715 		case 'h': HOM_RATE = atof(optarg); break;
716 		case 'H': is_hap = 1; break;
717 		case 'm': MUT_RATE = atof(optarg); break;
718 		case 'R': INDEL_FRAC = atof(optarg); break;
719 		case 'X': INDEL_EXTEND = atof(optarg); break;
720 		default:
721 				  //printf("unknown option: %c\n", optopt);
722 				  //return 1;
723 				  return EF_usage();
724 		}
725 	}
726 	if (strlen(cut)==0) {
727 		//fprintf(stderr, "parameter z (enzyme cut) must be specified\n");
728 		return EF_usage();
729 	}
730 	if (argc - optind < 3) {
731 		fprintf(stderr, "files must be specified\n");
732 		return EF_usage();
733 	}
734 	fp_fa = (strcmp(argv[optind+0], "-") == 0)?stdin:xopen(argv[optind+0], "r");
735 	fpout1 = xopen(argv[optind+1], "w");
736 	fpout2 = xopen(argv[optind+2], "w");
737 
738 	if (pos == -1) {
739 		pos = strlen(cut)/2;
740 	}
741 	ezmsim_EF_core(fpout1, fpout2, fp_fa, size_l, size_r, (unsigned char*)cut, pos, dist, overlap, step, rev, is_hap);
742 
743 	fclose(fp_fa); fclose(fpout1); fclose(fpout2);
744 	return 0;
745 }
746 
usage()747 int usage()
748 {
749 	fprintf(stderr, "\n");
750 	fprintf(stderr, "Program: ezmsim (simulate enzyme cut assembly sequences)\n");
751 	fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
752 	fprintf(stderr, "Contact: Zechen Chong <chongzechen@gmail.com>\n\n");
753 	fprintf(stderr, "Usage: ezmsim <LR|EF> [options]\n\n");
754 	fprintf(stderr, "Options: LR          simulate LR reads\n");
755 	fprintf(stderr, "         RAD          simulate RAD reads\n");
756 	fprintf(stderr, "\n");
757 	return 1;
758 }
759 
main(int argc,char * argv[])760 int main (int argc, char *argv[])
761 {
762 	if (argc < 2) return usage();
763 	if (strcmp(argv[1], "LR") == 0) return LR_main(argc-1, argv+1);
764 	else if (strcmp(argv[1], "RAD") == 0) return EF_main(argc-1, argv+1);
765 	else {
766 		fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
767 		return 1;
768 	}
769 	return 0;
770 }
771