1 #include <stdio.h>
2 #include <stdint.h>
3 #include <string.h>
4 #include <time.h>
5 #include <math.h>
6 #include <stdlib.h>
7 #include <unistd.h>
8 #include <ctype.h>
9 #include <assert.h>
10
11 #define PACKAGE_VERSION "0.1.1"
12
13 uint8_t nst_nt4_table[256] = {
14 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
15 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
16 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
17 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
18 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
19 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
20 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
21 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
22 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
23 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
24 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
25 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
26 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
27 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
28 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
29 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
30 };
31
32 enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000};
33 typedef unsigned short mut_t;
34 static mut_t mutmsk = (mut_t)0xf000;
35
36 typedef struct {
37 int l, m; /* length and maximum buffer size */
38 unsigned char *s; /* sequence */
39 } seq_t;
40
41 typedef struct {
42 int l, m; /* length and maximum buffer size */
43 mut_t *s; /* sequence */
44 } mutseq_t;
45
46 typedef struct {
47 uint64_t l, m;
48 uint64_t *idx;
49 } idx_t;
50
51 #define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0
52 #define INIT_IDX(index) (index).idx = 0; (index).l = (index).m = 0
53
54 static int SEQ_BLOCK_SIZE = 512;
55
seq_set_block_size(int size)56 void seq_set_block_size(int size)
57 {
58 SEQ_BLOCK_SIZE = size;
59 }
60
seq_read_fasta(FILE * fp,seq_t * seq,char * locus,char * comment)61 int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment)
62 {
63 int c, l, max;
64 char *p;
65
66 c = 0;
67 while (!feof(fp) && fgetc(fp) != '>');
68 if (feof(fp)) return -1;
69 p = locus;
70 while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n')
71 if (c != '\r') *p++ = c;
72 *p = '\0';
73 if (comment) {
74 p = comment;
75 if (c != '\n') {
76 while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t'));
77 if (c != '\n') {
78 *p++ = c;
79 while (!feof(fp) && (c = fgetc(fp)) != '\n')
80 if (c != '\r') *p++ = c;
81 }
82 }
83 *p = '\0';
84 } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n');
85 l = 0; max = seq->m;
86 while (!feof(fp) && (c = fgetc(fp)) != '>') {
87 if (isalpha(c) || c == '-' || c == '.') {
88 if (l + 1 >= max) {
89 max += SEQ_BLOCK_SIZE;
90 seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max);
91 }
92 seq->s[l++] = (unsigned char)c;
93 }
94 }
95 if (c == '>') ungetc(c,fp);
96 seq->s[l] = 0;
97 seq->m = max; seq->l = l;
98 return l;
99 }
100
101 /* Error-checking open, copied from utils.c */
102
103 #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
104
err_xopen_core(const char * func,const char * fn,const char * mode)105 FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
106 {
107 FILE *fp = 0;
108 if (strcmp(fn, "-") == 0)
109 return (strstr(mode, "r"))? stdin : stdout;
110 if ((fp = fopen(fn, mode)) == 0) {
111 fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn);
112 abort();
113 }
114 return fp;
115 }
116
117 static double ERR_RATE = 0.02;
118 static double DEPTH = 10.0;
119 static double MUT_RATE = 0.001;
120 static double HOM_RATE = 0.0;
121 static double INDEL_FRAC = 0.1;
122 static double INDEL_EXTEND = 0.3;
123
uc(unsigned char * s)124 void uc(unsigned char *s)
125 {
126 while (*s) {
127 *s = toupper(*s);
128 s++;
129 }
130 }
131
132
strindex(idx_t * index,unsigned char * s,unsigned char * t)133 int strindex(idx_t *index, unsigned char *s, unsigned char *t)
134 {
135 uint64_t i, j, k;
136 uint64_t l, max;
137
138 i = 0;
139 while (t[i]) {
140 if (nst_nt4_table[(int)t[i]] == 4) {
141 //printf("here%c\n", *t);
142 return -1;
143 }
144 i++;
145 }
146
147 l = max = 0;
148 for (i = 0; s[i] != '\0'; i++) {
149 for (j=i, k=0; t[k]!='\0' && s[j]==t[k]; j++, k++)
150 ;
151 if (k > 0 && t[k] == '\0') {
152 if (l + 1 >= max) {
153 max += SEQ_BLOCK_SIZE;
154 index->idx = (uint64_t*)realloc(index->idx, sizeof(uint64_t) * max);
155 }
156 index->idx[l++] = i;
157 }
158 }
159
160 if (l) {
161 index->l = l;
162 index->m = max;
163 index->idx[l] = -1;
164 return l;
165 }
166 else
167 return -1;
168 }
169
170 /* Simple normal random number generator, copied from genran.c */
171
ran_normal()172 double ran_normal()
173 {
174 static int iset = 0;
175 static double gset;
176 double fac, rsq, v1, v2;
177 if (iset == 0) {
178 do {
179 v1 = 2.0 * drand48() - 1.0;
180 v2 = 2.0 * drand48() - 1.0;
181 rsq = v1 * v1 + v2 * v2;
182 } while (rsq >= 1.0 || rsq == 0.0);
183 fac = sqrt(-2.0 * log(rsq) / rsq);
184 gset = v1 * fac;
185 iset = 1;
186 return v2 * fac;
187 } else {
188 iset = 0;
189 return gset;
190 }
191 }
maq_mut_diref(const seq_t * seq,int is_hap,mutseq_t * hap1,mutseq_t * hap2)192 void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2)
193 {
194 int i, deleting = 0;
195 mutseq_t *ret[2];
196
197 ret[0] = hap1; ret[1] = hap2;
198 ret[0]->l = seq->l; ret[1]->l = seq->l;
199 ret[0]->m = seq->m; ret[1]->m = seq->m;
200 ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
201 ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t));
202 for (i = 0; i != seq->l; ++i) {
203 int c;
204 c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]];
205 if (deleting) {
206 if (drand48() < INDEL_EXTEND) {
207 if (deleting & 1) ret[0]->s[i] |= DELETE;
208 if (deleting & 2) ret[1]->s[i] |= DELETE;
209 continue;
210 } else deleting = 0;
211 }
212 if (c < 4 && drand48() < MUT_RATE) { // mutation
213 if (drand48() >= INDEL_FRAC) { // substitution
214 double r = drand48();
215 c = (c + (int)(r * 3.0 + 1)) & 3;
216 if (is_hap || drand48() < HOM_RATE) { // hom
217 ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c;
218 } else { // het
219 ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c;
220 }
221 } else { // indel
222 if (drand48() < 0.5) { // deletion
223 if (is_hap || drand48() < HOM_RATE) { // hom-del
224 ret[0]->s[i] = ret[1]->s[i] = DELETE;
225 deleting = 3;
226 } else { // het-del
227 deleting = drand48()<0.5?1:2;
228 ret[deleting-1]->s[i] = DELETE;
229 }
230 } else { // insertion
231 int num_ins = 0, ins = 0;
232 do {
233 num_ins++;
234 ins = (ins << 2) | (int)(drand48() * 4.0);
235 } while(num_ins < 4 && drand48() < INDEL_EXTEND);
236
237 if (is_hap || drand48() < HOM_RATE) { // hom-ins
238 ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c;
239 } else { // het-ins
240 ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c;
241 }
242 }
243 }
244 }
245 }
246 }
maq_print_mutref(const char * name,const seq_t * seq,mutseq_t * hap1,mutseq_t * hap2)247 void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2)
248 {
249 int i;
250 for (i = 0; i != seq->l; ++i) {
251 int c[3];
252 c[0] = nst_nt4_table[(int)seq->s[i]];
253 c[1] = hap1->s[i]; c[2] = hap2->s[i];
254 if (c[0] >= 4) continue;
255 if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) {
256 printf("%s\t%d\t", name, i+1);
257 if (c[1] == c[2]) { // hom
258 if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution
259 printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]);
260 } else if ((c[1]&mutmsk) == DELETE) { // del
261 printf("%c\t-\t-\n", "ACGTN"[c[0]]);
262 } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins
263 printf("-\t");
264 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
265 while(n > 0) {
266 putchar("ACGTN"[ins & 0x3]);
267 n--;
268 }
269 printf("\t-\n");
270 } else assert(0);
271 } else { // het
272 if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution
273 printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]);
274 } else if ((c[1]&mutmsk) == DELETE) {
275 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
276 } else if ((c[2]&mutmsk) == DELETE) {
277 printf("%c\t-\t+\n", "ACGTN"[c[0]]);
278 } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1
279 printf("-\t");
280 int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4;
281 while (n > 0) {
282 putchar("ACGTN"[ins & 0x3]);
283 n--;
284 }
285 printf("\t+\n");
286 } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2
287 printf("-\t");
288 int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4;
289 while (n > 0) {
290 putchar("ACGTN"[ins & 0x3]);
291 ins >>= 2;
292 n--;
293 }
294 printf("\t+\n");
295 } else assert(0);
296 }
297 }
298 }
299 }
300
301
302 //wiki knuth method
poisson_num_gen(double lamda)303 int poisson_num_gen(double lamda)
304 {
305 int k = 0;
306 double L = exp(-lamda);
307 double p = 1.0;
308
309 do {
310 k++;
311 p = p * drand48();
312 } while (p > L);
313
314 return k-1;
315 }
316
ezmsim_LR_core(FILE * fpout1,FILE * fpout2,FILE * fp_fa,int size_l,int size_r,unsigned char * cut,int pos)317 void ezmsim_LR_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int size_l, int size_r, unsigned char *cut, int pos)
318 {
319 idx_t index;
320 seq_t seq;
321 uint64_t tot_len, dep;
322 unsigned int i, k;
323 int len, n_ref, j, size[2], Q, m;
324 char name[256], *qstr;
325 uint8_t *tmp_seq[2], c;
326 uint64_t id;
327
328 INIT_SEQ(seq);
329 INIT_IDX(index);
330
331 Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
332
333 srand48(time(0));
334 seq_set_block_size(0x1000000);
335 len = size_l > size_r?size_l:size_r;
336 qstr = (char*)calloc(len+1, 1);
337 tmp_seq[0] = (uint8_t*)calloc(len+2, 1);
338 tmp_seq[1] = (uint8_t*)calloc(len+2, 1);
339 size[0] = size_l; size[1] = size_r;
340 tot_len = n_ref = 0; id = 0;
341 while ((len = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
342 uc(seq.s);
343 uc(cut);
344 if (strindex(&index, seq.s, cut) != -1)
345 printf("chromsome %s has %llu digest sites\n", name, (unsigned long long)index.l);
346
347 for (i = 0; i < index.l; i++) {
348 if (i - size_l <= 0 || i + size_r > (unsigned int)len)
349 continue;
350 dep = poisson_num_gen(DEPTH);
351 for (k = 0; k < dep; k++) {
352 FILE *fpo[2];
353 int is_flip = 0, s[2];
354 id++;
355
356 if (drand48() < 0.5) {
357 fpo[0] = fpout1; fpo[1] = fpout2;
358 s[0] = size[0]; s[1] = size[1];
359 } else {
360 fpo[1] = fpout1; fpo[0] = fpout2;
361 s[1] = size[0]; s[0] = size[1];
362 is_flip = 1;
363 }
364
365 for (j = 0; j < s[0]; j++) {
366 c = nst_nt4_table[(int)seq.s[index.idx[i]+pos-j-1]];
367 if (c >= 4) c = 4;
368 else if (drand48() < ERR_RATE) {
369 c = (c + (int)(drand48()*3.0 + 1)) & 3;
370 }
371 tmp_seq[0][j] = c;
372 }
373
374 for (j = 0; j < s[1]; j++) {
375 c = nst_nt4_table[(int)seq.s[index.idx[i]+pos+j]];
376 if (c >= 4) c = 4;
377 else if (drand48() < ERR_RATE) {
378 c = (c + (int)(drand48()*3.0 + 1)) & 3;
379 }
380 tmp_seq[1][j] = c < 4?3-c:4;
381 }
382
383 for (m = 0; m < 2; m++) {
384 fprintf(fpo[m], "@%s_%s_%llu_%llu/%d\n", name, cut, (unsigned long long)index.idx[i], (unsigned long long)id, m==0?is_flip+1:2-is_flip);
385 for (j = 0; j < s[m]; j++) {
386 qstr[j] = Q;
387 fputc("ACGTN"[(int)tmp_seq[m][j]], fpo[m]);
388 }
389 qstr[j] = 0;
390 fprintf(fpo[m], "\n+\n%s\n", qstr);
391 }
392 }
393 //fprintf(stderr, "%llu ", index.idx[i]);
394 }
395 printf("\n");
396
397 tot_len += len;
398 ++n_ref;
399 }
400 fprintf(stderr, "-- %d sequences, total length: %llu\n", n_ref, (unsigned long long)tot_len);
401 rewind(fp_fa);
402
403 free(seq.s);
404 free(index.idx);
405 }
406
LR_usage()407 int LR_usage() {
408 fprintf(stderr, "\n");
409 fprintf(stderr, "Program: ezmsim (simulate enzyme cut assembly sequences)\n");
410 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
411 fprintf(stderr, "Contact: Zechen Chong <chongzechen@gmail.com>\n\n");
412 fprintf(stderr, "Usage: ezmsim LR [options] <-z enzyme> <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
413 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
414 fprintf(stderr, " -D FLOAT read depth [%.1f]\n", DEPTH);
415 fprintf(stderr, " -1 INT length of the first read [100]\n");
416 fprintf(stderr, " -2 INT length of the second read [100]\n");
417 fprintf(stderr, " -z STRING enzyme sequence (must be specified)\n");
418 fprintf(stderr, " -p INT enzyme cut position [length(enzyme)/2]\n");
419 fprintf(stderr, "\n");
420 return 1;
421 }
422
LR_main(int argc,char * argv[])423 int LR_main(int argc, char *argv[])
424 {
425 unsigned int size_l, size_r;
426 FILE *fpout1, *fpout2, *fp_fa;
427 char *cut = "";
428
429 int c, pos;
430 pos = -1;
431 //dist = 250;
432 //std_dev = 20;
433 size_l = size_r = 100;
434 while ((c = getopt(argc, argv, "e:D:1:2:d:s:z:p:")) != -1) {
435 switch (c) {
436 case 'e': ERR_RATE = atof(optarg); break;
437 case 'D': DEPTH = atof(optarg); break;
438 case '1': size_l = atoi(optarg); break;
439 case '2': size_r = atoi(optarg); break;
440 case 'z': cut = strdup(optarg);
441 case 'p': pos = atoi(optarg);
442 uc((unsigned char*)cut); break;
443 //case 'd': dist = atoi(optarg); break;
444 //case 's': std_dev = atoi(optarg); break;
445 default:
446 //printf("unknown option: %c\n", optopt);
447 //return 1;
448 return LR_usage();
449 }
450 }
451 if (strlen(cut)==0) {
452 //fprintf(stderr, "parameter z (enzyme cut) must be specified\n");
453 return LR_usage();
454 }
455 if (argc - optind < 3) {
456 fprintf(stderr, "files must be specified\n");
457 return LR_usage();
458 }
459 fp_fa = (strcmp(argv[optind+0], "-") == 0)?stdin:xopen(argv[optind+0], "r");
460 fpout1 = xopen(argv[optind+1], "w");
461 fpout2 = xopen(argv[optind+2], "w");
462
463 if (pos == -1) {
464 pos = strlen(cut)/2;
465 }
466 ezmsim_LR_core(fpout1, fpout2, fp_fa, size_l, size_r, (unsigned char*)cut, pos);
467
468 fclose(fp_fa); fclose(fpout1); fclose(fpout2);
469 return 0;
470 }
471
ezmsim_EF_core(FILE * fpout1,FILE * fpout2,FILE * fp_fa,unsigned int size_l,unsigned int size_r,unsigned char * cut,int pos,int distance,int ovlp,int stp,int reverse,int is_hap)472 void ezmsim_EF_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, unsigned int size_l, unsigned int size_r, unsigned char *cut, int pos, int distance, int ovlp, int stp, int reverse, int is_hap)
473 {
474 idx_t index;
475 seq_t seq;
476 uint64_t tot_len, dep, i, k;
477 int len, n_ref, j, size[2], Q, m, n;
478 char name[256], *qstr;
479 uint8_t *tmp_seq[2], c;
480 uint64_t id;
481 int dist, overlap, step, rev;
482 mutseq_t rseq[2];
483 mut_t *target;
484
485 INIT_SEQ(seq);
486 INIT_IDX(index);
487
488 Q = (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33;
489
490 srand48(time(0));
491 seq_set_block_size(0x1000000);
492 len = size_l > size_r?size_l:size_r;
493 qstr = (char*)calloc(len+1, 1);
494 tmp_seq[0] = (uint8_t*)calloc(len+2, 1);
495 tmp_seq[1] = (uint8_t*)calloc(len+2, 1);
496 size[0] = size_l; size[1] = size_r;
497 tot_len = n_ref = 0; id = 0;
498 dist = distance; overlap = ovlp; step = stp; rev = reverse;
499
500 while ((len = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) {
501 uc(seq.s);
502 uc(cut);
503 if (strindex(&index, seq.s, cut) != -1)
504 printf("chromsome %s has %llu digest sites\n", name, (unsigned long long)index.l);
505
506 maq_mut_diref(&seq, is_hap, rseq, rseq+1);
507 maq_print_mutref(name, &seq, rseq, rseq+1);
508
509 for (i = 0; i < index.l; i++) {
510 if (len < (dist + overlap*step) * 2) {
511 fprintf(stderr, "[ezmsim_core] skip sequence '%s' as it is shorter than %d!\n", name, (dist + overlap*step)*2);
512 continue;
513 }
514
515 for (n = 0; n < step; n++) {
516 dep = poisson_num_gen(DEPTH);
517 for (k = 0; k < dep; k++) {
518 FILE *fpo[2];
519 int is_flip = 0, s[2], d;
520 id++;
521
522 d = dist + (int)(drand48()*overlap);
523
524 //if (drand48() < 0.5) {
525 if (!rev) {
526 fpo[0] = fpout1; fpo[1] = fpout2;
527 s[0] = size[0]; s[1] = size[1];
528 } else {
529 fpo[1] = fpout1; fpo[0] = fpout2;
530 s[1] = size[0]; s[0] = size[1];
531 is_flip = 1;
532 }
533 //generate the read sequences
534 target = rseq[drand48()<0.5?0:1].s;
535 int ii, begin, end;
536 for (ii = index.idx[i]+pos, j = 0, begin = 0; ii < seq.l && j < s[0]; ++ii) {
537 int c = target[ii];
538 int mut_type = c & mutmsk;
539 if (mut_type == DELETE) continue; // deletion
540 if (begin == 0) {
541 begin = ii;
542 if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) mut_type = NOCHANGE; // skip ins at the first base
543 }
544 if(mut_type == NOCHANGE || mut_type == SUBSTITUTE) {
545 tmp_seq[0][j++] = c&0xf;
546 continue;
547 }
548 int n = mut_type >> 12, ins = c >> 4;
549 while (n > 0) {
550 tmp_seq[0][j++] = ins & 0x3;
551 ins >>= 2;
552 n--;
553 if ((int)k == s[0]) break;
554 }
555 tmp_seq[0][j++] = c&0xf;
556 }
557 for (ii = index.idx[i]+pos+d-1, j = 0, end = 0; ii>=0 && j < s[1];--ii) {
558 int c = target[ii];
559 if ((c&mutmsk) == DELETE) continue; // deletion
560 if (end == 0) end = i;
561 tmp_seq[1][j++] = c&0xf;
562 if((c&mutmsk) == NOCHANGE || (c&mutmsk) == SUBSTITUTE) continue;
563 int n = (c&mutmsk) >> 12, ins = c >> 4;
564 while (n > 0) {
565 if (j == s[1]) break;
566 tmp_seq[1][j++] = ins & 0x3;
567 ins >>= 2;
568 n--;
569 }
570 }
571 for (j = 0; j < s[0]; j++) {
572 c = tmp_seq[0][j];
573 //c = nst_nt4_table[(int)seq.s[index.idx[i]+pos+j]];
574 if (c >= 4) c = 4;
575 else if (drand48() < ERR_RATE) {
576 c = (c + (int)(drand48()*3.0+1)) & 3;
577 }
578 tmp_seq[0][j] = c;
579 }
580
581 for (j = 0; j < s[1]; j++) {
582 c = tmp_seq[1][j];
583 //c = nst_nt4_table[(int)seq.s[index.idx[i]+pos+d-j]];
584 if (c >= 4) c = 4;
585 else if (drand48() < ERR_RATE) {
586 c = (c + (int)(drand48()*3.0 + 1)) & 3;
587 }
588 tmp_seq[1][j] = c<4?3-c:4;
589 }
590
591 for (m = 0; m < 2; m++) {
592 fprintf(fpo[m], "@%s_%s_%llu_%llu_%d/%d\n", name, cut, (unsigned long long)index.idx[i], (unsigned long long)id, d, m==0?is_flip+1:2-is_flip);
593 for (j = 0; j < s[m]; j++) {
594 qstr[j] = Q;
595 fputc("ACGTN"[(int)tmp_seq[m][j]], fpo[m]);
596 }
597 qstr[j] = 0;
598 fprintf(fpo[m], "\n+\n%s\n", qstr);
599 }
600
601 }
602 /*
603 dep = poisson_num_gen(DEPTH);
604 for (k = 0; k < dep; k++) {
605 FILE *fpo[2];
606 int is_flip = 0, s[2], d;
607 id++;
608
609 d = dist + (int)(drand48()*step);
610
611 //if (drand48() < 0.5) {
612 if (!rev) {
613 fpo[0] = fpout1; fpo[1] = fpout2;
614 s[0] = size[0]; s[1] = size[1];
615 } else {
616 fpo[1] = fpout1; fpo[0] = fpout2;
617 s[1] = size[0]; s[0] = size[1];
618 is_flip = 1;
619 }
620
621 for (j = 0; j < s[0]; j++) {
622 c = nst_nt4_table[(int)seq.s[index.idx[i]+pos-j-1]];
623 if (c >= 4) c = 4;
624 else if (drand48() < ERR_RATE) {
625 c = (c + (int)(drand48()*3.0+1)) & 3;
626 }
627 tmp_seq[0][j] = c;
628 }
629
630 for (j = 0; j < s[1]; j++) {
631 c = nst_nt4_table[(int)seq.s[index.idx[i]+pos-d+j]];
632 if (c >= 4) c = 4;
633 else if (drand48() < ERR_RATE) {
634 c = (c + (int)(drand48()*3.0 + 1)) & 3;
635 }
636 tmp_seq[1][j] = c<4?3-c:4;
637 }
638
639 for (m = 0; m < 2; m++) {
640 fprintf(fpo[m], "@%s_%s_%lld_%lld_%d/%d\n", name, cut, index.idx[i], id, d, m==0?is_flip+1:2-is_flip);
641 for (j = 0; j < s[m]; j++) {
642 qstr[j] = Q;
643 fputc("ACGTN"[(int)tmp_seq[m][j]], fpo[m]);
644 }
645 qstr[j] = 0;
646 fprintf(fpo[m], "\n+\n%s\n", qstr);
647 }
648 }*/
649 dist += overlap;
650 }
651 dist = distance;
652 }
653 tot_len += len;
654 ++n_ref;
655 }
656 fprintf(stderr, "-- %d sequences, total length: %llu\n", n_ref, (unsigned long long)tot_len);
657
658 free(seq.s);
659 free(index.idx);
660 }
661
EF_usage()662 int EF_usage()
663 {
664 fprintf(stderr, "\n");
665 fprintf(stderr, "Program: ezmsim (simulate enzyme cut assembly sequences)\n");
666 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
667 fprintf(stderr, "Contact: Zechen Chong <chongzechen@gmail.com>\n\n");
668 fprintf(stderr, "Usage: ezmsim RAD [options] <-z enzyme> <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n");
669 fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE);
670 fprintf(stderr, " -D FLOAT read depth [%.1f]\n", DEPTH);
671 fprintf(stderr, " -1 INT length of the first read [100]\n");
672 fprintf(stderr, " -2 INT length of the second read [100]\n");
673 fprintf(stderr, " -z STRING enzyme sequence (must be specified)\n");
674 fprintf(stderr, " -p INT enzyme cut position [length(enzyme)/2]\n");
675 fprintf(stderr, " -d INT initial insert size distance [120]\n");
676 //fprintf(stderr, " -s INT standard deviation of insert size [20]\n");
677 fprintf(stderr, " -o INT insert size overlap distance [50]\n");
678 fprintf(stderr, " -t INT elongation steps of insert size [10]\n");
679 fprintf(stderr, " -h FLOAT rate of homozygosity[%.4f]\n", HOM_RATE);
680 fprintf(stderr, " -m FLOAT rate of mutation[%.4f]\n", MUT_RATE);
681 fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC);
682 fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND);
683 fprintf(stderr, " -r reverse or not [forward only]\n");
684 fprintf(stderr, " -H haploid mode\n");
685 fprintf(stderr, "\n");
686 return 1;
687 }
688
EF_main(int argc,char * argv[])689 int EF_main(int argc, char *argv[])
690 {
691 int c, size_l, size_r, pos, dist, overlap, step, rev, is_hap = 0;
692 FILE *fpout1, *fpout2, *fp_fa;
693 char *cut = "";
694
695 pos = -1;
696 dist = 120; //initial distance
697 //std_dev = 20;
698 overlap = 50;
699 size_l = size_r = 100;
700 step = 10; rev = 0;
701 while ((c = getopt(argc, argv, "e:D:1:2:d:s:z:p:o:t:R:rh:Hm:")) != -1) {
702 switch (c) {
703 case 'e': ERR_RATE = atof(optarg); break;
704 case 'D': DEPTH = atof(optarg); break;
705 case '1': size_l = atoi(optarg); break;
706 case '2': size_r = atoi(optarg); break;
707 case 'z': cut = strdup(optarg);
708 case 'p': pos = atoi(optarg);
709 uc((unsigned char*)cut); break;
710 case 'd': dist = atoi(optarg); break;
711 //case 's': std_dev = atoi(optarg); break;
712 case 'o': overlap = atoi(optarg); break;
713 case 't': step = atoi(optarg); break;
714 case 'r': rev = 1; break;
715 case 'h': HOM_RATE = atof(optarg); break;
716 case 'H': is_hap = 1; break;
717 case 'm': MUT_RATE = atof(optarg); break;
718 case 'R': INDEL_FRAC = atof(optarg); break;
719 case 'X': INDEL_EXTEND = atof(optarg); break;
720 default:
721 //printf("unknown option: %c\n", optopt);
722 //return 1;
723 return EF_usage();
724 }
725 }
726 if (strlen(cut)==0) {
727 //fprintf(stderr, "parameter z (enzyme cut) must be specified\n");
728 return EF_usage();
729 }
730 if (argc - optind < 3) {
731 fprintf(stderr, "files must be specified\n");
732 return EF_usage();
733 }
734 fp_fa = (strcmp(argv[optind+0], "-") == 0)?stdin:xopen(argv[optind+0], "r");
735 fpout1 = xopen(argv[optind+1], "w");
736 fpout2 = xopen(argv[optind+2], "w");
737
738 if (pos == -1) {
739 pos = strlen(cut)/2;
740 }
741 ezmsim_EF_core(fpout1, fpout2, fp_fa, size_l, size_r, (unsigned char*)cut, pos, dist, overlap, step, rev, is_hap);
742
743 fclose(fp_fa); fclose(fpout1); fclose(fpout2);
744 return 0;
745 }
746
usage()747 int usage()
748 {
749 fprintf(stderr, "\n");
750 fprintf(stderr, "Program: ezmsim (simulate enzyme cut assembly sequences)\n");
751 fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
752 fprintf(stderr, "Contact: Zechen Chong <chongzechen@gmail.com>\n\n");
753 fprintf(stderr, "Usage: ezmsim <LR|EF> [options]\n\n");
754 fprintf(stderr, "Options: LR simulate LR reads\n");
755 fprintf(stderr, " RAD simulate RAD reads\n");
756 fprintf(stderr, "\n");
757 return 1;
758 }
759
main(int argc,char * argv[])760 int main (int argc, char *argv[])
761 {
762 if (argc < 2) return usage();
763 if (strcmp(argv[1], "LR") == 0) return LR_main(argc-1, argv+1);
764 else if (strcmp(argv[1], "RAD") == 0) return EF_main(argc-1, argv+1);
765 else {
766 fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
767 return 1;
768 }
769 return 0;
770 }
771