1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <unistd.h>
5 #include <stdint.h>
6 #include <assert.h>
7 #include <limits.h>
8 #include <math.h>
9 #include "bfc.h"
10 
11 #define BFC_VERSION "r181"
12 
13 int bfc_verbose = 3;
14 double bfc_real_time;
15 bfc_kmer_t bfc_kmer_null = {{0,0,0,0}};
16 
bfc_opt_init(bfc_opt_t * opt)17 void bfc_opt_init(bfc_opt_t *opt)
18 {
19 	memset(opt, 0, sizeof(bfc_opt_t));
20 	opt->chunk_size = 100000000;
21 	opt->n_threads = 1;
22 	opt->q = 20;
23 	opt->k = 33;
24 	opt->l_pre = 20;
25 	opt->bf_shift = 33;
26 	opt->n_hashes = 4;
27 
28 	opt->min_frac = .9;
29 
30 	opt->min_cov = 3;
31 	opt->win_multi_ec = 10;
32 	opt->max_end_ext = 5;
33 
34 	opt->w_ec = 1;
35 	opt->w_ec_high = 7;
36 	opt->w_absent = 3;
37 	opt->w_absent_high = 1;
38 	opt->max_path_diff = 15;
39 	opt->max_heap = 100;
40 }
41 
bfc_opt_by_size(bfc_opt_t * opt,long size)42 void bfc_opt_by_size(bfc_opt_t *opt, long size)
43 {
44 	double bits;
45 	bits = log(size) / log(2);
46 	opt->k = (int)(bits + 1.);
47 	if ((opt->k&1) == 0) ++opt->k; // should always be an odd number
48 	if (opt->k > BFC_MAX_KMER)
49 		opt->k = BFC_MAX_KMER;
50 	opt->bf_shift = (int)(bits + 8.);
51 	if (opt->bf_shift > BFC_MAX_BF_SHIFT)
52 		opt->bf_shift = BFC_MAX_BF_SHIFT;
53 }
54 
usage(FILE * fp,bfc_opt_t * o)55 static void usage(FILE *fp, bfc_opt_t *o)
56 {
57 	fprintf(fp, "Usage: bfc [options] <to-count.fq> [to-correct.fq]\n");
58 	fprintf(fp, "Options:\n");
59 	fprintf(fp, "  -s FLOAT     approx genome size (k/m/g allowed; change -k and -b) [unset]\n");
60 	fprintf(fp, "  -k INT       k-mer length [%d]\n", o->k);
61 	fprintf(fp, "  -t INT       number of threads [%d]\n", o->n_threads);
62 	fprintf(fp, "  -b INT       set Bloom filter size to pow(2,INT) bits [%d]\n", o->bf_shift);
63 	fprintf(fp, "  -H INT       use INT hash functions for Bloom filter [%d]\n", o->n_hashes);
64 	fprintf(fp, "  -d FILE      dump hash table to FILE [null]\n");
65 	fprintf(fp, "  -E           skip error correction\n");
66 	fprintf(fp, "  -R           refine bfc-corrected reads\n");
67 	fprintf(fp, "  -r FILE      restore hash table from FILE [null]\n");
68 	fprintf(fp, "  -w INT       no more than %d ec or 2 highQ ec in INT-bp window [%d]\n", BFC_EC_HIST, o->win_multi_ec);
69 	fprintf(fp, "  -c INT       min k-mer coverage [%d]\n", o->min_cov);
70 //	fprintf(fp, "  -D           discard uncorrectable reads\n");
71 	fprintf(fp, "  -Q           force FASTA output\n");
72 	fprintf(fp, "  -1           drop reads containing unique k-mers\n");
73 	fprintf(fp, "  -v           show version number\n");
74 	fprintf(fp, "  -h           show command line help\n");
75 }
76 
main(int argc,char * argv[])77 int main(int argc, char *argv[])
78 {
79 	bfc_opt_t opt;
80 	bfc_bf_t *bf = 0;
81 	bfc_ch_t *ch = 0;
82 	int i, c, no_ec = 0;
83 	char *in_hash = 0, *out_hash = 0, *next_fn;
84 
85 	bfc_real_time = realtime();
86 	bfc_opt_init(&opt);
87 	while ((c = getopt(argc, argv, "hvV:Ed:k:s:b:L:t:C:H:q:Jr:c:w:D1QR")) >= 0) {
88 		if (c == 'd') out_hash = optarg;
89 		else if (c == 'r') in_hash = optarg;
90 		else if (c == 'q') opt.q = atoi(optarg);
91 		else if (c == 'b') opt.bf_shift = atoi(optarg);
92 		else if (c == 't') opt.n_threads = atoi(optarg);
93 		else if (c == 'H') opt.n_hashes = atoi(optarg);
94 		else if (c == 'c') opt.min_cov = atoi(optarg);
95 		else if (c == 'w') opt.win_multi_ec = atoi(optarg);
96 		else if (c == 'R') opt.refine_ec = 1;
97 		else if (c == 'D') opt.discard = 1;
98 		else if (c == '1') opt.filter_mode = 1;
99 		else if (c == 'Q') opt.no_qual = 1;
100 		else if (c == 'J') opt.no_mt_io = 1; // for debugging kt_pipeline()
101 		else if (c == 'E') no_ec = 1;
102 		else if (c == 'V') bfc_verbose = atoi(optarg);
103 		else if (c == 'k') {
104 			opt.k = atoi(optarg);
105 			fprintf(stderr, "[M::%s] set k to %d\n", __func__, opt.k);
106 		} else if (c == 'h') {
107 			usage(stdout, &opt);
108 			return 0;
109 		} else if (c == 'v') {
110 			printf("%s\n", BFC_VERSION);
111 			return 0;
112 		} else if (c == 'L' || c == 's') {
113 			double x;
114 			char *p;
115 			x = strtod(optarg, &p);
116 			if (*p == 'G' || *p == 'g') x *= 1e9;
117 			else if (*p == 'M' || *p == 'm') x *= 1e6;
118 			else if (*p == 'K' || *p == 'k') x *= 1e3;
119 			if (c == 's') {
120 				bfc_opt_by_size(&opt, (long)x + 1);
121 				fprintf(stderr, "[M::%s] applied `-k %d -b %d'\n", __func__, opt.k, opt.bf_shift);
122 			} else if (c == 'L') opt.chunk_size = (long)x + 1;
123 		}
124 	}
125 
126 	if (optind == argc) {
127 		usage(stderr, &opt);
128 		return 1;
129 	}
130 
131 	if (opt.filter_mode) bf = (bfc_bf_t*)bfc_count(argv[optind], &opt);
132 	else if (!in_hash) ch = (bfc_ch_t*)bfc_count(argv[optind], &opt);
133 	else {
134 		ch = bfc_ch_restore(in_hash);
135 		if (opt.k != bfc_ch_get_k(ch)) {
136 			opt.k = bfc_ch_get_k(ch);
137 			if (bfc_verbose >= 2)
138 				fprintf(stderr, "[W::%s] hash table was constructed with a different k; set k to %d\n", __func__, opt.k);
139 		}
140 	}
141 
142 	next_fn = optind + 1 < argc? argv[optind+1] : argv[optind];
143 	if (ch) {
144 		if (out_hash) bfc_ch_dump(ch, out_hash);
145 		if (!no_ec) bfc_correct(next_fn, &opt, ch);
146 		bfc_ch_destroy(ch);
147 	} else if (bf) {
148 		bfc_correct(next_fn, &opt, bf);
149 		bfc_bf_destroy(bf);
150 	}
151 
152 	fprintf(stderr, "[M::%s] Version: %s\n", __func__, BFC_VERSION);
153 	fprintf(stderr, "[M::%s] CMD:", __func__);
154 	for (i = 0; i < argc; ++i)
155 		fprintf(stderr, " %s", argv[i]);
156 	fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - bfc_real_time, cputime());
157 	return 0;
158 }
159