1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <unistd.h>
5 #include <stdint.h>
6 #include <assert.h>
7 #include <limits.h>
8 #include <math.h>
9 #include "bfc.h"
10
11 #define BFC_VERSION "r181"
12
13 int bfc_verbose = 3;
14 double bfc_real_time;
15 bfc_kmer_t bfc_kmer_null = {{0,0,0,0}};
16
bfc_opt_init(bfc_opt_t * opt)17 void bfc_opt_init(bfc_opt_t *opt)
18 {
19 memset(opt, 0, sizeof(bfc_opt_t));
20 opt->chunk_size = 100000000;
21 opt->n_threads = 1;
22 opt->q = 20;
23 opt->k = 33;
24 opt->l_pre = 20;
25 opt->bf_shift = 33;
26 opt->n_hashes = 4;
27
28 opt->min_frac = .9;
29
30 opt->min_cov = 3;
31 opt->win_multi_ec = 10;
32 opt->max_end_ext = 5;
33
34 opt->w_ec = 1;
35 opt->w_ec_high = 7;
36 opt->w_absent = 3;
37 opt->w_absent_high = 1;
38 opt->max_path_diff = 15;
39 opt->max_heap = 100;
40 }
41
bfc_opt_by_size(bfc_opt_t * opt,long size)42 void bfc_opt_by_size(bfc_opt_t *opt, long size)
43 {
44 double bits;
45 bits = log(size) / log(2);
46 opt->k = (int)(bits + 1.);
47 if ((opt->k&1) == 0) ++opt->k; // should always be an odd number
48 if (opt->k > BFC_MAX_KMER)
49 opt->k = BFC_MAX_KMER;
50 opt->bf_shift = (int)(bits + 8.);
51 if (opt->bf_shift > BFC_MAX_BF_SHIFT)
52 opt->bf_shift = BFC_MAX_BF_SHIFT;
53 }
54
usage(FILE * fp,bfc_opt_t * o)55 static void usage(FILE *fp, bfc_opt_t *o)
56 {
57 fprintf(fp, "Usage: bfc [options] <to-count.fq> [to-correct.fq]\n");
58 fprintf(fp, "Options:\n");
59 fprintf(fp, " -s FLOAT approx genome size (k/m/g allowed; change -k and -b) [unset]\n");
60 fprintf(fp, " -k INT k-mer length [%d]\n", o->k);
61 fprintf(fp, " -t INT number of threads [%d]\n", o->n_threads);
62 fprintf(fp, " -b INT set Bloom filter size to pow(2,INT) bits [%d]\n", o->bf_shift);
63 fprintf(fp, " -H INT use INT hash functions for Bloom filter [%d]\n", o->n_hashes);
64 fprintf(fp, " -d FILE dump hash table to FILE [null]\n");
65 fprintf(fp, " -E skip error correction\n");
66 fprintf(fp, " -R refine bfc-corrected reads\n");
67 fprintf(fp, " -r FILE restore hash table from FILE [null]\n");
68 fprintf(fp, " -w INT no more than %d ec or 2 highQ ec in INT-bp window [%d]\n", BFC_EC_HIST, o->win_multi_ec);
69 fprintf(fp, " -c INT min k-mer coverage [%d]\n", o->min_cov);
70 // fprintf(fp, " -D discard uncorrectable reads\n");
71 fprintf(fp, " -Q force FASTA output\n");
72 fprintf(fp, " -1 drop reads containing unique k-mers\n");
73 fprintf(fp, " -v show version number\n");
74 fprintf(fp, " -h show command line help\n");
75 }
76
main(int argc,char * argv[])77 int main(int argc, char *argv[])
78 {
79 bfc_opt_t opt;
80 bfc_bf_t *bf = 0;
81 bfc_ch_t *ch = 0;
82 int i, c, no_ec = 0;
83 char *in_hash = 0, *out_hash = 0, *next_fn;
84
85 bfc_real_time = realtime();
86 bfc_opt_init(&opt);
87 while ((c = getopt(argc, argv, "hvV:Ed:k:s:b:L:t:C:H:q:Jr:c:w:D1QR")) >= 0) {
88 if (c == 'd') out_hash = optarg;
89 else if (c == 'r') in_hash = optarg;
90 else if (c == 'q') opt.q = atoi(optarg);
91 else if (c == 'b') opt.bf_shift = atoi(optarg);
92 else if (c == 't') opt.n_threads = atoi(optarg);
93 else if (c == 'H') opt.n_hashes = atoi(optarg);
94 else if (c == 'c') opt.min_cov = atoi(optarg);
95 else if (c == 'w') opt.win_multi_ec = atoi(optarg);
96 else if (c == 'R') opt.refine_ec = 1;
97 else if (c == 'D') opt.discard = 1;
98 else if (c == '1') opt.filter_mode = 1;
99 else if (c == 'Q') opt.no_qual = 1;
100 else if (c == 'J') opt.no_mt_io = 1; // for debugging kt_pipeline()
101 else if (c == 'E') no_ec = 1;
102 else if (c == 'V') bfc_verbose = atoi(optarg);
103 else if (c == 'k') {
104 opt.k = atoi(optarg);
105 fprintf(stderr, "[M::%s] set k to %d\n", __func__, opt.k);
106 } else if (c == 'h') {
107 usage(stdout, &opt);
108 return 0;
109 } else if (c == 'v') {
110 printf("%s\n", BFC_VERSION);
111 return 0;
112 } else if (c == 'L' || c == 's') {
113 double x;
114 char *p;
115 x = strtod(optarg, &p);
116 if (*p == 'G' || *p == 'g') x *= 1e9;
117 else if (*p == 'M' || *p == 'm') x *= 1e6;
118 else if (*p == 'K' || *p == 'k') x *= 1e3;
119 if (c == 's') {
120 bfc_opt_by_size(&opt, (long)x + 1);
121 fprintf(stderr, "[M::%s] applied `-k %d -b %d'\n", __func__, opt.k, opt.bf_shift);
122 } else if (c == 'L') opt.chunk_size = (long)x + 1;
123 }
124 }
125
126 if (optind == argc) {
127 usage(stderr, &opt);
128 return 1;
129 }
130
131 if (opt.filter_mode) bf = (bfc_bf_t*)bfc_count(argv[optind], &opt);
132 else if (!in_hash) ch = (bfc_ch_t*)bfc_count(argv[optind], &opt);
133 else {
134 ch = bfc_ch_restore(in_hash);
135 if (opt.k != bfc_ch_get_k(ch)) {
136 opt.k = bfc_ch_get_k(ch);
137 if (bfc_verbose >= 2)
138 fprintf(stderr, "[W::%s] hash table was constructed with a different k; set k to %d\n", __func__, opt.k);
139 }
140 }
141
142 next_fn = optind + 1 < argc? argv[optind+1] : argv[optind];
143 if (ch) {
144 if (out_hash) bfc_ch_dump(ch, out_hash);
145 if (!no_ec) bfc_correct(next_fn, &opt, ch);
146 bfc_ch_destroy(ch);
147 } else if (bf) {
148 bfc_correct(next_fn, &opt, bf);
149 bfc_bf_destroy(bf);
150 }
151
152 fprintf(stderr, "[M::%s] Version: %s\n", __func__, BFC_VERSION);
153 fprintf(stderr, "[M::%s] CMD:", __func__);
154 for (i = 0; i < argc; ++i)
155 fprintf(stderr, " %s", argv[i]);
156 fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - bfc_real_time, cputime());
157 return 0;
158 }
159