1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // kmer_filter --
23 //
24
25 #include "kmer_filter.h"
26
27 //
28 // Global variables to hold command-line options.
29 //
FileT in_file_type = FileT::unknown;    // Format of the input files (fastq/fasta/gzipped variants/bustard); set from the command line.
FileT out_file_type = FileT::fastq;     // Format for the filtered/normalized output files.
vector<string> in_files;                // Single-end input files to process.
vector<string> in_pair_files;           // Paired-end input files; stored as consecutive (read-1, read-2) entries.
string in_path;                         // Directory containing the input files.
string out_path;                        // Directory where output files are written.
string k_freq_path;                     // Path used to read or write the kmer frequency table.
bool filter_mare_k = false;             // Drop reads containing runs of rare kmers. (Name presumably means "rare" -- kept for compatibility.)
bool filter_abundant_k = false;         // Drop reads containing too many abundant kmers.
bool normalize = false;                 // Normalize read depth to normalize_lim.
bool discards = false;                  // Also write discarded reads to a ".discards" file.
bool write_k_freq = false;              // Write the kmer frequency table and exit.
bool read_k_freq = false;               // Read a previously written kmer frequency table instead of counting.
bool kmer_distr = false;                // Print the kmer frequency distribution and exit.
bool ill_barcode = false;               // Illumina barcode handling flag (not used in the code visible here).
uint truncate_seq = 0;                  // If nonzero, truncate reads to this length (not used in the code visible here).
int transition_lim = 3;                 // Rare-kmer transition limit (not used in the code visible here).
int normalize_lim = 0;                  // Target read-depth for normalization.
int kmer_len = 15;                      // Kmer size; may be reset by read_kmer_freq() from the frequency file.
int max_k_freq = 20000;                 // A kmer occurring this many times or more is "abundant".
double max_k_pct = 0.80;                // Fraction of abundant kmers at which a read is dropped (when max_lim == 0).
double min_k_pct = 0.15;                // Coverage fraction (of read median) at or below which a kmer is "rare".
int min_lim = 0;                        // Run length of rare kmers that drops a read; defaults to 80% of kmer_len in main().
int max_lim = 0;                        // Absolute abundant-kmer count that drops a read; 0 means use max_k_pct instead.
int num_threads = 1;                    // Number of worker threads (not used in the code visible here).
55
main(int argc,char * argv[])56 int main (int argc, char* argv[]) {
57 IF_NDEBUG_TRY
58
59 parse_command_line(argc, argv);
60
61 if (min_lim == 0)
62 min_lim = (int) round((double) kmer_len * 0.80);
63
64 cerr << "Using a kmer size of " << kmer_len << "\n";
65 if (filter_mare_k) {
66 cerr << "Filtering out reads by identifying rare kmers: On.\n"
67 << " A kmer is considered rare when its coverage is at " << min_k_pct * 100 << "% or below the median kmer coverage for the read.\n"
68 << " A read is dropped when it contains " << min_lim << " or more rare kmers in a row.\n";
69 } else
70 cerr << "Filtering out reads by identifying rare kmers: Off.\n";
71
72 if (filter_abundant_k) {
73 cerr << "Filtering out reads by identifying abundant kmers: On.\n"
74 << " Kmer is considered abundant when it occurs " << max_k_freq << " or more times.\n";
75 if (max_lim == 0)
76 cerr << " A read is dropped when it contains " << max_k_pct * 100 << "% or more abundant kmers.\n";
77 else
78 cerr << " A read is dropped when it contains " << max_lim << " or more abundant kmers.\n";
79 } else
80 cerr << "Filtering out reads by identifying abundant kmers: Off.\n";
81
82 if (normalize)
83 cerr << "Normalizing read depth: On.\n"
84 << " Read depth limit: " << normalize_lim << "x\n";
85 else
86 cerr << "Normalizing read depth: Off.\n";
87
88 vector<pair<string, string> > files, pair_files;
89 map<string, map<string, long> > counters;
90 SeqKmerHash kmers;
91 vector<char *> kmers_keys;
92
93 build_file_list(in_files, files);
94 cerr << "Found " << files.size() << " input file(s).\n";
95
96 build_file_list(in_pair_files, pair_files);
97 cerr << "Found " << pair_files.size() << " paired input file(s).\n";
98
99 if (filter_mare_k || filter_abundant_k || kmer_distr || write_k_freq) {
100 cerr << "Generating kmer distribution...\n";
101
102 if (read_k_freq)
103 read_kmer_freq(k_freq_path, kmers, kmers_keys);
104 else
105 populate_kmers(pair_files, files, kmers, kmers_keys);
106
107 // double kmer_med, kmer_mad;
108 // calc_kmer_median(kmers, kmer_med, kmer_mad);
109 // cerr << "Median kmer frequency: " << kmer_med << "; median absolute deviation: " << kmer_mad << "\n";
110
111 if (kmer_distr) {
112 generate_kmer_dist(kmers);
113 if (write_k_freq == false)
114 exit(1);
115 }
116
117 if (write_k_freq) {
118 write_kmer_freq(k_freq_path, kmers);
119 exit(1);
120 }
121
122 cerr << "Filtering reads by kmer frequency...\n";
123
124 for (uint i = 0; i < pair_files.size(); i += 2) {
125 cerr << "Processing paired file " << i+1 << " of " << (pair_files.size() / 2) << " [" << pair_files[i].second << "]\n";
126
127 counters[pair_files[i].second]["total"] = 0;
128 counters[pair_files[i].second]["retained"] = 0;
129 counters[pair_files[i].second]["rare_k"] = 0;
130 counters[pair_files[i].second]["abundant_k"] = 0;
131
132 process_paired_reads(pair_files[i].first,
133 pair_files[i].second,
134 pair_files[i+1].first,
135 pair_files[i+1].second,
136 kmers,
137 counters[pair_files[i].second]);
138
139 cerr << " "
140 << counters[pair_files[i].second]["total"] << " total reads; "
141 << "-" << counters[pair_files[i].second]["rare_k"] << " rare k-mer reads; "
142 << "-" << counters[pair_files[i].second]["abundant_k"] << " abundant k-mer reads; "
143 << counters[pair_files[i].second]["retained"] << " retained reads.\n";
144 }
145
146 for (uint i = 0; i < files.size(); i++) {
147 cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].second << "]\n";
148
149 counters[files[i].second]["total"] = 0;
150 counters[files[i].second]["retained"] = 0;
151 counters[files[i].second]["rare_k"] = 0;
152 counters[files[i].second]["abundant_k"] = 0;
153
154 process_reads(files[i].first,
155 files[i].second,
156 kmers,
157 counters[files[i].second]);
158
159 cerr << " "
160 << counters[files[i].second]["total"] << " total reads; "
161 << "-" << counters[files[i].second]["rare_k"] << " rare k-mer reads; "
162 << "-" << counters[files[i].second]["abundant_k"] << " abundant k-mer reads; "
163 << counters[files[i].second]["retained"] << " retained reads.\n";
164 }
165
166 free_kmer_hash(kmers, kmers_keys);
167
168 print_results(counters);
169 }
170
171 if (normalize) {
172 cerr << "Normalizing read depth...\n";
173
174 //
175 // Add the remainder files from the previous step to the queue.
176 //
177 if (filter_mare_k || filter_abundant_k) {
178 string file;
179 int pos;
180 for (uint i = 0; i < pair_files.size(); i += 2) {
181 file = pair_files[i].second;
182 pos = file.find_last_of(".");
183 if (file.substr(pos - 2, 2) == ".1")
184 pos -= 2;
185 file = file.substr(0, pos) + ".rem.fil";
186 file += out_file_type == FileT::fastq ? ".fq" : ".fa";
187 cerr << "Adding remainder file generated in previous step to queue, '" << file << "\n";
188 files.push_back(make_pair(pair_files[i].first, file));
189 }
190 }
191
192 for (uint i = 0; i < pair_files.size(); i += 2) {
193 cerr << "Processing paired files " << i+1 << " of " << (pair_files.size() / 2)
194 << " [" << pair_files[i].second << " / " << pair_files[i+1].second << "]\n";
195
196 counters[pair_files[i].second]["total"] = 0;
197 counters[pair_files[i].second]["retained"] = 0;
198 counters[pair_files[i].second]["overep"] = 0;
199
200 normalize_paired_reads(pair_files[i].first,
201 pair_files[i].second,
202 pair_files[i+1].first,
203 pair_files[i+1].second,
204 kmers, kmers_keys,
205 counters[pair_files[i].second]);
206
207 cerr << " "
208 << counters[pair_files[i].second]["total"] << " total reads; "
209 << "-" << counters[pair_files[i].second]["overep"] << " over-represented reads; "
210 << counters[pair_files[i].second]["retained"] << " retained reads.\n";
211 }
212
213 for (uint i = 0; i < files.size(); i++) {
214 cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].second << "]\n";
215
216 counters[files[i].second]["total"] = 0;
217 counters[files[i].second]["retained"] = 0;
218 counters[files[i].second]["overep"] = 0;
219
220 normalize_reads(files[i].first,
221 files[i].second,
222 kmers, kmers_keys,
223 counters[files[i].second]);
224
225 cerr << " "
226 << counters[files[i].second]["total"] << " total reads; "
227 << "-" << counters[files[i].second]["overep"] << " over-represented reads; "
228 << counters[files[i].second]["retained"] << " retained reads.\n";
229 }
230
231 free_kmer_hash(kmers, kmers_keys);
232 }
233
234 return 0;
235 IF_NDEBUG_CATCH_ALL_EXCEPTIONS
236 }
237
//
// process_paired_reads -- filter one pair of mated input files against the
// kmer hash. Pairs where both reads pass are written to the two ".fil"
// outputs; pairs where only one read passes send the surviving read to the
// ".rem.fil" remainder file. Per-file tallies are accumulated in counter
// ("total", "retained", "rare_k", "abundant_k").
//
int process_paired_reads(string in_path_1,
             string in_file_1,
             string in_path_2,
             string in_file_2,
             SeqKmerHash &kmers,
             map<string, long> &counter) {
    Input *fh_1=NULL, *fh_2=NULL;
    ofstream *discard_fh_1=NULL, *discard_fh_2=NULL;
    int pos;
    string path;

    string path_1 = in_path_1 + in_file_1;
    string path_2 = in_path_2 + in_file_2;

    // Instantiate the matching reader for each mate file.
    if (in_file_type == FileT::fastq) {
        fh_1 = new Fastq(path_1);
        fh_2 = new Fastq(path_2);
    } else if (in_file_type == FileT::fasta) {
        fh_1 = new Fasta(path_1);
        fh_2 = new Fasta(path_2);
    } else if (in_file_type == FileT::gzfasta) {
        fh_1 = new GzFasta(path_1 + ".gz");
        fh_2 = new GzFasta(path_2 + ".gz");
    } else if (in_file_type == FileT::gzfastq) {
        fh_1 = new GzFastq(path_1 + ".gz");
        fh_2 = new GzFastq(path_2 + ".gz");
    } else if (in_file_type == FileT::bustard) {
        fh_1 = new Bustard(path_1);
        fh_2 = new Bustard(path_2);
    }

    //
    // Open the output files: the ".fil" suffix is inserted before the final
    // extension of each input file name.
    //
    pos  = in_file_1.find_last_of(".");
    path = out_path + in_file_1.substr(0, pos) + ".fil" + in_file_1.substr(pos);
    ofstream *ofh_1 = new ofstream(path.c_str(), ifstream::out);
    if (ofh_1->fail()) {
        cerr << "Error opening filtered output file '" << path << "'\n";
        exit(1);
    }

    pos  = in_file_2.find_last_of(".");
    path = out_path + in_file_2.substr(0, pos) + ".fil" + in_file_2.substr(pos);
    ofstream *ofh_2 = new ofstream(path.c_str(), ifstream::out);
    if (ofh_2->fail()) {
        cerr << "Error opening filtered paired output file '" << path << "'\n";
        exit(1);
    }

    pos = in_file_2.find_last_of(".");
    //
    // Pull the ".2" suffix off the paired file name to make the remainder file name.
    // NOTE(review): substr(pos - 2, 2) underflows if the name has no dot or
    // starts with one (pos < 2) -- assumed not to occur with conventional
    // "name.2.fq"-style inputs.
    //
    if (in_file_2.substr(pos - 2, 2) == ".2")
        pos -= 2;
    path  = out_path + in_file_2.substr(0, pos) + ".rem.fil";
    path += out_file_type == FileT::fastq ? ".fq" : ".fa";
    ofstream *rem_fh = new ofstream(path.c_str(), ifstream::out);
    if (rem_fh->fail()) {
        cerr << "Error opening filtered remainder output file '" << path << "'\n";
        exit(1);
    }

    //
    // Open a file for recording discarded reads
    //
    if (discards) {
        pos  = in_file_1.find_last_of(".");
        path = out_path + in_file_1.substr(0, pos) + ".discards" + in_file_1.substr(pos);
        discard_fh_1 = new ofstream(path.c_str(), ifstream::out);

        if (discard_fh_1->fail()) {
            cerr << "Error opening discard output file '" << path << "'\n";
            exit(1);
        }
        pos  = in_file_2.find_last_of(".");
        path = out_path + in_file_2.substr(0, pos) + ".discards" + in_file_2.substr(pos);
        discard_fh_2 = new ofstream(path.c_str(), ifstream::out);

        if (discard_fh_2->fail()) {
            cerr << "Error opening discard output file '" << path << "'\n";
            exit(1);
        }
    }

    //
    // Read in the first record, initializing the Seq object s. Then
    // initialize the Read object r, then loop, using the same objects.
    //
    Seq *s_1 = fh_1->next_seq();
    Seq *s_2 = fh_2->next_seq();
    if (s_1 == NULL || s_2 == NULL) {
        cerr << "Unable to allocate Seq object.\n";
        exit(1);
    }

    int   rare_k, abundant_k, num_kmers, max_kmer_lim;
    bool  retain_1, retain_2;
    char *kmer = new char[kmer_len + 1];   // scratch buffer reused by kmer_lookup for every read

    long i = 1;
    do {
        if (i % 10000 == 0) cerr << "  Processing short read pair " << i << "       \r";
        counter["total"] += 2;
        stringstream msg_1, msg_2;

        retain_1  = true;
        retain_2  = true;
        num_kmers = strlen(s_1->seq) - kmer_len + 1;
        // Absolute abundant-kmer cutoff: either the fixed max_lim, or
        // max_k_pct of the kmers in this particular read.
        max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;

        //
        // Drop the first sequence if it has too many rare or abundant kmers.
        // kmer_lookup() presumably sets rare_k/abundant_k for the read -- the
        // implementation lives elsewhere in the project.
        //
        kmer_lookup(kmers, s_1->seq, kmer, num_kmers, rare_k, abundant_k);

        if (filter_mare_k && rare_k > 0) {
            counter["rare_k"]++;
            retain_1 = false;
            msg_1 << "rare_k_" << rare_k;
        }

        if (retain_1 && filter_abundant_k && abundant_k > max_kmer_lim) {
            counter["abundant_k"]++;
            retain_1 = false;
            msg_1 << "abundant_k_" << abundant_k;
        }

        // Reset before the second lookup since the counters accumulate.
        rare_k       = 0;
        abundant_k   = 0;
        num_kmers    = strlen(s_2->seq) - kmer_len + 1;
        max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;

        //
        // Drop the second sequence if it has too many rare or abundant kmers.
        //
        kmer_lookup(kmers, s_2->seq, kmer, num_kmers, rare_k, abundant_k);

        if (filter_mare_k && rare_k > 0) {
            counter["rare_k"]++;
            retain_2 = false;
            msg_2 << "rare_k_" << rare_k;
        }

        if (retain_2 && filter_abundant_k && abundant_k > max_kmer_lim) {
            counter["abundant_k"]++;
            retain_2 = false;
            msg_2 << "abundant_k_" << abundant_k;
        }

        // Both mates pass: keep the pair together in the two ".fil" files.
        if (retain_1 && retain_2) {
            counter["retained"] += 2;
            out_file_type == FileT::fastq ?
                write_fastq(ofh_1, s_1) : write_fasta(ofh_1, s_1);
            out_file_type == FileT::fastq ?
                write_fastq(ofh_2, s_2) : write_fasta(ofh_2, s_2);
        }

        // Only one mate passes: it goes to the shared remainder file.
        if (retain_1 && !retain_2) {
            counter["retained"]++;
            out_file_type == FileT::fastq ?
                write_fastq(rem_fh, s_1) : write_fasta(rem_fh, s_1);
        }

        if (!retain_1 && retain_2) {
            counter["retained"]++;
            out_file_type == FileT::fastq ?
                write_fastq(rem_fh, s_2) : write_fasta(rem_fh, s_2);
        }

        // Dropped reads are optionally logged with the reason (msg_*).
        if (discards && !retain_1)
            out_file_type == FileT::fastq ?
                write_fastq(discard_fh_1, s_1, msg_1.str()) : write_fasta(discard_fh_1, s_1, msg_1.str());
        if (discards && !retain_2)
            out_file_type == FileT::fastq ?
                write_fastq(discard_fh_2, s_2, msg_2.str()) : write_fasta(discard_fh_2, s_2, msg_2.str());

        delete s_1;
        delete s_2;

        i++;
    // NOTE(review): short-circuit evaluation means that if file 1 is longer
    // than file 2, the extra s_1 record is read but never deleted (small leak)
    // and trailing unpaired records are silently ignored -- assumed the two
    // files always have equal record counts.
    } while ((s_1 = fh_1->next_seq()) != NULL &&
             (s_2 = fh_2->next_seq()) != NULL);

    delete [] kmer;

    if (discards) {
        delete discard_fh_1;
        delete discard_fh_2;
    }

    //
    // Close the file and delete the Input object.
    //
    delete fh_1;
    delete fh_2;
    delete ofh_1;
    delete ofh_2;
    delete rem_fh;

    return 0;
}
441
//
// process_reads -- filter a single (unpaired) input file against the kmer
// hash, writing surviving reads to a ".fil" output file and (optionally)
// dropped reads to a ".discards" file. Tallies go into counter ("total",
// "retained", "rare_k", "abundant_k").
//
int process_reads(string in_path,
          string in_file,
          SeqKmerHash &kmers,
          map<string, long> &counter) {
    Input *fh=NULL;
    ofstream *discard_fh=NULL;
    int pos;

    string path = in_path + in_file;

    // Instantiate the reader matching the input format.
    if (in_file_type == FileT::fastq)
        fh = new Fastq(path);
    else if (in_file_type == FileT::fasta)
        fh = new Fasta(path);
    else if (in_file_type == FileT::gzfastq)
        fh = new GzFastq(path + ".gz");
    else if (in_file_type == FileT::gzfasta)
        fh = new GzFasta(path + ".gz");
    else if (in_file_type == FileT::bustard)
        fh = new Bustard(path);

    //
    // Open the output file: ".fil" is inserted before the final extension.
    //
    pos  = in_file.find_last_of(".");
    path = out_path + in_file.substr(0, pos) + ".fil" + in_file.substr(pos);
    ofstream *out_fh = new ofstream(path.c_str(), ifstream::out);

    if (out_fh->fail()) {
        cerr << "Error opening output file '" << path << "'\n";
        exit(1);
    }

    //
    // Open a file for recording discarded reads
    //
    if (discards) {
        pos  = in_file.find_last_of(".");
        path = out_path + in_file.substr(0, pos) + ".discards" + in_file.substr(pos);
        discard_fh = new ofstream(path.c_str(), ifstream::out);

        if (discard_fh->fail()) {
            cerr << "Error opening discard output file '" << path << "'\n";
            exit(1);
        }
    }

    //
    // Read in the first record, initializing the Seq object s. Then
    // initialize the Read object r, then loop, using the same objects.
    //
    Seq *s = fh->next_seq();
    if (s == NULL) {
        cerr << "Unable to allocate Seq object.\n";
        exit(1);
    }

    int   rare_k, abundant_k, num_kmers, max_kmer_lim;
    bool  retain;
    char *kmer = new char[kmer_len + 1];   // scratch buffer reused by kmer_lookup
    long  i = 1;

    do {
        if (i % 10000 == 0) cerr << "  Processing short read " << i << "       \r";
        counter["total"]++;
        stringstream msg;

        //
        // Drop this sequence if it has too many rare or abundant kmers.
        // rare_k/abundant_k are presumably (re)set by kmer_lookup each call --
        // the implementation lives elsewhere in the project.
        //
        retain    = true;
        num_kmers = strlen(s->seq) - kmer_len + 1;
        // Absolute abundant-kmer cutoff: fixed max_lim, or max_k_pct of this read's kmers.
        max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;

        kmer_lookup(kmers, s->seq, kmer, num_kmers, rare_k, abundant_k);

        if (filter_mare_k && rare_k > 0) {
            counter["rare_k"]++;
            retain = false;
            msg << "rare_k_" << rare_k;
        }

        if (retain && filter_abundant_k && abundant_k > max_kmer_lim) {
            counter["abundant_k"]++;
            retain = false;
            msg << "abundant_k_" << abundant_k;
        }

        if (retain) {
            counter["retained"]++;
            out_file_type == FileT::fastq ?
                write_fastq(out_fh, s) : write_fasta(out_fh, s);
        }

        // Dropped reads are optionally logged with the reason (msg).
        if (discards && !retain)
            out_file_type == FileT::fastq ?
                write_fastq(discard_fh, s, msg.str()) : write_fasta(discard_fh, s, msg.str());

        delete s;

        i++;
    } while ((s = fh->next_seq()) != NULL);

    delete [] kmer;

    if (discards) delete discard_fh;

    //
    // Close the file and delete the Input object.
    //
    delete fh;
    delete out_fh;

    return 0;
}
557
558 int
normalize_paired_reads(string in_path_1,string in_file_1,string in_path_2,string in_file_2,SeqKmerHash & kmers,vector<char * > & kmer_keys,map<string,long> & counter)559 normalize_paired_reads(string in_path_1,
560 string in_file_1,
561 string in_path_2,
562 string in_file_2,
563 SeqKmerHash &kmers, vector<char *> &kmer_keys,
564 map<string, long> &counter)
565 {
566 Input *fh_1=NULL, *fh_2=NULL;
567 ofstream *discard_fh_1=NULL, *discard_fh_2=NULL;
568 ofstream *ofh_1=NULL, *ofh_2=NULL, *rem_fh=NULL;
569 string path_1, path_2;
570 int pos;
571
572 if (filter_abundant_k || filter_mare_k) {
573 //
574 // If we already filtered the data, open the files we created in the output
575 // directory to normalize.
576 //
577 pos = in_file_1.find_last_of(".");
578 path_1 = out_path + in_file_1.substr(0, pos) + ".fil" + in_file_1.substr(pos);
579
580 pos = in_file_2.find_last_of(".");
581 path_2 = out_path + in_file_2.substr(0, pos) + ".fil" + in_file_2.substr(pos);
582
583 if (in_file_type == FileT::fastq) {
584 fh_1 = new Fastq(path_1);
585 fh_2 = new Fastq(path_2);
586 } else if (in_file_type == FileT::gzfastq) {
587 fh_1 = new Fastq(path_1);
588 fh_2 = new Fastq(path_2);
589 } else if (in_file_type == FileT::fasta) {
590 fh_1 = new Fasta(path_1);
591 fh_2 = new Fasta(path_2);
592 } else if (in_file_type == FileT::gzfasta) {
593 fh_1 = new Fasta(path_1);
594 fh_2 = new Fasta(path_2);
595 }
596 } else {
597 //
598 // Otherwise, open unmodified files.
599 //
600 path_1 = in_path_1 + in_file_1;
601 path_2 = in_path_2 + in_file_2;
602
603 if (in_file_type == FileT::fastq) {
604 fh_1 = new Fastq(path_1);
605 fh_2 = new Fastq(path_2);
606 } else if (in_file_type == FileT::gzfastq) {
607 fh_1 = new GzFastq(path_1 + ".gz");
608 fh_2 = new GzFastq(path_2 + ".gz");
609 } else if (in_file_type == FileT::fasta) {
610 fh_1 = new Fasta(path_1);
611 fh_2 = new Fasta(path_2);
612 } else if (in_file_type == FileT::gzfasta) {
613 fh_1 = new GzFasta(path_1 + ".gz");
614 fh_2 = new GzFasta(path_2 + ".gz");
615 } else if (in_file_type == FileT::bustard) {
616 fh_1 = new Bustard(path_1);
617 fh_2 = new Bustard(path_2);
618 }
619 }
620
621 //
622 // Open the output files.
623 //
624 if (filter_abundant_k || filter_mare_k) {
625 pos = in_file_1.find_last_of(".");
626 path_1 = out_path + in_file_1.substr(0, pos) + ".fil.norm" + in_file_1.substr(pos);
627 ofh_1 = new ofstream(path_1.c_str(), ifstream::out);
628
629 if (ofh_1->fail()) {
630 cerr << "Error opening normalized output file '" << path_1 << "'\n";
631 exit(1);
632 }
633
634 pos = in_file_2.find_last_of(".");
635 path_2 = out_path + in_file_2.substr(0, pos) + ".fil.norm" + in_file_2.substr(pos);
636 ofh_2 = new ofstream(path_2.c_str(), ifstream::out);
637
638 if (ofh_2->fail()) {
639 cerr << "Error opening normalized paired output file '" << path_2 << "'\n";
640 exit(1);
641 }
642
643 if (in_file_2.substr(pos - 2, 2) == ".2")
644 pos -= 2;
645 path_2 = out_path + in_file_2.substr(0, pos) + ".fil.norm.rem";
646 path_2 += out_file_type == FileT::fastq ? ".fq" : ".fa";
647 rem_fh = new ofstream(path_2.c_str(), ifstream::out);
648
649 if (rem_fh->fail()) {
650 cerr << "Error opening normalized remainder output file '" << path_2 << "'\n";
651 exit(1);
652 }
653
654 } else {
655 pos = in_file_1.find_last_of(".");
656 path_1 = out_path + in_file_1.substr(0, pos) + ".norm" + in_file_1.substr(pos);
657 ofh_1 = new ofstream(path_1.c_str(), ifstream::out);
658
659 if (ofh_1->fail()) {
660 cerr << "Error opening normalized output file '" << path_1 << "'\n";
661 exit(1);
662 }
663
664 pos = in_file_2.find_last_of(".");
665 path_2 = out_path + in_file_2.substr(0, pos) + ".norm" + in_file_2.substr(pos);
666 ofh_2 = new ofstream(path_2.c_str(), ifstream::out);
667
668 if (ofh_2->fail()) {
669 cerr << "Error opening normalized paired output file '" << path_2 << "'\n";
670 exit(1);
671 }
672
673 if (in_file_2.substr(pos - 2, 2) == ".2")
674 pos -= 2;
675 path_2 = out_path + in_file_2.substr(0, pos) + ".norm.rem";
676 path_2 += out_file_type == FileT::fastq ? ".fq" : ".fa";
677 rem_fh = new ofstream(path_2.c_str(), ifstream::out);
678
679 if (rem_fh->fail()) {
680 cerr << "Error opening normalized remainder output file '" << path_2 << "'\n";
681 exit(1);
682 }
683 }
684
685 //
686 // Open a file for recording discarded reads
687 //
688 if (discards) {
689 pos = in_file_1.find_last_of(".");
690 if (filter_abundant_k || filter_mare_k)
691 path_1 = out_path + in_file_1.substr(0, pos) + ".fil.discards" + in_file_1.substr(pos);
692 else
693 path_1 = out_path + in_file_1.substr(0, pos) + ".discards" + in_file_1.substr(pos);
694 discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
695
696 if (discard_fh_1->fail()) {
697 cerr << "Error opening discard output file '" << path_1 << "'\n";
698 exit(1);
699 }
700
701 pos = in_file_2.find_last_of(".");
702 if (filter_abundant_k || filter_mare_k)
703 path_2 = out_path + in_file_2.substr(0, pos) + ".fil.discards" + in_file_2.substr(pos);
704 else
705 path_2 = out_path + in_file_2.substr(0, pos) + ".discards" + in_file_2.substr(pos);
706 discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
707
708 if (discard_fh_2->fail()) {
709 cerr << "Error opening discard output file '" << path_1 << "'\n";
710 exit(1);
711 }
712 }
713
714 //
715 // Read in the first record, initializing the Seq object s. Then
716 // initialize the Read object r, then loop, using the same objects.
717 //
718 Seq *s_1 = fh_1->next_seq();
719 Seq *s_2 = fh_2->next_seq();
720 if (s_1 == NULL || s_2 == NULL) {
721 cerr << "Unable to allocate Seq object.\n";
722 exit(1);
723 }
724
725 int num_kmers;
726 bool retain_1, retain_2;
727 char *kmer = new char[kmer_len + 1];
728
729 long i = 1;
730 do {
731 if (i % 10000 == 0) cerr << " Processing short read pair " << i << " \r";
732 counter["total"] += 2;
733
734 retain_1 = true;
735 retain_2 = true;
736 num_kmers = strlen(s_1->seq) - kmer_len + 1;
737
738 //
739 // Drop the first sequence if it has too many rare or abundant kmers.
740 //
741 retain_1 = normalize_kmer_lookup(kmers, s_1->seq, kmer, num_kmers, kmer_keys);
742
743 num_kmers = strlen(s_2->seq) - kmer_len + 1;
744
745 //
746 // Drop the second sequence if it has too many rare or abundant kmers.
747 //
748 retain_2 = normalize_kmer_lookup(kmers, s_2->seq, kmer, num_kmers, kmer_keys);
749
750 if (retain_1 && retain_2) {
751 counter["retained"] += 2;
752 out_file_type == FileT::fastq ?
753 write_fastq(ofh_1, s_1) : write_fasta(ofh_1, s_1);
754 out_file_type == FileT::fastq ?
755 write_fastq(ofh_2, s_2) : write_fasta(ofh_2, s_2);
756 } else {
757 counter["overep"] +=2;
758 }
759
760 if (retain_1 && !retain_2) {
761 counter["retained"]++;
762 counter["overep"]++;
763 out_file_type == FileT::fastq ?
764 write_fastq(rem_fh, s_1) : write_fasta(rem_fh, s_1);
765 }
766
767 if (!retain_1 && retain_2) {
768 counter["retained"]++;
769 counter["overep"]++;
770 out_file_type == FileT::fastq ?
771 write_fastq(rem_fh, s_2) : write_fasta(rem_fh, s_2);
772 }
773
774 if (discards && !retain_1)
775 out_file_type == FileT::fastq ?
776 write_fastq(discard_fh_1, s_1) : write_fasta(discard_fh_1, s_1);
777 if (discards && !retain_2)
778 out_file_type == FileT::fastq ?
779 write_fastq(discard_fh_2, s_2) : write_fasta(discard_fh_2, s_2);
780
781 delete s_1;
782 delete s_2;
783
784 i++;
785 } while ((s_1 = fh_1->next_seq()) != NULL &&
786 (s_2 = fh_2->next_seq()) != NULL);
787
788 delete [] kmer;
789
790 if (discards) {
791 delete discard_fh_1;
792 delete discard_fh_2;
793 }
794
795 //
796 // Close the file and delete the Input object.
797 //
798 delete fh_1;
799 delete fh_2;
800 delete ofh_1;
801 delete ofh_2;
802 delete rem_fh;
803
804 return 0;
805 }
806
//
// normalize_reads -- cap read depth for one single-end input file (or a
// remainder file produced by the filtering pass). Reads that would push any
// kmer past the coverage limit are dropped; survivors go to a ".norm" output.
// Tallies accumulate in counter ("total", "retained", "overep").
//
int
normalize_reads(string in_path,
        string in_file,
        SeqKmerHash &kmers, vector<char *> &kmer_keys,
        map<string, long> &counter)
{
    Input *fh=NULL;
    ofstream *discard_fh=NULL;
    string path;

    int pos = in_file.find_last_of(".");

    if (filter_abundant_k || filter_mare_k) {
        //
        // A filtering pass preceded us: read the ".fil" intermediate from
        // out_path. Remainder files queued by main() already end in ".fil"
        // (e.g. "x.rem.fil.fq"), so don't insert the suffix twice.
        //
        if (in_file.substr(pos - 4, 4) == ".fil")
            path = out_path + in_file;
        else
            path = out_path + in_file.substr(0, pos) + ".fil" + in_file.substr(pos);

        // Filter intermediates are written uncompressed, so plain readers are
        // used even for gz* input types.
        if (in_file_type == FileT::fastq)
            fh = new Fastq(path);
        else if (in_file_type == FileT::gzfastq)
            fh = new Fastq(path);
        else if (in_file_type == FileT::fasta)
            fh = new Fasta(path);
        else if (in_file_type == FileT::gzfasta)
            fh = new Fasta(path);
        else if (in_file_type == FileT::bustard)
            fh = new Bustard(path);

    } else {
        path = in_path + in_file;

        if (in_file_type == FileT::fastq)
            fh = new Fastq(path);
        else if (in_file_type == FileT::gzfastq)
            fh = new GzFastq(path + ".gz");
        else if (in_file_type == FileT::fasta)
            fh = new Fasta(path);
        else if (in_file_type == FileT::gzfasta)
            fh = new GzFasta(path + ".gz");
        else if (in_file_type == FileT::bustard)
            fh = new Bustard(path);
    }

    //
    // Open the output file.
    //
    // if (filter_abundant_k || filter_mare_k) {
    //     path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
    // } else {
    //     path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
    // }
    path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
    ofstream *out_fh = new ofstream(path.c_str(), ifstream::out);

    if (out_fh->fail()) {
        cerr << "Error opening normalized output file '" << path << "'\n";
        exit(1);
    }

    //
    // Open a file for recording discarded reads
    //
    if (discards) {
        if (filter_abundant_k || filter_mare_k)
            path = out_path + in_file.substr(0, pos) + ".fil.discards" + in_file.substr(pos);
        else
            path = out_path + in_file.substr(0, pos) + ".discards" + in_file.substr(pos);
        discard_fh = new ofstream(path.c_str(), ifstream::out);

        if (discard_fh->fail()) {
            cerr << "Error opening discard output file '" << path << "'\n";
            exit(1);
        }
    }

    //
    // Read in the first record, initializing the Seq object s. Then
    // initialize the Read object r, then loop, using the same objects.
    //
    Seq *s = fh->next_seq();
    if (s == NULL) {
        cerr << "Unable to allocate Seq object.\n";
        exit(1);
    }

    int   num_kmers;
    bool  retain;
    char *kmer = new char[kmer_len + 1];   // scratch buffer reused by normalize_kmer_lookup
    long  i = 1;

    do {
        if (i % 10000 == 0) cerr << "  Processing short read " << i << "       \r";
        counter["total"]++;

        //
        // Keep the read only if its kmers are still below the coverage limit;
        // normalize_kmer_lookup() presumably updates the hash as it goes.
        //
        retain    = true;
        num_kmers = strlen(s->seq) - kmer_len + 1;

        retain = normalize_kmer_lookup(kmers, s->seq, kmer, num_kmers, kmer_keys);

        if (retain) {
            counter["retained"]++;
            out_file_type == FileT::fastq ?
                write_fastq(out_fh, s) : write_fasta(out_fh, s);
        } else {
            counter["overep"]++;
        }

        if (discards && !retain)
            out_file_type == FileT::fastq ?
                write_fastq(discard_fh, s) : write_fasta(discard_fh, s);

        delete s;

        i++;
    } while ((s = fh->next_seq()) != NULL);

    delete [] kmer;

    if (discards) delete discard_fh;

    //
    // Close the file and delete the Input object.
    //
    delete fh;
    delete out_fh;

    return 0;
}
939
940 int
populate_kmers(vector<pair<string,string>> & pair_files,vector<pair<string,string>> & files,SeqKmerHash & kmers,vector<char * > & kmers_keys)941 populate_kmers(vector<pair<string, string> > &pair_files,
942 vector<pair<string, string> > &files,
943 SeqKmerHash &kmers,
944 vector<char *> &kmers_keys)
945 {
946 //
947 // Break each read down into k-mers and create a hash map of those k-mers
948 // recording in which sequences they occur.
949 //
950 uint j = 1;
951 uint cnt = files.size() + pair_files.size();
952 for (uint i = 0; i < files.size(); i++) {
953 cerr << "Generating kmers from file " << j << " of " << cnt << " [" << files[i].second << "]\n";
954 process_file_kmers(files[i].first + files[i].second, kmers, kmers_keys);
955 j++;
956 }
957
958 for (uint i = 0; i < pair_files.size(); i++) {
959 cerr << "Generating kmers from file " << j << " of " << cnt << " [" << pair_files[i].second << "]\n";
960 process_file_kmers(pair_files[i].first + pair_files[i].second, kmers, kmers_keys);
961 j++;
962 }
963
964 cerr << kmers.size() << " unique k-mers recorded.\n";
965
966 return 0;
967 }
968
//
// read_kmer_freq -- load a previously written kmer frequency table
// ("<kmer> <tab> <count>" per line) into the hash, instead of re-counting
// kmers from the raw reads. Also resets the global kmer_len to match the
// kmers found in the file.
//
int
read_kmer_freq(string in_path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_keys)
{
    cerr << "Reading kmer frequencies from '" << in_path.c_str() << "'...\n";

    ifstream fh(in_path.c_str(), ifstream::in);
    if (fh.fail()) {
        cerr << "Error opening rare kmer frequency input file '" << in_path << "'\n";
        exit(1);
    }

    char *hash_key;
    bool  exists;
    int   len, cnt;
    // NOTE(review): id_len and max_len are project-wide constants declared
    // elsewhere; the strcpy calls below assume every field fits in these
    // fixed buffers -- a longer field would overflow. Confirm the limits.
    char  kmer[id_len];
    char  line[max_len];
    vector<string> parts;

    long i = 1;

    while (fh.good()) {
        if (i % 10000 == 0) cerr << "  Processing kmer " << i << "       \r";

        fh.getline(line, max_len);

        len = strlen(line);
        if (len == 0) continue;

        //
        // Check that there is no carriage return in the buffer.
        //
        if (line[len - 1] == '\r') line[len - 1] = '\0';

        //
        // Ignore comments
        //
        if (line[0] == '#') continue;

        //
        // Parse the kmer and the number of times it occurred
        // <kmer> <tab> <integer>
        //
        parse_tsv(line, parts);

        if (parts.size() != 2) {
            cerr << "kmer frequencies are not formated correctly: expecting two, tab separated columns, found " << parts.size() << ".\n";
            exit(1);
        }

        // is_integer() presumably returns the parsed value, or a negative
        // number when the field is not numeric -- defined elsewhere.
        strcpy(kmer, parts[1].c_str());
        cnt = is_integer(kmer);
        if (cnt < 0) {
            cerr << "Non integer found in second column.\n";
            exit(1);
        }

        strcpy(kmer, parts[0].c_str());
        exists = kmer_map.count(kmer) == 0 ? false : true;

        if (exists) {
            // Duplicate kmer line: accumulate into the existing entry
            // (lookup is by string content, so the stack buffer works as key).
            cerr << "Warning: kmer '" << kmer << "' already exists in the kmer hash map.\n";
            hash_key = kmer;
            kmer_map[hash_key] += cnt;
        } else {
            // New kmer: the hash stores char* keys, so allocate a permanent
            // copy and remember it in kmer_map_keys for later freeing.
            hash_key = new char [strlen(kmer) + 1];
            strcpy(hash_key, kmer);
            kmer_map_keys.push_back(hash_key);
            kmer_map[hash_key] = cnt;
        }
        i++;
    }

    fh.close();

    cerr << kmer_map.size() << " unique k-mers read.\n";

    // Infer the kmer length from the first stored key so downstream code
    // matches the file, regardless of the command-line -k value.
    kmer_len = strlen(kmer_map.begin()->first);
    cerr << "Setting kmer length to " << kmer_len << "bp.\n";

    return 0;
}
1050
1051 int
write_kmer_freq(string path,SeqKmerHash & kmer_map)1052 write_kmer_freq(string path, SeqKmerHash &kmer_map)
1053 {
1054 cerr << "Writing kmer frequencies to '" << path.c_str() << "'...";
1055
1056 ofstream out_fh(path.c_str(), ifstream::out);
1057 if (out_fh.fail()) {
1058 cerr << "Error opening rare kmer output file '" << path << "'\n";
1059 exit(1);
1060 }
1061
1062 SeqKmerHash::iterator i;
1063
1064 out_fh << "# Kmer\tCount\n";
1065
1066 for (i = kmer_map.begin(); i != kmer_map.end(); i++) {
1067 out_fh << i->first << "\t" << i->second << "\n";
1068 }
1069
1070 out_fh.close();
1071
1072 cerr << "done.\n";
1073
1074 return 0;
1075 }
1076
int
process_file_kmers(string path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_keys)
{
    // Count every k-mer of length kmer_len (global) from all reads in the
    // file at 'path' into kmer_map. Keys for newly seen k-mers are
    // heap-allocated and recorded in kmer_map_keys so free_kmer_hash()
    // can release them later. Returns 0; exits on a read failure.
    vector<char *> kmers; // NOTE(review): never used in this function.
    char *hash_key;
    bool exists;
    int j;
    Input *fh = NULL;

    // Instantiate the reader matching the configured input type; gzip'ed
    // inputs have had their '.gz' suffix stripped upstream, so it is
    // re-appended here.
    if (in_file_type == FileT::fastq)
        fh = new Fastq(path);
    else if (in_file_type == FileT::gzfastq)
        fh = new GzFastq(path + ".gz");
    else if (in_file_type == FileT::fasta)
        fh = new Fasta(path);
    else if (in_file_type == FileT::gzfasta)
        fh = new GzFasta(path + ".gz");
    else if (in_file_type == FileT::bustard)
        fh = new Bustard(path.c_str());

    //
    // Read in the first record, initializing the Seq object s.
    //
    Seq *s = fh->next_seq();
    if (s == NULL) {
        // NOTE(review): next_seq() returning NULL here more likely means
        // an empty or unreadable file than an allocation failure -- the
        // message below may be misleading; confirm against Input::next_seq.
        cerr << "Unable to allocate Seq object.\n";
        exit(1);
    }

    int num_kmers;
    char *kmer = new char [kmer_len + 1];

    long i = 1;
    do {
        if (i % 10000 == 0) cerr << " Processing short read " << i << " \r";

        // Number of k-mers in this read; negative when the read is
        // shorter than kmer_len, in which case the loop below is skipped.
        num_kmers = strlen(s->seq) - kmer_len + 1;

        //
        // Generate and hash the kmers for this raw read
        //
        kmer[kmer_len] = '\0';

        for (j = 0; j < num_kmers; j++) {
            strncpy(kmer, s->seq + j, kmer_len);

            exists = kmer_map.count(kmer) == 0 ? false : true;

            if (exists) {
                // Key already owned by the map; the stack buffer is only
                // used for the lookup.
                hash_key = kmer;
            } else {
                // First occurrence: copy the key to the heap and track it
                // for later cleanup.
                hash_key = new char [kmer_len + 1];
                strcpy(hash_key, kmer);
                kmer_map_keys.push_back(hash_key);
            }

            kmer_map[hash_key]++;
        }

        delete s;

        i++;
    } while ((s = fh->next_seq()) != NULL);

    delete [] kmer;

    //
    // Close the file and delete the Input object.
    //
    delete fh;

    return 0;
}
1150
1151 int
generate_kmer_dist(SeqKmerHash & kmer_map)1152 generate_kmer_dist(SeqKmerHash &kmer_map)
1153 {
1154 SeqKmerHash::iterator i;
1155 map<uint, uint> bins;
1156
1157 cerr << "Generating kmer distribution...\n";
1158
1159 for (i = kmer_map.begin(); i != kmer_map.end(); i++)
1160 bins[i->second]++;
1161
1162 map<uint, uint>::iterator j;
1163 vector<pair<uint, uint> > sorted_kmers;
1164
1165 for (j = bins.begin(); j != bins.end(); j++)
1166 sorted_kmers.push_back(make_pair(j->first, j->second));
1167
1168 cout << "KmerFrequency\tCount\n";
1169
1170 for (unsigned long k = 0; k < sorted_kmers.size(); k++)
1171 cout << sorted_kmers[k].first << "\t" << sorted_kmers[k].second << "\n";
1172
1173 return 0;
1174 }
1175
1176 int
calc_kmer_median(SeqKmerHash & kmers,double & kmer_med,double & kmer_mad)1177 calc_kmer_median(SeqKmerHash &kmers, double &kmer_med, double &kmer_mad)
1178 {
1179 kmer_med = 0.0;
1180 kmer_mad = 0.0;
1181
1182 int num_kmers = kmers.size();
1183 vector<int> freqs, residuals;
1184 freqs.reserve(num_kmers);
1185
1186 SeqKmerHash::iterator i;
1187
1188 for (i = kmers.begin(); i != kmers.end(); i++)
1189 freqs.push_back(i->second);
1190
1191 sort(freqs.begin(), freqs.end());
1192
1193 kmer_med = num_kmers % 2 == 0 ?
1194 (double) (freqs[num_kmers / 2 - 1] + freqs[num_kmers / 2]) / 2.0 :
1195 (double) freqs[num_kmers / 2 - 1];
1196
1197 //
1198 // Calculate the median absolute deviation.
1199 //
1200 residuals.reserve(num_kmers);
1201
1202 for (int j = 0; j < num_kmers; j++)
1203 residuals.push_back(abs(freqs[j] - (int) kmer_med));
1204 sort(residuals.begin(), residuals.end());
1205
1206 kmer_mad = num_kmers % 2 == 0 ?
1207 (double) (residuals[num_kmers / 2 - 1] + residuals[num_kmers / 2]) / 2.0 :
1208 (double) residuals[num_kmers / 2 - 1];
1209
1210 return 0;
1211 }
1212
int
kmer_map_cmp(pair<char *, long> a, pair<char *, long> b)
{
    // Order hash entries by ascending count; the key string is ignored.
    bool a_precedes_b = (a.second < b.second);
    return a_precedes_b;
}
1218
1219 inline bool
normalize_kmer_lookup(SeqKmerHash & kmer_map,char * read,char * kmer,int num_kmers,vector<char * > & kmer_keys)1220 normalize_kmer_lookup(SeqKmerHash &kmer_map,
1221 char *read, char *kmer,
1222 int num_kmers,
1223 vector<char *> &kmer_keys)
1224 {
1225 kmer[kmer_len] = '\0';
1226 int cnt = 0;
1227 bool retain = true;
1228 //
1229 // Generate kmers from this read, increment kmer frequency in dataset.
1230 //
1231 vector<int> sorted_cnts;
1232 sorted_cnts.reserve(num_kmers);
1233
1234 // cout << "# " << read << "\n";
1235
1236 for (int j = 0; j < num_kmers; j++) {
1237 strncpy(kmer, read + j, kmer_len);
1238
1239 cnt = kmer_map.count(kmer) > 0 ? kmer_map[kmer] : 0;
1240 sorted_cnts.push_back(cnt);
1241
1242 // cout << kmer << "\t" << j << "\t" << cnt << "\n";
1243 }
1244
1245 //
1246 // Calculate the median kmer frequency along the read.
1247 //
1248 sort(sorted_cnts.begin(), sorted_cnts.end());
1249 double median = num_kmers % 2 == 0 ?
1250 (double) (sorted_cnts[num_kmers / 2 - 1] + sorted_cnts[num_kmers / 2]) / 2.0 :
1251 (double) sorted_cnts[num_kmers / 2 - 1];
1252 // cout << "# median: " << median << "\n";
1253
1254 if (median > normalize_lim)
1255 retain = false;
1256
1257 //
1258 // Generate and hash the kmers for this raw read
1259 //
1260 bool exists;
1261 char *hash_key;
1262 kmer[kmer_len] = '\0';
1263
1264 for (int j = 0; j < num_kmers; j++) {
1265 strncpy(kmer, read + j, kmer_len);
1266
1267 exists = kmer_map.count(kmer) == 0 ? false : true;
1268
1269 if (exists) {
1270 hash_key = kmer;
1271 } else {
1272 hash_key = new char [kmer_len + 1];
1273 strcpy(hash_key, kmer);
1274 kmer_keys.push_back(hash_key);
1275 }
1276
1277 kmer_map[hash_key]++;
1278 }
1279
1280 return retain;
1281 }
1282
1283 inline int
kmer_lookup(SeqKmerHash & kmer_map,char * read,char * kmer,int num_kmers,int & rare_k,int & abundant_k)1284 kmer_lookup(SeqKmerHash &kmer_map,
1285 char *read, char *kmer,
1286 int num_kmers,
1287 int &rare_k, int &abundant_k)
1288 {
1289 //
1290 // Generate kmers from this read, lookup kmer frequency in dataset.
1291 //
1292 rare_k = 0;
1293 abundant_k = 0;
1294 kmer[kmer_len] = '\0';
1295 int cnt = 0;
1296
1297 vector<int> cnts, sorted_cnts;
1298 cnts.reserve(num_kmers);
1299 sorted_cnts.reserve(num_kmers);
1300
1301 // cout << "# " << read << "\n";
1302
1303 for (int j = 0; j < num_kmers; j++) {
1304 strncpy(kmer, read + j, kmer_len);
1305
1306 cnt = kmer_map[kmer];
1307 cnts.push_back(cnt);
1308 sorted_cnts.push_back(cnt);
1309
1310 // cout << kmer << "\t" << j << "\t" << cnt << "\n";
1311
1312 if (cnt >= max_k_freq) abundant_k++;
1313 }
1314
1315 //
1316 // Calculate the median kmer frequency along the read.
1317 //
1318 sort(sorted_cnts.begin(), sorted_cnts.end());
1319 double median = num_kmers % 2 == 0 ?
1320 (double) (sorted_cnts[num_kmers / 2 - 1] + sorted_cnts[num_kmers / 2]) / 2.0 :
1321 (double) sorted_cnts[num_kmers / 2 - 1];
1322 // cout << "# median: " << median << "\n";
1323
1324 double bound = round(median * min_k_pct);
1325 // cout << "# kmer cov bound: " << bound << "\n";
1326
1327 //
1328 // Look for runs of rare kmers.
1329 //
1330 // We will slide a window across the read, f represents the front of the window, b
1331 // represents the back. Each time a kmer is below the bound we will increment run_cnt,
1332 // which represents the number of kmers in the window below the bound. If 2/3 of the
1333 // kmers in the window go below the bound, assume a sequencing error has occurred.
1334 //
1335 int run_cnt = 0;
1336 int b = 0;
1337 for (int f = 0; f < num_kmers; f++) {
1338 if (f >= kmer_len) {
1339 b++;
1340 if (cnts[b] <= bound)
1341 run_cnt--;
1342 }
1343 if (cnts[f] <= bound) {
1344 run_cnt++;
1345 if (run_cnt >= min_lim) {
1346 rare_k++;
1347 // cout << "# Rejecting read, position: " << f << "; run_cnt: " << run_cnt << "\n";
1348 return 0;
1349 }
1350 }
1351 // cout << "# b: " << b << "; f: " << f << "; run_cnt: " << run_cnt << "; counts[front]: " << cnts[f] << "; bound: " << bound << "\n";
1352 }
1353 // cout << "\n\n";
1354
1355 return 0;
1356 }
1357
1358 // inline int
1359 // kmer_lookup(SeqKmerHash &kmer_map,
1360 // char *read, char *kmer,
1361 // int num_kmers,
1362 // int &rare_k, int &abundant_k, bool &complex)
1363 // {
1364 // //
1365 // // Generate kmers from this read, lookup kmer frequency in dataset.
1366 // //
1367 // rare_k = 0;
1368 // abundant_k = 0;
1369 // complex = false;
1370 // kmer[kmer_len] = '\0';
1371 // int cnt = 0;
1372 // int rare_k_lim = (int) round((double) kmer_len * (1.0/2.0));
1373
1374 // vector<int> cnts;
1375 // cnts.reserve(num_kmers);
1376
1377 // // cout << "# " << read << "\n";
1378
1379 // for (int j = 0; j < num_kmers; j++) {
1380 // strncpy(kmer, read + j, kmer_len);
1381
1382 // cnt = kmer_map[kmer];
1383
1384 // if (cnt >= 100000)
1385 // cnts.push_back(100000);
1386 // else if (cnt >= 10000)
1387 // cnts.push_back(10000);
1388 // else if (cnt >= 1000)
1389 // cnts.push_back(1000);
1390 // else if (cnt >= 100)
1391 // cnts.push_back(100);
1392 // else if (cnt >= 10)
1393 // cnts.push_back(10);
1394 // else
1395 // cnts.push_back(1);
1396
1397 // // cout << kmer << "\t" << j << "\t" << cnt << "\n";
1398
1399 // if (cnt >= max_k_freq) abundant_k++;
1400 // }
1401
1402 // // //
1403 // // // Detect the number of kmer coverage transitions.
1404 // // //
1405 // // int t = 0;
1406 // // int cov = cnts[0];
1407 // // cout << "\nDetermining transitions:\n" << kmer << "\t" << "0" << "\t" << cnts[0] << "\n";
1408 // // for (int j = 1; j < num_kmers; j++)
1409 // // if (cnts[j] != cov) {
1410 // // cov = cnts[j];
1411 // // t++;
1412 // // cout << kmer << "\t" << j << "\t" << cnts[j] << ": Transition." << "\n";
1413 // // } else {
1414 // // cout << kmer << "\t" << j << "\t" << cnts[j] << "\n";
1415 // // }
1416 // // cerr << t << " total cnts.\n";
1417
1418 // //
1419 // // Look for runs of kmers at various orders of magnitude.
1420 // //
1421 // // We will slide a window across the read, f represents the front of the window, b
1422 // // represents the back. Each time a kmer is below the bound we will increment run_cnt,
1423 // // which represents the number of kmers in the window below the bound. If 2/3 of the
1424 // // kmers in the window go below the bound, assume a sequencing error has occurred.
1425
1426 // // Run counters:
1427 // // 1 10 100 1k 10k 100k
1428 // // runs[0] runs[1] runs[2] runs[3] runs[4] runs[5]
1429 // int runs[6] = {0};
1430 // int prev_cnt, run_cnt, tot_trans;
1431 // int f = 0;
1432
1433 // while (f < num_kmers) {
1434
1435 // tot_trans = 0;
1436 // run_cnt = 1;
1437 // prev_cnt = cnts[f];
1438 // f++;
1439
1440 // while (f < num_kmers && cnts[f] == prev_cnt) {
1441 // // cout << "# window front: " << f << "; run_cnt: " << run_cnt << "; prev_cnt: " << prev_cnt << "\n";
1442 // f++;
1443 // run_cnt++;
1444 // }
1445
1446 // if (run_cnt >= rare_k_lim) {
1447 // // cout << "# found transition run, position: " << f-1 << "; run_cnt: " << run_cnt << "\n";
1448 // switch(prev_cnt) {
1449 // case 1:
1450 // runs[0]++;
1451 // break;
1452 // case 10:
1453 // runs[1]++;
1454 // break;
1455 // case 100:
1456 // runs[2]++;
1457 // break;
1458 // case 1000:
1459 // runs[3]++;
1460 // break;
1461 // case 10000:
1462 // runs[4]++;
1463 // break;
1464 // case 100000:
1465 // runs[5]++;
1466 // break;
1467 // }
1468 // }
1469
1470 // for (int j = 0; j < 6; j++)
1471 // if (runs[j] > 0) tot_trans++;
1472
1473 // // cout << "# Total transitions: " << tot_trans << "\n";
1474
1475 // if (tot_trans >= transition_lim) {
1476 // // cout << "# Rejecting read.\n";
1477 // rare_k++;
1478 // return 0;
1479 // }
1480 // }
1481
1482 // // cout << "\n\n";
1483
1484 // return 0;
1485 // }
1486
1487 int
free_kmer_hash(SeqKmerHash & kmer_map,vector<char * > & kmer_map_keys)1488 free_kmer_hash(SeqKmerHash &kmer_map, vector<char *> &kmer_map_keys)
1489 {
1490 for (uint i = 0; i < kmer_map_keys.size(); i++) {
1491 delete [] kmer_map_keys[i];
1492 }
1493 kmer_map_keys.clear();
1494
1495 kmer_map.clear();
1496
1497 return 0;
1498 }
1499
int print_results(map<string, map<string, long> > &counters) {
    //
    // Write a per-file summary table to <out_path>/kmer_filter.log and
    // echo the overall totals to stderr.
    //
    string log_path = out_path + "kmer_filter.log";
    ofstream log(log_path.c_str());

    if (log.fail()) {
        cerr << "Unable to open log file '" << log_path << "'\n";
        return 0;
    }

    cerr << "Outputing details to log: '" << log_path << "'\n\n";

    log << "File\t"
        << "Retained Reads\t"
        << "Rare K\t"
        << "Abundant K\t"
        << "Total\n";

    //
    // Emit one row per input file while accumulating the grand totals in
    // the same pass.
    //
    map<string, long> c;
    c["total"] = 0;

    for (map<string, map<string, long> >::iterator it = counters.begin(); it != counters.end(); it++) {
        map<string, long> &f = it->second;

        log << it->first << "\t"
            << f["retained"] << "\t"
            << f["rare_k"] << "\t"
            << f["abundant_k"] << "\t"
            << f["total"] << "\n";

        c["total"]      += f["total"];
        c["retained"]   += f["retained"];
        c["rare_k"]     += f["rare_k"];
        c["abundant_k"] += f["abundant_k"];
    }

    cerr <<
        c["total"] << " total sequences;\n"
        << " " << c["rare_k"] << " rare k-mer reads;\n"
        << " " << c["abundant_k"] << " abundant k-mer reads;\n"
        << c["retained"] << " retained reads.\n";

    log << "Total Sequences\t" << c["total"] << "\n"
        << "Retained Reads\t" << c["retained"] << "\n";

    log.close();

    return 0;
}
1553
build_file_list(vector<string> & in_files,vector<pair<string,string>> & files)1554 int build_file_list(vector<string> &in_files, vector<pair<string, string> > &files) {
1555 string file, suffix;
1556 int pos;
1557
1558 //
1559 // Scan a directory for a list of files.
1560 //
1561 if (in_path.length() > 0) {
1562 struct dirent *direntry;
1563
1564 DIR *dir = opendir(in_path.c_str());
1565
1566 if (dir == NULL) {
1567 cerr << "Unable to open directory '" << in_path << "' for reading.\n";
1568 exit(1);
1569 }
1570
1571 while ((direntry = readdir(dir)) != NULL) {
1572 file = direntry->d_name;
1573
1574 if (file.substr(0, 1) == ".")
1575 continue;
1576
1577 //
1578 // If the file is gzip'ed, remove the '.gz' suffix.
1579 //
1580 pos = file.find_last_of(".");
1581 if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
1582 file.substr(pos) == ".gz") {
1583 file = file.substr(0, pos);
1584 pos = file.find_last_of(".");
1585 }
1586
1587 //
1588 // Check that the remaining file name has the right suffix.
1589 //
1590 suffix = file.substr(pos + 1);
1591 if (in_file_type == FileT::fastq && (suffix.substr(0, 2) == "fq" || suffix.substr(0, 5) == "fastq"))
1592 files.push_back(make_pair(in_path, file));
1593 else if (in_file_type == FileT::fasta && (suffix.substr(0, 2) == "fa" || suffix.substr(0, 5) == "fasta"))
1594 files.push_back(make_pair(in_path, file));
1595 }
1596
1597 if (files.size() == 0)
1598 cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
1599
1600 } else {
1601 string path;
1602
1603 for (uint i = 0; i < in_files.size(); i++) {
1604 //
1605 // Files specified directly:
1606 // Break off file path and store path and file name.
1607 // Check if this is a gzip'ed file and if so, remove 'gz' suffix.
1608 //
1609 file = in_files[i];
1610 pos = file.find_last_of(".");
1611 if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
1612 file.substr(pos) == ".gz") {
1613 file = file.substr(0, pos);
1614 pos = file.find_last_of(".");
1615 }
1616 pos = file.find_last_of("/");
1617 path = file.substr(0, pos + 1);
1618 files.push_back(make_pair(path, file.substr(pos+1)));
1619 }
1620 }
1621
1622 return 0;
1623 }
1624
int parse_command_line(int argc, char* argv[]) {
    // Parse the command line into the file-scope option globals declared
    // at the top of this file, validate option combinations, and
    // normalize the input/output paths. Calls help() -- which exits -- on
    // any usage error; returns 0 on success.
    string pair_1, pair_2;
    int c;

    while (1) {
        static struct option long_options[] = {
            {"help", no_argument, NULL, 'h'},
            {"version", no_argument, NULL, 'v'},
            {"discards", no_argument, NULL, 'D'},
            {"pair-1", required_argument, NULL, '1'}, {"pair_1", required_argument, NULL, '1'},
            {"pair-2", required_argument, NULL, '2'}, {"pair_2", required_argument, NULL, '2'},
            {"infile-type", required_argument, NULL, 'i'}, {"infile_type", required_argument, NULL, 'i'},
            {"outfile-type", required_argument, NULL, 'y'}, {"outfile_type", required_argument, NULL, 'y'},
            {"file", required_argument, NULL, 'f'},
            {"path", required_argument, NULL, 'p'},
            {"outpath", required_argument, NULL, 'o'},
            {"k-dist", no_argument, NULL, 'I'}, {"k_dist", no_argument, NULL, 'I'},
            {"rare", no_argument, NULL, 'R'},
            {"abundant", no_argument, NULL, 'A'},
            {"normalize", required_argument, NULL, 'N'},
            {"k-len", required_argument, NULL, 'K'}, {"k_len", required_argument, NULL, 'K'},
            {"max-k-freq", required_argument, NULL, 'M'}, {"max_k_freq", required_argument, NULL, 'M'},
            {"min-lim", required_argument, NULL, 'F'}, {"min_lim", required_argument, NULL, 'F'},
            {"max-lim", required_argument, NULL, 'G'}, {"max_lim", required_argument, NULL, 'G'},
            {"min-k-pct", required_argument, NULL, 'P'}, {"min_k_pct", required_argument, NULL, 'P'},
            {"read-k-freq", required_argument, NULL, 'r'}, {"read_k_freq", required_argument, NULL, 'r'},
            {"write-k-freq", required_argument, NULL, 'w'}, {"write_k_freq", required_argument, NULL, 'w'},
            {0, 0, 0, 0}
        };

        // getopt_long stores the option index here.
        int option_index = 0;

        // NOTE(review): the optstring marks 'I' as taking an argument
        // ("I:") while the matching long option --k-dist is no_argument,
        // and the letters 'k', 'm:', 't:' appear here with no case below
        // (they fall to the default branch) -- confirm intended.
        c = getopt_long(argc, argv, "hvRADkP:N:I:w:r:K:F:G:M:m:i:y:f:o:t:p:1:2:", long_options, &option_index);

        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c) {
        case 'h':
            help();
            break;
        case 'i':
            // Input file type; anything unrecognized defaults to fastq.
            if (strcasecmp(optarg, "fasta") == 0)
                in_file_type = FileT::fasta;
            else if (strcasecmp(optarg, "gzfasta") == 0)
                in_file_type = FileT::gzfasta;
            else if (strcasecmp(optarg, "gzfastq") == 0)
                in_file_type = FileT::gzfastq;
            else
                in_file_type = FileT::fastq;
            break;
        case 'y':
            // Output file type; anything unrecognized defaults to fastq.
            if (strcasecmp(optarg, "fasta") == 0)
                out_file_type = FileT::fasta;
            else
                out_file_type = FileT::fastq;
            break;
        case 'f':
            in_files.push_back(optarg);
            break;
        case '1':
            pair_1 = optarg;
            break;
        case '2':
            // Paired files are stored as consecutive entries in
            // in_pair_files; -1 must precede its matching -2.
            pair_2 = optarg;
            if (pair_1.length() == 0) help();
            in_pair_files.push_back(pair_1);
            in_pair_files.push_back(pair_2);
            pair_1 = "";
            pair_2 = "";
            break;
        case 'p':
            in_path = optarg;
            break;
        case 'o':
            out_path = optarg;
            break;
        case 'D':
            discards = true;
            break;
        case 'I':
            kmer_distr = true;
            break;
        case 'R':
            filter_mare_k = true;
            break;
        case 'A':
            filter_abundant_k = true;
            break;
        case 'N':
            normalize = true;
            normalize_lim = is_integer(optarg);
            break;
        case 'K':
            kmer_len = is_integer(optarg);
            break;
        case 'M':
            max_k_freq = is_integer(optarg);
            break;
        case 'F':
            min_lim = is_integer(optarg);
            break;
        case 'G':
            max_lim = is_integer(optarg);
            break;
        case 'P':
            min_k_pct = is_double(optarg);
            break;
        case 'r':
            read_k_freq = true;
            k_freq_path = optarg;
            break;
        case 'w':
            write_k_freq = true;
            k_freq_path = optarg;
            break;
        case 'v':
            version();
            break;
        case '?':
            // getopt_long already printed an error message.
            help();
            break;

        default:
            cerr << "Unknown command line option '" << (char) c << "'\n";
            help();
            exit(1);
        }
    }

    // At least one input source (file, pair, or directory) is required.
    if (in_files.size() == 0 && in_pair_files.size() == 0 && in_path.length() == 0) {
        cerr << "You must specify an input file of a directory path to a set of input files.\n";
        help();
    }

    if (in_files.size() > 0 && in_path.length() > 0) {
        cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
        help();
    }

    // Normalize paths to end with a trailing slash; output defaults to
    // the current directory.
    if (in_path.length() > 0 && in_path.at(in_path.length() - 1) != '/')
        in_path += "/";

    if (out_path.length() == 0)
        out_path = ".";

    if (out_path.at(out_path.length() - 1) != '/')
        out_path += "/";

    if (in_file_type == FileT::unknown)
        in_file_type = FileT::fastq;

    if (read_k_freq && write_k_freq) {
        cerr << "You may either read a set of kmer frequencies, or write kmer frequencies, not both.\n";
        help();
    }

    if (min_k_pct < 0.0 || min_k_pct > 1.0) {
        cerr << "Percentage to consider a kmer rare must be between 0 and 1.0.\n";
        help();
    }

    //
    // Check that the output path exists.
    //
    struct stat info;
    if (stat(out_path.c_str(), &info) != 0) {
        cerr << "Unable to locate the specified output path, '" << out_path << "'\n";
        exit(1);
    }

    return 0;
}
1801
void version() {
    // Print the program name and version banner, then terminate.
    cerr << "kmer_filter " << VERSION << "\n\n";
    exit(1);
}
1807
void help() {
    // Print the usage summary and exit; this is also the error path for
    // invalid option combinations. (Fixes user-facing typos:
    // "seqeunces" -> "sequences", "messsage" -> "message",
    // "occuring" -> "occurring".)
    cerr << "kmer_filter " << VERSION << "\n"
         << "kmer_filter [-f in_file_1 [-f in_file_2...] | -p in_dir] [-1 pair_1 -2 pair_2 [-1 pair_1...]] -o out_dir [-i type] [-y type] [-D] [-h]\n"
         << " f: path to the input file if processing single-end sequences.\n"
         << " i: input file type, either 'bustard' for the Illumina BUSTARD output files, 'fasta', 'fastq', 'gzfasta', or 'gzfastq' (default 'fastq').\n"
         << " p: path to a directory of files (for single-end files only).\n"
         << " 1: specify the first in a pair of files to be processed together.\n"
         << " 2: specify the second in a pair of files to be processed together.\n"
         << " o: path to output the processed files.\n"
         << " y: output type, either 'fastq' or 'fasta' (default fastq).\n"
         << " D: capture discarded reads to a file.\n"
         << " h: display this help message.\n\n"
         << " Filtering options:\n"
         << " --rare: turn on filtering based on rare k-mers.\n"
         << " --abundant: turn on filtering based on abundant k-mers.\n"
         << " --k-len <len>: specify k-mer size (default 15).\n\n"
         << " Advanced filtering options:\n"
         << " --max-k-freq <value>: specify the number of times a kmer must occur to be considered abundant (default 20,000).\n"
         << " --min-lim <value>: specify number of rare kmers occurring in a row required to discard a read (default 80% of the k-mer length).\n"
         << " --max-lim <value>: specify number of abundant kmers required to discard a read (default 80% of the k-mers in a read).\n\n"
         << " Normalize data:\n"
         << " --normalize <depth>: normalize read depth according to k-mer coverage.\n\n"
         << " Characterizing K-mers:\n"
         << " --write-k-freq: write kmers along with their frequency of occurrence and exit.\n"
         << " --k-dist: print k-mer frequency distribution and exit.\n\n"
         << " Advanced input options:\n"
         << " --read-k-freq <path>: read a set of kmers along with their frequencies of occurrence instead of reading raw input files.\n"
         << "\n";

    exit(1);
}
1839