1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // kmer_filter --
23 //
24 
25 #include "kmer_filter.h"
26 
27 //
28 // Global variables to hold command-line options.
29 //
30 FileT  in_file_type  = FileT::unknown;
31 FileT  out_file_type = FileT::fastq;
32 vector<string> in_files;
33 vector<string> in_pair_files;
34 string in_path;
35 string out_path;
36 string k_freq_path;
37 bool   filter_mare_k     = false;
38 bool   filter_abundant_k = false;
39 bool   normalize         = false;
40 bool   discards     = false;
41 bool   write_k_freq = false;
42 bool   read_k_freq  = false;
43 bool   kmer_distr   = false;
44 bool   ill_barcode  = false;
45 uint   truncate_seq = 0;
46 int    transition_lim = 3;
47 int    normalize_lim  = 0;
48 int    kmer_len       = 15;
49 int    max_k_freq     = 20000;
50 double max_k_pct      = 0.80;
51 double min_k_pct      = 0.15;
52 int    min_lim        = 0;
53 int    max_lim        = 0;
54 int    num_threads    = 1;
55 
main(int argc,char * argv[])56 int main (int argc, char* argv[]) {
57     IF_NDEBUG_TRY
58 
59     parse_command_line(argc, argv);
60 
61     if (min_lim == 0)
62         min_lim = (int) round((double) kmer_len * 0.80);
63 
64     cerr << "Using a kmer size of " << kmer_len << "\n";
65     if (filter_mare_k) {
66         cerr << "Filtering out reads by identifying rare kmers: On.\n"
67              << "  A kmer is considered rare when its coverage is at " << min_k_pct * 100 << "% or below the median kmer coverage for the read.\n"
68              << "  A read is dropped when it contains " << min_lim << " or more rare kmers in a row.\n";
69     } else
70         cerr << "Filtering out reads by identifying rare kmers: Off.\n";
71 
72     if (filter_abundant_k) {
73         cerr << "Filtering out reads by identifying abundant kmers: On.\n"
74              << "  Kmer is considered abundant when it occurs " << max_k_freq << " or more times.\n";
75         if (max_lim == 0)
76             cerr << "  A read is dropped when it contains " << max_k_pct * 100 << "% or more abundant kmers.\n";
77         else
78             cerr << "  A read is dropped when it contains " << max_lim << " or more abundant kmers.\n";
79     } else
80         cerr << "Filtering out reads by identifying abundant kmers: Off.\n";
81 
82     if (normalize)
83         cerr << "Normalizing read depth: On.\n"
84              << "  Read depth limit: " << normalize_lim << "x\n";
85     else
86         cerr << "Normalizing read depth: Off.\n";
87 
88     vector<pair<string, string> >   files, pair_files;
89     map<string, map<string, long> > counters;
90     SeqKmerHash    kmers;
91     vector<char *> kmers_keys;
92 
93     build_file_list(in_files, files);
94     cerr << "Found " << files.size() << " input file(s).\n";
95 
96     build_file_list(in_pair_files, pair_files);
97     cerr << "Found " << pair_files.size() << " paired input file(s).\n";
98 
99     if (filter_mare_k || filter_abundant_k || kmer_distr || write_k_freq) {
100         cerr << "Generating kmer distribution...\n";
101 
102         if (read_k_freq)
103             read_kmer_freq(k_freq_path, kmers, kmers_keys);
104         else
105             populate_kmers(pair_files, files, kmers, kmers_keys);
106 
107         // double kmer_med, kmer_mad;
108         // calc_kmer_median(kmers, kmer_med, kmer_mad);
109         // cerr << "Median kmer frequency: " << kmer_med << "; median absolute deviation: " << kmer_mad << "\n";
110 
111         if (kmer_distr) {
112             generate_kmer_dist(kmers);
113             if (write_k_freq == false)
114                 exit(1);
115         }
116 
117         if (write_k_freq) {
118             write_kmer_freq(k_freq_path, kmers);
119             exit(1);
120         }
121 
122         cerr << "Filtering reads by kmer frequency...\n";
123 
124         for (uint i = 0; i < pair_files.size(); i += 2) {
125             cerr << "Processing paired file " << i+1 << " of " << (pair_files.size() / 2) << " [" << pair_files[i].second << "]\n";
126 
127             counters[pair_files[i].second]["total"]      = 0;
128             counters[pair_files[i].second]["retained"]   = 0;
129             counters[pair_files[i].second]["rare_k"]     = 0;
130             counters[pair_files[i].second]["abundant_k"] = 0;
131 
132             process_paired_reads(pair_files[i].first,
133                                  pair_files[i].second,
134                                  pair_files[i+1].first,
135                                  pair_files[i+1].second,
136                                  kmers,
137                                  counters[pair_files[i].second]);
138 
139             cerr << "  "
140                  << counters[pair_files[i].second]["total"] << " total reads; "
141                  << "-" << counters[pair_files[i].second]["rare_k"] << " rare k-mer reads; "
142                  << "-" << counters[pair_files[i].second]["abundant_k"] << " abundant k-mer reads; "
143                  << counters[pair_files[i].second]["retained"] << " retained reads.\n";
144         }
145 
146         for (uint i = 0; i < files.size(); i++) {
147             cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].second << "]\n";
148 
149             counters[files[i].second]["total"]      = 0;
150             counters[files[i].second]["retained"]   = 0;
151             counters[files[i].second]["rare_k"]     = 0;
152             counters[files[i].second]["abundant_k"] = 0;
153 
154             process_reads(files[i].first,
155                           files[i].second,
156                           kmers,
157                           counters[files[i].second]);
158 
159             cerr << "  "
160                  << counters[files[i].second]["total"] << " total reads; "
161                  << "-" << counters[files[i].second]["rare_k"] << " rare k-mer reads; "
162                  << "-" << counters[files[i].second]["abundant_k"] << " abundant k-mer reads; "
163                  << counters[files[i].second]["retained"] << " retained reads.\n";
164         }
165 
166         free_kmer_hash(kmers, kmers_keys);
167 
168         print_results(counters);
169     }
170 
171     if (normalize) {
172         cerr << "Normalizing read depth...\n";
173 
174         //
175         // Add the remainder files from the previous step to the queue.
176         //
177         if (filter_mare_k || filter_abundant_k) {
178             string file;
179             int    pos;
180             for (uint i = 0; i < pair_files.size(); i += 2) {
181                 file  = pair_files[i].second;
182                 pos   = file.find_last_of(".");
183                 if (file.substr(pos - 2, 2) == ".1")
184                     pos -= 2;
185                 file  = file.substr(0, pos) + ".rem.fil";
186                 file += out_file_type == FileT::fastq ? ".fq" : ".fa";
187                 cerr << "Adding remainder file generated in previous step to queue, '" << file << "\n";
188                 files.push_back(make_pair(pair_files[i].first, file));
189             }
190         }
191 
192         for (uint i = 0; i < pair_files.size(); i += 2) {
193             cerr << "Processing paired files " << i+1 << " of " << (pair_files.size() / 2)
194                  << " [" << pair_files[i].second << " / " << pair_files[i+1].second << "]\n";
195 
196             counters[pair_files[i].second]["total"]    = 0;
197             counters[pair_files[i].second]["retained"] = 0;
198             counters[pair_files[i].second]["overep"]   = 0;
199 
200             normalize_paired_reads(pair_files[i].first,
201                                    pair_files[i].second,
202                                    pair_files[i+1].first,
203                                    pair_files[i+1].second,
204                                    kmers, kmers_keys,
205                                    counters[pair_files[i].second]);
206 
207             cerr << "  "
208                  << counters[pair_files[i].second]["total"] << " total reads; "
209                  << "-" << counters[pair_files[i].second]["overep"] << " over-represented reads; "
210                  << counters[pair_files[i].second]["retained"] << " retained reads.\n";
211         }
212 
213         for (uint i = 0; i < files.size(); i++) {
214             cerr << "Processing file " << i+1 << " of " << files.size() << " [" << files[i].second << "]\n";
215 
216             counters[files[i].second]["total"]    = 0;
217             counters[files[i].second]["retained"] = 0;
218             counters[files[i].second]["overep"]   = 0;
219 
220             normalize_reads(files[i].first,
221                             files[i].second,
222                             kmers, kmers_keys,
223                             counters[files[i].second]);
224 
225             cerr << "  "
226                  << counters[files[i].second]["total"] << " total reads; "
227                  << "-" << counters[files[i].second]["overep"] << " over-represented reads; "
228                  << counters[files[i].second]["retained"] << " retained reads.\n";
229         }
230 
231         free_kmer_hash(kmers, kmers_keys);
232     }
233 
234     return 0;
235     IF_NDEBUG_CATCH_ALL_EXCEPTIONS
236 }
237 
process_paired_reads(string in_path_1,string in_file_1,string in_path_2,string in_file_2,SeqKmerHash & kmers,map<string,long> & counter)238 int process_paired_reads(string in_path_1,
239                          string in_file_1,
240                          string in_path_2,
241                          string in_file_2,
242                          SeqKmerHash &kmers,
243                          map<string, long> &counter) {
244     Input    *fh_1=NULL, *fh_2=NULL;
245     ofstream *discard_fh_1=NULL, *discard_fh_2=NULL;
246     int       pos;
247     string    path;
248 
249     string path_1 = in_path_1 + in_file_1;
250     string path_2 = in_path_2 + in_file_2;
251 
252     if (in_file_type == FileT::fastq) {
253         fh_1 = new Fastq(path_1);
254         fh_2 = new Fastq(path_2);
255     } else if (in_file_type == FileT::fasta) {
256         fh_1 = new Fasta(path_1);
257         fh_2 = new Fasta(path_2);
258     } else if (in_file_type == FileT::gzfasta) {
259         fh_1 = new GzFasta(path_1 + ".gz");
260         fh_2 = new GzFasta(path_2 + ".gz");
261     } else if (in_file_type == FileT::gzfastq) {
262         fh_1 = new GzFastq(path_1 + ".gz");
263         fh_2 = new GzFastq(path_2 + ".gz");
264     } else if (in_file_type == FileT::bustard) {
265         fh_1 = new Bustard(path_1);
266         fh_2 = new Bustard(path_2);
267     }
268 
269     //
270     // Open the output files.
271     //
272     pos  = in_file_1.find_last_of(".");
273     path = out_path + in_file_1.substr(0, pos) + ".fil" + in_file_1.substr(pos);
274     ofstream *ofh_1 = new ofstream(path.c_str(), ifstream::out);
275     if (ofh_1->fail()) {
276         cerr << "Error opening filtered output file '" << path << "'\n";
277         exit(1);
278     }
279 
280     pos  = in_file_2.find_last_of(".");
281     path = out_path + in_file_2.substr(0, pos) + ".fil" + in_file_2.substr(pos);
282     ofstream *ofh_2  = new ofstream(path.c_str(), ifstream::out);
283     if (ofh_2->fail()) {
284         cerr << "Error opening filtered paired output file '" << path << "'\n";
285         exit(1);
286     }
287 
288     pos  = in_file_2.find_last_of(".");
289     //
290     // Pull the ".2" suffix off the paired file name to make the remainder file name.
291     //
292     if (in_file_2.substr(pos - 2, 2) == ".2")
293         pos -= 2;
294     path  = out_path + in_file_2.substr(0, pos) + ".rem.fil";
295     path += out_file_type == FileT::fastq ? ".fq" : ".fa";
296     ofstream *rem_fh = new ofstream(path.c_str(), ifstream::out);
297     if (rem_fh->fail()) {
298         cerr << "Error opening filtered remainder output file '" << path << "'\n";
299         exit(1);
300     }
301 
302     //
303     // Open a file for recording discarded reads
304     //
305     if (discards) {
306         pos  = in_file_1.find_last_of(".");
307         path = out_path + in_file_1.substr(0, pos) + ".discards" + in_file_1.substr(pos);
308         discard_fh_1 = new ofstream(path.c_str(), ifstream::out);
309 
310         if (discard_fh_1->fail()) {
311             cerr << "Error opening discard output file '" << path << "'\n";
312             exit(1);
313         }
314         pos  = in_file_2.find_last_of(".");
315         path = out_path + in_file_2.substr(0, pos) + ".discards" + in_file_2.substr(pos);
316         discard_fh_2 = new ofstream(path.c_str(), ifstream::out);
317 
318         if (discard_fh_2->fail()) {
319             cerr << "Error opening discard output file '" << path << "'\n";
320             exit(1);
321         }
322     }
323 
324     //
325     // Read in the first record, initializing the Seq object s. Then
326     // initialize the Read object r, then loop, using the same objects.
327     //
328     Seq *s_1 = fh_1->next_seq();
329     Seq *s_2 = fh_2->next_seq();
330     if (s_1 == NULL || s_2 == NULL) {
331         cerr << "Unable to allocate Seq object.\n";
332         exit(1);
333     }
334 
335     int   rare_k, abundant_k, num_kmers, max_kmer_lim;
336     bool  retain_1, retain_2;
337     char *kmer = new char[kmer_len + 1];
338 
339     long i = 1;
340     do {
341         if (i % 10000 == 0) cerr << "  Processing short read pair " << i << "       \r";
342         counter["total"] += 2;
343         stringstream msg_1, msg_2;
344 
345         retain_1       = true;
346         retain_2       = true;
347         num_kmers      = strlen(s_1->seq) - kmer_len + 1;
348         max_kmer_lim   = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;
349 
350         //
351         // Drop the first sequence if it has too many rare or abundant kmers.
352         //
353         kmer_lookup(kmers, s_1->seq, kmer, num_kmers, rare_k, abundant_k);
354 
355         if (filter_mare_k && rare_k > 0) {
356             counter["rare_k"]++;
357             retain_1 = false;
358             msg_1 << "rare_k_" << rare_k;
359         }
360 
361         if (retain_1 && filter_abundant_k && abundant_k > max_kmer_lim) {
362             counter["abundant_k"]++;
363             retain_1 = false;
364             msg_1 << "abundant_k_" << abundant_k;
365         }
366 
367         rare_k       = 0;
368         abundant_k   = 0;
369         num_kmers    = strlen(s_2->seq) - kmer_len + 1;
370         max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;
371 
372         //
373         // Drop the second sequence if it has too many rare or abundant kmers.
374         //
375         kmer_lookup(kmers, s_2->seq, kmer, num_kmers, rare_k, abundant_k);
376 
377         if (filter_mare_k && rare_k > 0) {
378             counter["rare_k"]++;
379             retain_2 = false;
380             msg_2 << "rare_k_" << rare_k;
381         }
382 
383         if (retain_2 && filter_abundant_k && abundant_k > max_kmer_lim) {
384             counter["abundant_k"]++;
385             retain_2 = false;
386             msg_2 << "abundant_k_" << abundant_k;
387         }
388 
389         if (retain_1 && retain_2) {
390             counter["retained"] += 2;
391             out_file_type == FileT::fastq ?
392                  write_fastq(ofh_1, s_1) : write_fasta(ofh_1, s_1);
393             out_file_type == FileT::fastq ?
394                  write_fastq(ofh_2, s_2) : write_fasta(ofh_2, s_2);
395         }
396 
397         if (retain_1 && !retain_2) {
398             counter["retained"]++;
399             out_file_type == FileT::fastq ?
400                  write_fastq(rem_fh, s_1) : write_fasta(rem_fh, s_1);
401         }
402 
403         if (!retain_1 && retain_2) {
404             counter["retained"]++;
405             out_file_type == FileT::fastq ?
406                  write_fastq(rem_fh, s_2) : write_fasta(rem_fh, s_2);
407         }
408 
409         if (discards && !retain_1)
410             out_file_type == FileT::fastq ?
411                 write_fastq(discard_fh_1, s_1, msg_1.str()) : write_fasta(discard_fh_1, s_1, msg_1.str());
412         if (discards && !retain_2)
413             out_file_type == FileT::fastq ?
414                 write_fastq(discard_fh_2, s_2, msg_2.str()) : write_fasta(discard_fh_2, s_2, msg_2.str());
415 
416         delete s_1;
417         delete s_2;
418 
419         i++;
420     } while ((s_1 = fh_1->next_seq()) != NULL &&
421              (s_2 = fh_2->next_seq()) != NULL);
422 
423     delete [] kmer;
424 
425     if (discards) {
426         delete discard_fh_1;
427         delete discard_fh_2;
428     }
429 
430     //
431     // Close the file and delete the Input object.
432     //
433     delete fh_1;
434     delete fh_2;
435     delete ofh_1;
436     delete ofh_2;
437     delete rem_fh;
438 
439     return 0;
440 }
441 
process_reads(string in_path,string in_file,SeqKmerHash & kmers,map<string,long> & counter)442 int process_reads(string in_path,
443                   string in_file,
444                   SeqKmerHash &kmers,
445                   map<string, long> &counter) {
446     Input    *fh=NULL;
447     ofstream *discard_fh=NULL;
448     int       pos;
449 
450     string path = in_path + in_file;
451 
452     if (in_file_type == FileT::fastq)
453         fh = new Fastq(path);
454     else if (in_file_type == FileT::fasta)
455         fh = new Fasta(path);
456     else if (in_file_type == FileT::gzfastq)
457         fh = new GzFastq(path + ".gz");
458     else if (in_file_type == FileT::gzfasta)
459         fh = new GzFasta(path + ".gz");
460     else if (in_file_type == FileT::bustard)
461         fh = new Bustard(path);
462 
463     //
464     // Open the output file.
465     //
466     pos  = in_file.find_last_of(".");
467     path = out_path + in_file.substr(0, pos) + ".fil" + in_file.substr(pos);
468     ofstream *out_fh = new ofstream(path.c_str(), ifstream::out);
469 
470     if (out_fh->fail()) {
471         cerr << "Error opening output file '" << path << "'\n";
472         exit(1);
473     }
474 
475     //
476     // Open a file for recording discarded reads
477     //
478     if (discards) {
479         pos  = in_file.find_last_of(".");
480         path = out_path + in_file.substr(0, pos) + ".discards" + in_file.substr(pos);
481         discard_fh = new ofstream(path.c_str(), ifstream::out);
482 
483         if (discard_fh->fail()) {
484             cerr << "Error opening discard output file '" << path << "'\n";
485             exit(1);
486         }
487     }
488 
489     //
490     // Read in the first record, initializing the Seq object s. Then
491     // initialize the Read object r, then loop, using the same objects.
492     //
493     Seq *s = fh->next_seq();
494     if (s == NULL) {
495         cerr << "Unable to allocate Seq object.\n";
496         exit(1);
497     }
498 
499     int   rare_k, abundant_k, num_kmers, max_kmer_lim;
500     bool  retain;
501     char *kmer  = new char[kmer_len + 1];
502     long  i     = 1;
503 
504     do {
505         if (i % 10000 == 0) cerr << "  Processing short read " << i << "       \r";
506         counter["total"]++;
507         stringstream msg;
508 
509         //
510         // Drop this sequence if it has too many rare or abundant kmers.
511         //
512         retain       = true;
513         num_kmers    = strlen(s->seq) - kmer_len + 1;
514         max_kmer_lim = max_lim == 0 ? (int) round((double) num_kmers * max_k_pct) : max_lim;
515 
516         kmer_lookup(kmers, s->seq, kmer, num_kmers, rare_k, abundant_k);
517 
518         if (filter_mare_k && rare_k > 0) {
519             counter["rare_k"]++;
520             retain = false;
521             msg << "rare_k_" << rare_k;
522         }
523 
524         if (retain && filter_abundant_k && abundant_k > max_kmer_lim) {
525             counter["abundant_k"]++;
526             retain = false;
527             msg << "abundant_k_" << abundant_k;
528         }
529 
530         if (retain) {
531             counter["retained"]++;
532             out_file_type == FileT::fastq ?
533                  write_fastq(out_fh, s) : write_fasta(out_fh, s);
534         }
535 
536         if (discards && !retain)
537             out_file_type == FileT::fastq ?
538                 write_fastq(discard_fh, s, msg.str()) : write_fasta(discard_fh, s, msg.str());
539 
540         delete s;
541 
542         i++;
543     } while ((s = fh->next_seq()) != NULL);
544 
545     delete [] kmer;
546 
547     if (discards) delete discard_fh;
548 
549     //
550     // Close the file and delete the Input object.
551     //
552     delete fh;
553     delete out_fh;
554 
555     return 0;
556 }
557 
558 int
normalize_paired_reads(string in_path_1,string in_file_1,string in_path_2,string in_file_2,SeqKmerHash & kmers,vector<char * > & kmer_keys,map<string,long> & counter)559 normalize_paired_reads(string in_path_1,
560                        string in_file_1,
561                        string in_path_2,
562                        string in_file_2,
563                        SeqKmerHash &kmers, vector<char *> &kmer_keys,
564                        map<string, long> &counter)
565 {
566     Input    *fh_1=NULL, *fh_2=NULL;
567     ofstream *discard_fh_1=NULL, *discard_fh_2=NULL;
568     ofstream *ofh_1=NULL, *ofh_2=NULL, *rem_fh=NULL;
569     string    path_1, path_2;
570     int       pos;
571 
572     if (filter_abundant_k || filter_mare_k) {
573         //
574         // If we already filtered the data, open the files we created in the output
575         // directory to normalize.
576         //
577         pos    = in_file_1.find_last_of(".");
578         path_1 = out_path + in_file_1.substr(0, pos) + ".fil" + in_file_1.substr(pos);
579 
580         pos    = in_file_2.find_last_of(".");
581         path_2 = out_path + in_file_2.substr(0, pos) + ".fil" + in_file_2.substr(pos);
582 
583         if (in_file_type == FileT::fastq) {
584             fh_1 = new Fastq(path_1);
585             fh_2 = new Fastq(path_2);
586         } else if (in_file_type == FileT::gzfastq) {
587             fh_1 = new Fastq(path_1);
588             fh_2 = new Fastq(path_2);
589         } else if (in_file_type == FileT::fasta) {
590             fh_1 = new Fasta(path_1);
591             fh_2 = new Fasta(path_2);
592         } else if (in_file_type == FileT::gzfasta) {
593             fh_1 = new Fasta(path_1);
594             fh_2 = new Fasta(path_2);
595         }
596     } else {
597         //
598         // Otherwise, open unmodified files.
599         //
600         path_1 = in_path_1 + in_file_1;
601         path_2 = in_path_2 + in_file_2;
602 
603         if (in_file_type == FileT::fastq) {
604             fh_1 = new Fastq(path_1);
605             fh_2 = new Fastq(path_2);
606         } else if (in_file_type == FileT::gzfastq) {
607             fh_1 = new GzFastq(path_1 + ".gz");
608             fh_2 = new GzFastq(path_2 + ".gz");
609         } else if (in_file_type == FileT::fasta) {
610             fh_1 = new Fasta(path_1);
611             fh_2 = new Fasta(path_2);
612         } else if (in_file_type == FileT::gzfasta) {
613             fh_1 = new GzFasta(path_1 + ".gz");
614             fh_2 = new GzFasta(path_2 + ".gz");
615         } else if (in_file_type == FileT::bustard) {
616             fh_1 = new Bustard(path_1);
617             fh_2 = new Bustard(path_2);
618         }
619     }
620 
621     //
622     // Open the output files.
623     //
624     if (filter_abundant_k || filter_mare_k) {
625         pos    = in_file_1.find_last_of(".");
626         path_1 = out_path + in_file_1.substr(0, pos) + ".fil.norm" + in_file_1.substr(pos);
627         ofh_1  = new ofstream(path_1.c_str(), ifstream::out);
628 
629         if (ofh_1->fail()) {
630             cerr << "Error opening normalized output file '" << path_1 << "'\n";
631             exit(1);
632         }
633 
634         pos    = in_file_2.find_last_of(".");
635         path_2 = out_path + in_file_2.substr(0, pos) + ".fil.norm" + in_file_2.substr(pos);
636         ofh_2  = new ofstream(path_2.c_str(), ifstream::out);
637 
638         if (ofh_2->fail()) {
639             cerr << "Error opening normalized paired output file '" << path_2 << "'\n";
640             exit(1);
641         }
642 
643         if (in_file_2.substr(pos - 2, 2) == ".2")
644             pos -= 2;
645         path_2  = out_path + in_file_2.substr(0, pos) + ".fil.norm.rem";
646         path_2 += out_file_type == FileT::fastq ? ".fq" : ".fa";
647         rem_fh  = new ofstream(path_2.c_str(), ifstream::out);
648 
649         if (rem_fh->fail()) {
650             cerr << "Error opening normalized remainder output file '" << path_2 << "'\n";
651             exit(1);
652         }
653 
654     } else {
655         pos    = in_file_1.find_last_of(".");
656         path_1 = out_path + in_file_1.substr(0, pos) + ".norm" + in_file_1.substr(pos);
657         ofh_1  = new ofstream(path_1.c_str(), ifstream::out);
658 
659         if (ofh_1->fail()) {
660             cerr << "Error opening normalized output file '" << path_1 << "'\n";
661             exit(1);
662         }
663 
664         pos    = in_file_2.find_last_of(".");
665         path_2 = out_path + in_file_2.substr(0, pos) + ".norm" + in_file_2.substr(pos);
666         ofh_2  = new ofstream(path_2.c_str(), ifstream::out);
667 
668         if (ofh_2->fail()) {
669             cerr << "Error opening normalized paired output file '" << path_2 << "'\n";
670             exit(1);
671         }
672 
673         if (in_file_2.substr(pos - 2, 2) == ".2")
674             pos -= 2;
675         path_2  = out_path + in_file_2.substr(0, pos) + ".norm.rem";
676         path_2 += out_file_type == FileT::fastq ? ".fq" : ".fa";
677         rem_fh  = new ofstream(path_2.c_str(), ifstream::out);
678 
679         if (rem_fh->fail()) {
680             cerr << "Error opening normalized remainder output file '" << path_2 << "'\n";
681             exit(1);
682         }
683     }
684 
685     //
686     // Open a file for recording discarded reads
687     //
688     if (discards) {
689         pos = in_file_1.find_last_of(".");
690         if (filter_abundant_k || filter_mare_k)
691             path_1 = out_path + in_file_1.substr(0, pos) + ".fil.discards" + in_file_1.substr(pos);
692         else
693             path_1 = out_path + in_file_1.substr(0, pos) + ".discards" + in_file_1.substr(pos);
694         discard_fh_1 = new ofstream(path_1.c_str(), ifstream::out);
695 
696         if (discard_fh_1->fail()) {
697             cerr << "Error opening discard output file '" << path_1 << "'\n";
698             exit(1);
699         }
700 
701         pos = in_file_2.find_last_of(".");
702         if (filter_abundant_k || filter_mare_k)
703             path_2 = out_path + in_file_2.substr(0, pos) + ".fil.discards" + in_file_2.substr(pos);
704         else
705             path_2 = out_path + in_file_2.substr(0, pos) + ".discards" + in_file_2.substr(pos);
706         discard_fh_2 = new ofstream(path_2.c_str(), ifstream::out);
707 
708         if (discard_fh_2->fail()) {
709             cerr << "Error opening discard output file '" << path_1 << "'\n";
710             exit(1);
711         }
712     }
713 
714     //
715     // Read in the first record, initializing the Seq object s. Then
716     // initialize the Read object r, then loop, using the same objects.
717     //
718     Seq *s_1 = fh_1->next_seq();
719     Seq *s_2 = fh_2->next_seq();
720     if (s_1 == NULL || s_2 == NULL) {
721         cerr << "Unable to allocate Seq object.\n";
722         exit(1);
723     }
724 
725     int   num_kmers;
726     bool  retain_1, retain_2;
727     char *kmer = new char[kmer_len + 1];
728 
729     long i = 1;
730     do {
731         if (i % 10000 == 0) cerr << "  Processing short read pair " << i << "       \r";
732         counter["total"] += 2;
733 
734         retain_1  = true;
735         retain_2  = true;
736         num_kmers = strlen(s_1->seq) - kmer_len + 1;
737 
738         //
739         // Drop the first sequence if it has too many rare or abundant kmers.
740         //
741         retain_1 = normalize_kmer_lookup(kmers, s_1->seq, kmer, num_kmers, kmer_keys);
742 
743         num_kmers = strlen(s_2->seq) - kmer_len + 1;
744 
745         //
746         // Drop the second sequence if it has too many rare or abundant kmers.
747         //
748         retain_2 = normalize_kmer_lookup(kmers, s_2->seq, kmer, num_kmers, kmer_keys);
749 
750         if (retain_1 && retain_2) {
751             counter["retained"] += 2;
752             out_file_type == FileT::fastq ?
753                  write_fastq(ofh_1, s_1) : write_fasta(ofh_1, s_1);
754             out_file_type == FileT::fastq ?
755                  write_fastq(ofh_2, s_2) : write_fasta(ofh_2, s_2);
756         } else {
757             counter["overep"] +=2;
758         }
759 
760         if (retain_1 && !retain_2) {
761             counter["retained"]++;
762             counter["overep"]++;
763             out_file_type == FileT::fastq ?
764                  write_fastq(rem_fh, s_1) : write_fasta(rem_fh, s_1);
765         }
766 
767         if (!retain_1 && retain_2) {
768             counter["retained"]++;
769             counter["overep"]++;
770             out_file_type == FileT::fastq ?
771                  write_fastq(rem_fh, s_2) : write_fasta(rem_fh, s_2);
772         }
773 
774         if (discards && !retain_1)
775             out_file_type == FileT::fastq ?
776                 write_fastq(discard_fh_1, s_1) : write_fasta(discard_fh_1, s_1);
777         if (discards && !retain_2)
778             out_file_type == FileT::fastq ?
779                 write_fastq(discard_fh_2, s_2) : write_fasta(discard_fh_2, s_2);
780 
781         delete s_1;
782         delete s_2;
783 
784         i++;
785     } while ((s_1 = fh_1->next_seq()) != NULL &&
786              (s_2 = fh_2->next_seq()) != NULL);
787 
788     delete [] kmer;
789 
790     if (discards) {
791         delete discard_fh_1;
792         delete discard_fh_2;
793     }
794 
795     //
796     // Close the file and delete the Input object.
797     //
798     delete fh_1;
799     delete fh_2;
800     delete ofh_1;
801     delete ofh_2;
802     delete rem_fh;
803 
804     return 0;
805 }
806 
807 int
normalize_reads(string in_path,string in_file,SeqKmerHash & kmers,vector<char * > & kmer_keys,map<string,long> & counter)808 normalize_reads(string in_path,
809                 string in_file,
810                 SeqKmerHash &kmers, vector<char *> &kmer_keys,
811                 map<string, long> &counter)
812 {
813     Input    *fh=NULL;
814     ofstream *discard_fh=NULL;
815     string    path;
816 
817     int pos = in_file.find_last_of(".");
818 
819     if (filter_abundant_k || filter_mare_k) {
820         if (in_file.substr(pos - 4, 4) == ".fil")
821             path = out_path + in_file;
822         else
823             path = out_path + in_file.substr(0, pos) + ".fil" + in_file.substr(pos);
824 
825         if (in_file_type == FileT::fastq)
826             fh = new Fastq(path);
827         else if (in_file_type == FileT::gzfastq)
828             fh = new Fastq(path);
829         else if (in_file_type == FileT::fasta)
830             fh = new Fasta(path);
831         else if (in_file_type == FileT::gzfasta)
832             fh = new Fasta(path);
833         else if (in_file_type == FileT::bustard)
834             fh = new Bustard(path);
835 
836     } else {
837         path = in_path + in_file;
838 
839         if (in_file_type == FileT::fastq)
840             fh = new Fastq(path);
841         else if (in_file_type == FileT::gzfastq)
842             fh = new GzFastq(path + ".gz");
843         else if (in_file_type == FileT::fasta)
844             fh = new Fasta(path);
845         else if (in_file_type == FileT::gzfasta)
846             fh = new GzFasta(path + ".gz");
847         else if (in_file_type == FileT::bustard)
848             fh = new Bustard(path);
849     }
850 
851     //
852     // Open the output file.
853     //
854     // if (filter_abundant_k || filter_mare_k) {
855     //         path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
856     // } else {
857     //         path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
858     // }
859     path = out_path + in_file.substr(0, pos) + ".norm" + in_file.substr(pos);
860     ofstream *out_fh = new ofstream(path.c_str(), ifstream::out);
861 
862     if (out_fh->fail()) {
863         cerr << "Error opening normalized output file '" << path << "'\n";
864         exit(1);
865     }
866 
867     //
868     // Open a file for recording discarded reads
869     //
870     if (discards) {
871         if (filter_abundant_k || filter_mare_k)
872             path = out_path + in_file.substr(0, pos) + ".fil.discards" + in_file.substr(pos);
873         else
874             path = out_path + in_file.substr(0, pos) + ".discards" + in_file.substr(pos);
875         discard_fh = new ofstream(path.c_str(), ifstream::out);
876 
877         if (discard_fh->fail()) {
878             cerr << "Error opening discard output file '" << path << "'\n";
879             exit(1);
880         }
881     }
882 
883     //
884     // Read in the first record, initializing the Seq object s. Then
885     // initialize the Read object r, then loop, using the same objects.
886     //
887     Seq *s = fh->next_seq();
888     if (s == NULL) {
889         cerr << "Unable to allocate Seq object.\n";
890         exit(1);
891     }
892 
893     int   num_kmers;
894     bool  retain;
895     char *kmer  = new char[kmer_len + 1];
896     long  i     = 1;
897 
898     do {
899         if (i % 10000 == 0) cerr << "  Processing short read " << i << "       \r";
900         counter["total"]++;
901 
902         //
903         // Drop this sequence if it has too many rare or abundant kmers.
904         //
905         retain    = true;
906         num_kmers = strlen(s->seq) - kmer_len + 1;
907 
908         retain = normalize_kmer_lookup(kmers, s->seq, kmer, num_kmers, kmer_keys);
909 
910         if (retain) {
911             counter["retained"]++;
912             out_file_type == FileT::fastq ?
913                  write_fastq(out_fh, s) : write_fasta(out_fh, s);
914         } else {
915             counter["overep"]++;
916         }
917 
918         if (discards && !retain)
919             out_file_type == FileT::fastq ?
920                 write_fastq(discard_fh, s) : write_fasta(discard_fh, s);
921 
922         delete s;
923 
924         i++;
925     } while ((s = fh->next_seq()) != NULL);
926 
927     delete [] kmer;
928 
929     if (discards) delete discard_fh;
930 
931     //
932     // Close the file and delete the Input object.
933     //
934     delete fh;
935     delete out_fh;
936 
937     return 0;
938 }
939 
940 int
populate_kmers(vector<pair<string,string>> & pair_files,vector<pair<string,string>> & files,SeqKmerHash & kmers,vector<char * > & kmers_keys)941 populate_kmers(vector<pair<string, string> > &pair_files,
942                vector<pair<string, string> > &files,
943                SeqKmerHash &kmers,
944                vector<char *> &kmers_keys)
945 {
946     //
947     // Break each read down into k-mers and create a hash map of those k-mers
948     // recording in which sequences they occur.
949     //
950     uint j   = 1;
951     uint cnt = files.size() + pair_files.size();
952     for (uint i = 0; i < files.size(); i++) {
953         cerr << "Generating kmers from file " << j << " of " << cnt << " [" << files[i].second << "]\n";
954         process_file_kmers(files[i].first + files[i].second, kmers, kmers_keys);
955         j++;
956     }
957 
958     for (uint i = 0; i < pair_files.size(); i++) {
959         cerr << "Generating kmers from file " << j << " of " << cnt << " [" << pair_files[i].second << "]\n";
960         process_file_kmers(pair_files[i].first + pair_files[i].second, kmers, kmers_keys);
961         j++;
962     }
963 
964     cerr << kmers.size() << " unique k-mers recorded.\n";
965 
966     return 0;
967 }
968 
969 int
read_kmer_freq(string in_path,SeqKmerHash & kmer_map,vector<char * > & kmer_map_keys)970 read_kmer_freq(string in_path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_keys)
971 {
972     cerr << "Reading kmer frequencies from '" << in_path.c_str() << "'...\n";
973 
974     ifstream fh(in_path.c_str(), ifstream::in);
975     if (fh.fail()) {
976         cerr << "Error opening rare kmer frequency input file '" << in_path << "'\n";
977         exit(1);
978     }
979 
980     char *hash_key;
981     bool  exists;
982     int   len, cnt;
983     char  kmer[id_len];
984     char  line[max_len];
985     vector<string> parts;
986 
987     long i = 1;
988 
989     while (fh.good()) {
990         if (i % 10000 == 0) cerr << "  Processing kmer " << i << "       \r";
991 
992         fh.getline(line, max_len);
993 
994         len = strlen(line);
995         if (len == 0) continue;
996 
997         //
998         // Check that there is no carraige return in the buffer.
999         //
1000         if (line[len - 1] == '\r') line[len - 1] = '\0';
1001 
1002         //
1003         // Ignore comments
1004         //
1005         if (line[0] == '#') continue;
1006 
1007         //
1008         // Parse the kmer and the number of times it occurred
1009         // <kmer> <tab> <integer>
1010         //
1011         parse_tsv(line, parts);
1012 
1013         if (parts.size() != 2) {
1014             cerr << "kmer frequencies are not formated correctly: expecting two, tab separated columns, found " << parts.size() << ".\n";
1015             exit(1);
1016         }
1017 
1018         strcpy(kmer, parts[1].c_str());
1019         cnt = is_integer(kmer);
1020         if (cnt < 0) {
1021             cerr << "Non integer found in second column.\n";
1022             exit(1);
1023         }
1024 
1025         strcpy(kmer, parts[0].c_str());
1026         exists = kmer_map.count(kmer) == 0 ? false : true;
1027 
1028         if (exists) {
1029             cerr << "Warning: kmer '" << kmer << "' already exists in the kmer hash map.\n";
1030             hash_key = kmer;
1031             kmer_map[hash_key] += cnt;
1032         } else {
1033             hash_key = new char [strlen(kmer) + 1];
1034             strcpy(hash_key, kmer);
1035             kmer_map_keys.push_back(hash_key);
1036             kmer_map[hash_key]  = cnt;
1037         }
1038         i++;
1039     }
1040 
1041     fh.close();
1042 
1043     cerr << kmer_map.size() << " unique k-mers read.\n";
1044 
1045     kmer_len = strlen(kmer_map.begin()->first);
1046     cerr << "Setting kmer length to " << kmer_len << "bp.\n";
1047 
1048     return 0;
1049 }
1050 
1051 int
write_kmer_freq(string path,SeqKmerHash & kmer_map)1052 write_kmer_freq(string path, SeqKmerHash &kmer_map)
1053 {
1054     cerr << "Writing kmer frequencies to '" << path.c_str() << "'...";
1055 
1056     ofstream out_fh(path.c_str(), ifstream::out);
1057     if (out_fh.fail()) {
1058         cerr << "Error opening rare kmer output file '" << path << "'\n";
1059         exit(1);
1060     }
1061 
1062     SeqKmerHash::iterator i;
1063 
1064     out_fh << "# Kmer\tCount\n";
1065 
1066     for (i = kmer_map.begin(); i != kmer_map.end(); i++) {
1067         out_fh << i->first << "\t" << i->second << "\n";
1068     }
1069 
1070     out_fh.close();
1071 
1072     cerr << "done.\n";
1073 
1074     return 0;
1075 }
1076 
1077 int
process_file_kmers(string path,SeqKmerHash & kmer_map,vector<char * > & kmer_map_keys)1078 process_file_kmers(string path, SeqKmerHash &kmer_map, vector<char *> &kmer_map_keys)
1079 {
1080     vector<char *> kmers;
1081     char          *hash_key;
1082     bool           exists;
1083     int            j;
1084     Input         *fh = NULL;
1085 
1086     if (in_file_type == FileT::fastq)
1087         fh = new Fastq(path);
1088     else if (in_file_type == FileT::gzfastq)
1089         fh = new GzFastq(path + ".gz");
1090     else if (in_file_type == FileT::fasta)
1091         fh = new Fasta(path);
1092     else if (in_file_type == FileT::gzfasta)
1093         fh = new GzFasta(path + ".gz");
1094     else if (in_file_type == FileT::bustard)
1095         fh = new Bustard(path.c_str());
1096 
1097     //
1098     // Read in the first record, initializing the Seq object s.
1099     //
1100     Seq *s = fh->next_seq();
1101     if (s == NULL) {
1102         cerr << "Unable to allocate Seq object.\n";
1103         exit(1);
1104     }
1105 
1106     int   num_kmers;
1107     char *kmer = new char [kmer_len + 1];
1108 
1109     long i = 1;
1110     do {
1111         if (i % 10000 == 0) cerr << "  Processing short read " << i << "       \r";
1112 
1113         num_kmers = strlen(s->seq) - kmer_len + 1;
1114 
1115         //
1116         // Generate and hash the kmers for this raw read
1117         //
1118         kmer[kmer_len] = '\0';
1119 
1120         for (j = 0; j < num_kmers; j++) {
1121             strncpy(kmer, s->seq + j, kmer_len);
1122 
1123             exists = kmer_map.count(kmer) == 0 ? false : true;
1124 
1125             if (exists) {
1126                     hash_key = kmer;
1127             } else {
1128                     hash_key = new char [kmer_len + 1];
1129                     strcpy(hash_key, kmer);
1130                 kmer_map_keys.push_back(hash_key);
1131             }
1132 
1133             kmer_map[hash_key]++;
1134         }
1135 
1136         delete s;
1137 
1138         i++;
1139     } while ((s = fh->next_seq()) != NULL);
1140 
1141     delete [] kmer;
1142 
1143     //
1144     // Close the file and delete the Input object.
1145     //
1146     delete fh;
1147 
1148     return 0;
1149 }
1150 
1151 int
generate_kmer_dist(SeqKmerHash & kmer_map)1152 generate_kmer_dist(SeqKmerHash &kmer_map)
1153 {
1154     SeqKmerHash::iterator i;
1155     map<uint, uint> bins;
1156 
1157     cerr << "Generating kmer distribution...\n";
1158 
1159     for (i = kmer_map.begin(); i != kmer_map.end(); i++)
1160         bins[i->second]++;
1161 
1162     map<uint, uint>::iterator j;
1163     vector<pair<uint, uint> > sorted_kmers;
1164 
1165     for (j = bins.begin(); j != bins.end(); j++)
1166         sorted_kmers.push_back(make_pair(j->first, j->second));
1167 
1168     cout << "KmerFrequency\tCount\n";
1169 
1170     for (unsigned long k = 0; k < sorted_kmers.size(); k++)
1171         cout << sorted_kmers[k].first << "\t" << sorted_kmers[k].second << "\n";
1172 
1173     return 0;
1174 }
1175 
1176 int
calc_kmer_median(SeqKmerHash & kmers,double & kmer_med,double & kmer_mad)1177 calc_kmer_median(SeqKmerHash &kmers, double &kmer_med, double &kmer_mad)
1178 {
1179     kmer_med = 0.0;
1180     kmer_mad = 0.0;
1181 
1182     int num_kmers = kmers.size();
1183     vector<int> freqs, residuals;
1184     freqs.reserve(num_kmers);
1185 
1186     SeqKmerHash::iterator i;
1187 
1188     for (i = kmers.begin(); i != kmers.end(); i++)
1189         freqs.push_back(i->second);
1190 
1191     sort(freqs.begin(), freqs.end());
1192 
1193     kmer_med = num_kmers % 2 == 0 ?
1194         (double) (freqs[num_kmers / 2 - 1] + freqs[num_kmers / 2]) / 2.0 :
1195         (double) freqs[num_kmers / 2 - 1];
1196 
1197     //
1198     // Calculate the median absolute deviation.
1199     //
1200     residuals.reserve(num_kmers);
1201 
1202     for (int j = 0; j < num_kmers; j++)
1203         residuals.push_back(abs(freqs[j] - (int) kmer_med));
1204     sort(residuals.begin(), residuals.end());
1205 
1206     kmer_mad = num_kmers % 2 == 0 ?
1207         (double) (residuals[num_kmers / 2 - 1] + residuals[num_kmers / 2]) / 2.0 :
1208         (double) residuals[num_kmers / 2 - 1];
1209 
1210     return 0;
1211 }
1212 
1213 int
kmer_map_cmp(pair<char *,long> a,pair<char *,long> b)1214 kmer_map_cmp(pair<char *, long> a, pair<char *, long> b)
1215 {
1216     return (a.second < b.second);
1217 }
1218 
1219 inline bool
normalize_kmer_lookup(SeqKmerHash & kmer_map,char * read,char * kmer,int num_kmers,vector<char * > & kmer_keys)1220 normalize_kmer_lookup(SeqKmerHash &kmer_map,
1221                       char *read, char *kmer,
1222                       int num_kmers,
1223                       vector<char *> &kmer_keys)
1224 {
1225     kmer[kmer_len] = '\0';
1226     int  cnt       = 0;
1227     bool retain    = true;
1228     //
1229     // Generate kmers from this read, increment kmer frequency in dataset.
1230     //
1231     vector<int> sorted_cnts;
1232     sorted_cnts.reserve(num_kmers);
1233 
1234     // cout << "# " << read << "\n";
1235 
1236     for (int j = 0; j < num_kmers; j++) {
1237         strncpy(kmer, read + j, kmer_len);
1238 
1239         cnt = kmer_map.count(kmer) > 0 ? kmer_map[kmer] : 0;
1240         sorted_cnts.push_back(cnt);
1241 
1242         // cout << kmer << "\t" << j << "\t" << cnt << "\n";
1243     }
1244 
1245     //
1246     // Calculate the median kmer frequency along the read.
1247     //
1248     sort(sorted_cnts.begin(), sorted_cnts.end());
1249     double median = num_kmers % 2 == 0 ?
1250         (double) (sorted_cnts[num_kmers / 2 - 1] + sorted_cnts[num_kmers / 2]) / 2.0 :
1251         (double) sorted_cnts[num_kmers / 2 - 1];
1252     // cout << "# median: " << median << "\n";
1253 
1254     if (median > normalize_lim)
1255         retain = false;
1256 
1257     //
1258     // Generate and hash the kmers for this raw read
1259     //
1260     bool  exists;
1261     char *hash_key;
1262     kmer[kmer_len] = '\0';
1263 
1264     for (int j = 0; j < num_kmers; j++) {
1265         strncpy(kmer, read + j, kmer_len);
1266 
1267         exists = kmer_map.count(kmer) == 0 ? false : true;
1268 
1269         if (exists) {
1270             hash_key = kmer;
1271         } else {
1272             hash_key = new char [kmer_len + 1];
1273             strcpy(hash_key, kmer);
1274             kmer_keys.push_back(hash_key);
1275         }
1276 
1277         kmer_map[hash_key]++;
1278     }
1279 
1280     return retain;
1281 }
1282 
1283 inline int
kmer_lookup(SeqKmerHash & kmer_map,char * read,char * kmer,int num_kmers,int & rare_k,int & abundant_k)1284 kmer_lookup(SeqKmerHash &kmer_map,
1285             char *read, char *kmer,
1286             int num_kmers,
1287             int &rare_k, int &abundant_k)
1288 {
1289     //
1290     // Generate kmers from this read, lookup kmer frequency in dataset.
1291     //
1292     rare_k         = 0;
1293     abundant_k     = 0;
1294     kmer[kmer_len] = '\0';
1295     int cnt        = 0;
1296 
1297     vector<int> cnts, sorted_cnts;
1298     cnts.reserve(num_kmers);
1299     sorted_cnts.reserve(num_kmers);
1300 
1301     // cout << "# " << read << "\n";
1302 
1303     for (int j = 0; j < num_kmers; j++) {
1304         strncpy(kmer, read + j, kmer_len);
1305 
1306         cnt = kmer_map[kmer];
1307         cnts.push_back(cnt);
1308         sorted_cnts.push_back(cnt);
1309 
1310         // cout << kmer << "\t" << j << "\t" << cnt << "\n";
1311 
1312         if (cnt >= max_k_freq) abundant_k++;
1313     }
1314 
1315     //
1316     // Calculate the median kmer frequency along the read.
1317     //
1318     sort(sorted_cnts.begin(), sorted_cnts.end());
1319     double median = num_kmers % 2 == 0 ?
1320         (double) (sorted_cnts[num_kmers / 2 - 1] + sorted_cnts[num_kmers / 2]) / 2.0 :
1321         (double) sorted_cnts[num_kmers / 2 - 1];
1322     // cout << "# median: " << median << "\n";
1323 
1324     double bound = round(median * min_k_pct);
1325     // cout << "# kmer cov bound: " << bound << "\n";
1326 
1327     //
1328     // Look for runs of rare kmers.
1329     //
1330     // We will slide a window across the read, f represents the front of the window, b
1331     // represents the back. Each  time a kmer is below the bound we will increment run_cnt,
1332     // which represents the number of kmers in the window below the bound. If 2/3 of the
1333     // kmers in the window go below the bound, assume a sequencing error has occurred.
1334     //
1335     int run_cnt = 0;
1336     int b = 0;
1337     for (int f = 0; f < num_kmers; f++) {
1338         if (f >= kmer_len) {
1339             b++;
1340             if (cnts[b] <= bound)
1341                 run_cnt--;
1342         }
1343         if (cnts[f] <= bound) {
1344             run_cnt++;
1345             if (run_cnt >= min_lim) {
1346                 rare_k++;
1347                 // cout << "# Rejecting read, position: " << f << "; run_cnt: " << run_cnt << "\n";
1348                 return 0;
1349             }
1350         }
1351         // cout << "# b: " << b << "; f: " << f << "; run_cnt: " << run_cnt << "; counts[front]: " << cnts[f] << "; bound: " << bound << "\n";
1352     }
1353     // cout << "\n\n";
1354 
1355     return 0;
1356 }
1357 
1358 // inline int
1359 // kmer_lookup(SeqKmerHash &kmer_map,
1360 //             char *read, char *kmer,
1361 //             int num_kmers,
1362 //             int &rare_k, int &abundant_k, bool &complex)
1363 // {
1364 //     //
1365 //     // Generate kmers from this read, lookup kmer frequency in dataset.
1366 //     //
1367 //     rare_k         = 0;
1368 //     abundant_k     = 0;
1369 //     complex        = false;
1370 //     kmer[kmer_len] = '\0';
1371 //     int cnt        = 0;
1372 //     int rare_k_lim = (int) round((double) kmer_len * (1.0/2.0));
1373 
1374 //     vector<int> cnts;
1375 //     cnts.reserve(num_kmers);
1376 
1377 //     // cout << "# " << read << "\n";
1378 
1379 //     for (int j = 0; j < num_kmers; j++) {
1380 //         strncpy(kmer, read + j, kmer_len);
1381 
1382 //         cnt = kmer_map[kmer];
1383 
1384 //         if (cnt >= 100000)
1385 //             cnts.push_back(100000);
1386 //         else if (cnt >= 10000)
1387 //             cnts.push_back(10000);
1388 //         else if (cnt >= 1000)
1389 //             cnts.push_back(1000);
1390 //         else if (cnt >= 100)
1391 //             cnts.push_back(100);
1392 //         else if (cnt >= 10)
1393 //             cnts.push_back(10);
1394 //         else
1395 //             cnts.push_back(1);
1396 
1397 //         // cout << kmer << "\t" << j << "\t" << cnt << "\n";
1398 
1399 //         if (cnt >= max_k_freq) abundant_k++;
1400 //     }
1401 
1402 //     // //
1403 //     // // Detect the number of kmer coverage transitions.
1404 //     // //
1405 //     // int t   = 0;
1406 //     // int cov = cnts[0];
1407 //     // cout << "\nDetermining transitions:\n" << kmer << "\t" << "0" << "\t" << cnts[0] << "\n";
1408 //     // for (int j = 1; j < num_kmers; j++)
1409 //     //         if (cnts[j] != cov) {
1410 //     //             cov = cnts[j];
1411 //     //             t++;
1412 //     //             cout << kmer << "\t" << j << "\t" << cnts[j] << ": Transition." << "\n";
1413 //     //         } else {
1414 //     //             cout << kmer << "\t" << j << "\t" << cnts[j] << "\n";
1415 //     //         }
1416 //     // cerr << t << " total cnts.\n";
1417 
1418 //     //
1419 //     // Look for runs of kmers at various orders of magnitude.
1420 //     //
1421 //     // We will slide a window across the read, f represents the front of the window, b
1422 //     // represents the back. Each  time a kmer is below the bound we will increment run_cnt,
1423 //     // which represents the number of kmers in the window below the bound. If 2/3 of the
1424 //     // kmers in the window go below the bound, assume a sequencing error has occurred.
1425 
1426 //     // Run counters:
1427 //     //   1      10      100      1k     10k    100k
1428 //     // runs[0] runs[1] runs[2] runs[3] runs[4] runs[5]
1429 //     int runs[6] = {0};
1430 //     int prev_cnt, run_cnt, tot_trans;
1431 //     int f = 0;
1432 
1433 //     while (f < num_kmers) {
1434 
1435 //         tot_trans = 0;
1436 //         run_cnt   = 1;
1437 //         prev_cnt  = cnts[f];
1438 //         f++;
1439 
1440 //         while (f < num_kmers && cnts[f] == prev_cnt) {
1441 //             // cout << "# window front: " << f << "; run_cnt: " << run_cnt << "; prev_cnt: " << prev_cnt << "\n";
1442 //             f++;
1443 //             run_cnt++;
1444 //         }
1445 
1446 //         if (run_cnt >= rare_k_lim) {
1447 //             // cout << "#   found transition run, position: " << f-1 << "; run_cnt: " << run_cnt << "\n";
1448 //             switch(prev_cnt) {
1449 //             case 1:
1450 //                 runs[0]++;
1451 //                 break;
1452 //             case 10:
1453 //                 runs[1]++;
1454 //                 break;
1455 //             case 100:
1456 //                 runs[2]++;
1457 //                 break;
1458 //             case 1000:
1459 //                 runs[3]++;
1460 //                 break;
1461 //             case 10000:
1462 //                 runs[4]++;
1463 //                 break;
1464 //             case 100000:
1465 //                 runs[5]++;
1466 //                 break;
1467 //             }
1468 //         }
1469 
1470 //         for (int j = 0; j < 6; j++)
1471 //             if (runs[j] > 0) tot_trans++;
1472 
1473 //         // cout << "# Total transitions: " << tot_trans << "\n";
1474 
1475 //         if (tot_trans >= transition_lim) {
1476 //             // cout << "# Rejecting read.\n";
1477 //             rare_k++;
1478 //             return 0;
1479 //         }
1480 //     }
1481 
1482 //     // cout << "\n\n";
1483 
1484 //     return 0;
1485 // }
1486 
1487 int
free_kmer_hash(SeqKmerHash & kmer_map,vector<char * > & kmer_map_keys)1488 free_kmer_hash(SeqKmerHash &kmer_map, vector<char *> &kmer_map_keys)
1489 {
1490     for (uint i = 0; i < kmer_map_keys.size(); i++) {
1491         delete [] kmer_map_keys[i];
1492     }
1493     kmer_map_keys.clear();
1494 
1495     kmer_map.clear();
1496 
1497     return 0;
1498 }
1499 
print_results(map<string,map<string,long>> & counters)1500 int print_results(map<string, map<string, long> > &counters) {
1501     map<string, map<string, long> >::iterator it;
1502 
1503     string log_path = out_path + "kmer_filter.log";
1504     ofstream log(log_path.c_str());
1505 
1506     if (log.fail()) {
1507         cerr << "Unable to open log file '" << log_path << "'\n";
1508         return 0;
1509     }
1510 
1511     cerr << "Outputing details to log: '" << log_path << "'\n\n";
1512 
1513     log << "File\t"
1514         << "Retained Reads\t"
1515         << "Rare K\t"
1516         << "Abundant K\t"
1517         << "Total\n";
1518 
1519     for (it = counters.begin(); it != counters.end(); it++) {
1520         log << it->first                 << "\t"
1521             << it->second["retained"]    << "\t"
1522             << it->second["rare_k"]      << "\t"
1523             << it->second["abundant_k"]  << "\t"
1524             << it->second["total"]       << "\n";
1525     }
1526 
1527     map<string, long> c;
1528     c["total"]       = 0;
1529 
1530     //
1531     // Total up the individual counters
1532     //
1533     for (it = counters.begin(); it != counters.end(); it++) {
1534         c["total"]       += it->second["total"];
1535         c["retained"]    += it->second["retained"];
1536         c["rare_k"]      += it->second["rare_k"];
1537         c["abundant_k"]  += it->second["abundant_k"];
1538     }
1539 
1540     cerr <<
1541         c["total"] << " total sequences;\n"
1542          << "  " << c["rare_k"]      << " rare k-mer reads;\n"
1543          << "  " << c["abundant_k"]  << " abundant k-mer reads;\n"
1544          << c["retained"] << " retained reads.\n";
1545 
1546     log        << "Total Sequences\t"      << c["total"]       << "\n"
1547         << "Retained Reads\t"       << c["retained"]      << "\n";
1548 
1549     log.close();
1550 
1551     return 0;
1552 }
1553 
build_file_list(vector<string> & in_files,vector<pair<string,string>> & files)1554 int build_file_list(vector<string> &in_files, vector<pair<string, string> > &files) {
1555     string file, suffix;
1556     int    pos;
1557 
1558     //
1559     // Scan a directory for a list of files.
1560     //
1561     if (in_path.length() > 0) {
1562         struct dirent *direntry;
1563 
1564         DIR *dir = opendir(in_path.c_str());
1565 
1566         if (dir == NULL) {
1567             cerr << "Unable to open directory '" << in_path << "' for reading.\n";
1568             exit(1);
1569         }
1570 
1571         while ((direntry = readdir(dir)) != NULL) {
1572             file = direntry->d_name;
1573 
1574             if (file.substr(0, 1) == ".")
1575                 continue;
1576 
1577             //
1578             // If the file is gzip'ed, remove the '.gz' suffix.
1579             //
1580             pos  = file.find_last_of(".");
1581             if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
1582                 file.substr(pos) == ".gz") {
1583                 file = file.substr(0, pos);
1584                 pos  = file.find_last_of(".");
1585             }
1586 
1587             //
1588             // Check that the remaining file name has the right suffix.
1589             //
1590             suffix = file.substr(pos + 1);
1591             if (in_file_type == FileT::fastq && (suffix.substr(0, 2) == "fq" || suffix.substr(0, 5) == "fastq"))
1592                 files.push_back(make_pair(in_path, file));
1593             else if (in_file_type == FileT::fasta && (suffix.substr(0, 2) == "fa" || suffix.substr(0, 5) == "fasta"))
1594                 files.push_back(make_pair(in_path, file));
1595         }
1596 
1597         if (files.size() == 0)
1598             cerr << "Unable to locate any input files to process within '" << in_path << "'\n";
1599 
1600     } else {
1601         string path;
1602 
1603         for (uint i = 0; i < in_files.size(); i++) {
1604             //
1605             // Files specified directly:
1606             //    Break off file path and store path and file name.
1607             //    Check if this is a gzip'ed file and if so, remove 'gz' suffix.
1608             //
1609             file = in_files[i];
1610             pos  = file.find_last_of(".");
1611             if ((in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) &&
1612                 file.substr(pos) == ".gz") {
1613                 file = file.substr(0, pos);
1614                 pos  = file.find_last_of(".");
1615             }
1616             pos  = file.find_last_of("/");
1617             path = file.substr(0, pos + 1);
1618             files.push_back(make_pair(path, file.substr(pos+1)));
1619         }
1620     }
1621 
1622     return 0;
1623 }
1624 
parse_command_line(int argc,char * argv[])1625 int parse_command_line(int argc, char* argv[]) {
1626     string pair_1, pair_2;
1627     int c;
1628 
1629     while (1) {
1630         static struct option long_options[] = {
1631             {"help",         no_argument,       NULL, 'h'},
1632             {"version",      no_argument,       NULL, 'v'},
1633             {"discards",     no_argument,       NULL, 'D'},
1634             {"pair-1",       required_argument, NULL, '1'}, {"pair_1",       required_argument, NULL, '1'},
1635             {"pair-2",       required_argument, NULL, '2'}, {"pair_2",       required_argument, NULL, '2'},
1636             {"infile-type",  required_argument, NULL, 'i'}, {"infile_type",  required_argument, NULL, 'i'},
1637             {"outfile-type", required_argument, NULL, 'y'}, {"outfile_type", required_argument, NULL, 'y'},
1638             {"file",         required_argument, NULL, 'f'},
1639             {"path",         required_argument, NULL, 'p'},
1640             {"outpath",      required_argument, NULL, 'o'},
1641             {"k-dist",       no_argument,       NULL, 'I'}, {"k_dist",       no_argument,       NULL, 'I'},
1642             {"rare",         no_argument,       NULL, 'R'},
1643             {"abundant",     no_argument,       NULL, 'A'},
1644             {"normalize",    required_argument, NULL, 'N'},
1645             {"k-len",        required_argument, NULL, 'K'}, {"k_len",        required_argument, NULL, 'K'},
1646             {"max-k-freq",   required_argument, NULL, 'M'}, {"max_k_freq",   required_argument, NULL, 'M'},
1647             {"min-lim",      required_argument, NULL, 'F'}, {"min_lim",      required_argument, NULL, 'F'},
1648             {"max-lim",      required_argument, NULL, 'G'}, {"max_lim",      required_argument, NULL, 'G'},
1649             {"min-k-pct",    required_argument, NULL, 'P'}, {"min_k_pct",    required_argument, NULL, 'P'},
1650             {"read-k-freq",  required_argument, NULL, 'r'}, {"read_k_freq",  required_argument, NULL, 'r'},
1651             {"write-k-freq", required_argument, NULL, 'w'}, {"write_k_freq", required_argument, NULL, 'w'},
1652             {0, 0, 0, 0}
1653         };
1654 
1655         // getopt_long stores the option index here.
1656         int option_index = 0;
1657 
1658         c = getopt_long(argc, argv, "hvRADkP:N:I:w:r:K:F:G:M:m:i:y:f:o:t:p:1:2:", long_options, &option_index);
1659 
1660         // Detect the end of the options.
1661         if (c == -1)
1662             break;
1663 
1664         switch (c) {
1665         case 'h':
1666             help();
1667             break;
1668         case 'i':
1669              if (strcasecmp(optarg, "fasta") == 0)
1670                 in_file_type = FileT::fasta;
1671              else if (strcasecmp(optarg, "gzfasta") == 0)
1672                  in_file_type = FileT::gzfasta;
1673              else if (strcasecmp(optarg, "gzfastq") == 0)
1674                  in_file_type = FileT::gzfastq;
1675              else
1676                  in_file_type = FileT::fastq;
1677             break;
1678         case 'y':
1679             if (strcasecmp(optarg, "fasta") == 0)
1680                 out_file_type = FileT::fasta;
1681             else
1682                 out_file_type = FileT::fastq;
1683             break;
1684         case 'f':
1685             in_files.push_back(optarg);
1686             break;
1687         case '1':
1688             pair_1 = optarg;
1689             break;
1690         case '2':
1691             pair_2 = optarg;
1692             if (pair_1.length() == 0) help();
1693             in_pair_files.push_back(pair_1);
1694             in_pair_files.push_back(pair_2);
1695             pair_1 = "";
1696             pair_2 = "";
1697             break;
1698         case 'p':
1699             in_path = optarg;
1700             break;
1701         case 'o':
1702             out_path = optarg;
1703             break;
1704         case 'D':
1705             discards = true;
1706             break;
1707         case 'I':
1708             kmer_distr = true;
1709             break;
1710         case 'R':
1711             filter_mare_k = true;
1712             break;
1713         case 'A':
1714             filter_abundant_k = true;
1715             break;
1716         case 'N':
1717             normalize = true;
1718             normalize_lim = is_integer(optarg);
1719             break;
1720         case 'K':
1721             kmer_len = is_integer(optarg);
1722             break;
1723         case 'M':
1724             max_k_freq = is_integer(optarg);
1725             break;
1726         case 'F':
1727             min_lim = is_integer(optarg);
1728             break;
1729         case 'G':
1730             max_lim = is_integer(optarg);
1731             break;
1732         case 'P':
1733             min_k_pct = is_double(optarg);
1734             break;
1735         case 'r':
1736             read_k_freq = true;
1737             k_freq_path = optarg;
1738             break;
1739         case 'w':
1740             write_k_freq = true;
1741             k_freq_path = optarg;
1742             break;
1743         case 'v':
1744             version();
1745             break;
1746         case '?':
1747             // getopt_long already printed an error message.
1748             help();
1749             break;
1750 
1751         default:
1752             cerr << "Unknown command line option '" << (char) c << "'\n";
1753             help();
1754             exit(1);
1755         }
1756     }
1757 
1758     if (in_files.size() == 0 && in_pair_files.size() == 0 && in_path.length() == 0) {
1759         cerr << "You must specify an input file of a directory path to a set of input files.\n";
1760         help();
1761     }
1762 
1763     if (in_files.size() > 0 && in_path.length() > 0) {
1764         cerr << "You must specify either a single input file (-f) or a directory path (-p), not both.\n";
1765         help();
1766     }
1767 
1768     if (in_path.length() > 0 && in_path.at(in_path.length() - 1) != '/')
1769         in_path += "/";
1770 
1771     if (out_path.length() == 0)
1772         out_path = ".";
1773 
1774     if (out_path.at(out_path.length() - 1) != '/')
1775         out_path += "/";
1776 
1777     if (in_file_type == FileT::unknown)
1778         in_file_type = FileT::fastq;
1779 
1780     if (read_k_freq && write_k_freq) {
1781         cerr << "You may either read a set of kmer frequencies, or write kmer frequencies, not both.\n";
1782         help();
1783     }
1784 
1785     if (min_k_pct < 0.0 || min_k_pct > 1.0) {
1786         cerr << "Percentage to consider a kmer rare must be between 0 and 1.0.\n";
1787         help();
1788     }
1789 
1790     //
1791     // Check that the output path exists.
1792     //
1793     struct stat info;
1794     if (stat(out_path.c_str(), &info) != 0) {
1795         cerr << "Unable to locate the specified output path, '" << out_path << "'\n";
1796         exit(1);
1797     }
1798 
1799     return 0;
1800 }
1801 
version()1802 void version() {
1803     cerr << "kmer_filter " << VERSION << "\n\n";
1804 
1805     exit(1);
1806 }
1807 
help()1808 void help() {
1809     cerr << "kmer_filter " << VERSION << "\n"
1810               << "kmer_filter [-f in_file_1 [-f in_file_2...] | -p in_dir] [-1 pair_1 -2 pair_2 [-1 pair_1...]] -o out_dir [-i type] [-y type] [-D] [-h]\n"
1811               << "  f: path to the input file if processing single-end seqeunces.\n"
1812               << "  i: input file type, either 'bustard' for the Illumina BUSTARD output files, 'fasta', 'fastq', 'gzfasta', or 'gzfastq' (default 'fastq').\n"
1813               << "  p: path to a directory of files (for single-end files only).\n"
1814               << "  1: specify the first in a pair of files to be processed together.\n"
1815               << "  2: specify the second in a pair of files to be processed together.\n"
1816               << "  o: path to output the processed files.\n"
1817               << "  y: output type, either 'fastq' or 'fasta' (default fastq).\n"
1818               << "  D: capture discarded reads to a file.\n"
1819               << "  h: display this help messsage.\n\n"
1820               << "  Filtering options:\n"
1821               << "    --rare: turn on filtering based on rare k-mers.\n"
1822               << "    --abundant: turn on filtering based on abundant k-mers.\n"
1823               << "    --k-len <len>: specify k-mer size (default 15).\n\n"
1824               << "  Advanced filtering options:\n"
1825               << "    --max-k-freq <value>: specify the number of times a kmer must occur to be considered abundant (default 20,000).\n"
1826               << "    --min-lim <value>: specify number of rare kmers occuring in a row required to discard a read (default 80% of the k-mer length).\n"
1827               << "    --max-lim <value>: specify number of abundant kmers required to discard a read (default 80% of the k-mers in a read).\n\n"
1828               << "  Normalize data:\n"
1829               << "    --normalize <depth>: normalize read depth according to k-mer coverage.\n\n"
1830               << "  Characterizing K-mers:\n"
1831               << "    --write-k-freq: write kmers along with their frequency of occurrence and exit.\n"
1832               << "    --k-dist: print k-mer frequency distribution and exit.\n\n"
1833               << "  Advanced input options:\n"
1834               << "    --read-k-freq <path>: read a set of kmers along with their frequencies of occurrence instead of reading raw input files.\n"
1835               << "\n";
1836 
1837     exit(1);
1838 }
1839