1 /* Quorum
2  * Copyright (C) 2012  Genome group at University of Maryland.
3  *
4  * This program is free software: you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation, either version 3 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful, but
10  * WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <iostream>
19 #include <string>
20 #include <vector>
21 #include <sstream>
22 
23 #include <jellyfish/mer_dna.hpp>
24 #include <jellyfish/file_header.hpp>
25 #include <jellyfish/err.hpp>
26 #include <jellyfish/thread_exec.hpp>
27 #include <jellyfish/stream_manager.hpp>
28 #include <jellyfish/whole_sequence_parser.hpp>
29 #include <jellyfish/large_hash_array.hpp>
30 
31 #include <src/mer_database.hpp>
32 #include <src/create_database_cmdline.hpp>
33 
34 namespace err = jellyfish::err;
35 
36 static create_database_cmdline args;
37 typedef create_database_cmdline::error error;
38 
39 using jellyfish::mer_dna;
40 typedef std::vector<const char*> file_vector;
41 typedef jellyfish::stream_manager<file_vector::const_iterator> stream_manager;
42 typedef jellyfish::whole_sequence_parser<stream_manager> read_parser;
43 
44 class quality_mer_counter : public jellyfish::thread_exec {
45   hash_with_quality& ary_;
46   read_parser        parser_;
47   const char         qual_thresh_;
48 
49 public:
quality_mer_counter(int nb_threads,hash_with_quality & ary,stream_manager & streams,char qual_thresh)50   quality_mer_counter(int nb_threads, hash_with_quality& ary, stream_manager& streams, char qual_thresh) :
51     ary_(ary),
52     parser_(4 * nb_threads, 100, 1, streams),
53     qual_thresh_(qual_thresh)
54   { }
55 
start(int thid)56   virtual void start(int thid) {
57     mer_dna m, rm;
58     size_t counted_high = 0, counted_low = 0;
59 
60     while(true) {
61       read_parser::job job(parser_);
62       if(job.is_empty()) break;
63 
64       for(size_t i = 0; i < job->nb_filled; ++i) { // Process each read
65         std::string& seq = job->data[i].seq;
66         std::string& quals = job->data[i].qual;
67 
68         auto         base     = seq.begin();
69         auto         qual     = quals.begin();
70         unsigned int low_len  = 0; // Length of low quality stretch
71         unsigned int high_len = 0; // Length of high quality stretch
72         for( ; base != seq.end(); ++base, ++qual) {
73           int code = mer_dna::code(*base);
74           if(mer_dna::not_dna(code)) {
75             high_len = low_len = 0;
76             continue;
77           }
78           m.shift_left(code);
79           rm.shift_right(mer_dna::complement(code));
80           ++low_len;
81           if(*qual >= qual_thresh_)
82             ++high_len;
83           else
84             high_len = 0;
85           if(low_len >= mer_dna::k()) {
86             if(!ary_.add(m < rm ? m : rm, high_len >= mer_dna::k()))
87               throw std::runtime_error(err::msg() << "Hash is full");
88             counted_high += high_len >= mer_dna::k();
89             ++counted_low;
90           }
91         }
92       }
93     }
94     ary_.done();
95   }
96 };
97 
main(int argc,char * argv[])98 int main(int argc, char *argv[])
99 {
100   database_header header;
101   header.fill_standard();
102   header.set_cmdline(argc, argv);
103 
104   args.parse(argc, argv);
105   mer_dna::k(args.mer_arg);
106   if(!args.min_qual_value_given && !args.min_qual_char_given)
107     error("Either a min-qual-value or min-qual-char must be provided.");
108   if(args.min_qual_char_given && args.min_qual_char_arg.size() != 1)
109     error("The min-qual-char should be one ASCII character.");
110   char qual_thresh = args.min_qual_char_given ? args.min_qual_char_arg[0] : (char)args.min_qual_value_arg;
111   if(args.bits_arg < 1 || args.bits_arg > 63)
112     error("The number of bits should be between 1 and 63");
113   std::ofstream output(args.output_arg);
114   if(!output.good())
115     error() << "Failed to open output file '" << args.output_arg << "'.";
116 
117   hash_with_quality ary(args.size_arg, 2 * mer_dna::k(), args.bits_arg,
118                         args.threads_arg, args.reprobe_arg);
119   {
120     stream_manager streams(args.reads_arg.cbegin(), args.reads_arg.cend(), 1);
121     quality_mer_counter counter(args.threads_arg, ary, streams, qual_thresh);
122     counter.exec_join(args.threads_arg);
123   }
124 
125   ary.write(output, &header);
126   output.close();
127 
128   return 0;
129 }
130