1 // Copyright (c) 2020 Robert Vaser
2 
3 #ifndef BIOPARSER_SAM_PARSER_HPP_
4 #define BIOPARSER_SAM_PARSER_HPP_
5 
6 #include <cstdint>
7 #include <cstdlib>
8 #include <memory>
9 #include <vector>
10 #include <stdexcept>
11 
12 #include "bioparser/parser.hpp"
13 
14 namespace bioparser {
15 
16 template<class T>
17 class SamParser: public Parser<T> {
18  public:
19   SamParser(const SamParser&) = delete;
20   SamParser& operator=(const SamParser&) = delete;
21 
22   SamParser(SamParser&&) = delete;
23   SamParser& operator=(SamParser&&) = delete;
24 
~SamParser()25   ~SamParser() {}
26 
Parse(std::uint64_t bytes,bool shorten_names=true)27   std::vector<std::unique_ptr<T>> Parse(
28       std::uint64_t bytes, bool shorten_names = true) override {
29     std::vector<std::unique_ptr<T>> dst;
30     std::uint64_t parsed_bytes = 0;
31 
32     const char* q_name = nullptr;
33     std::uint32_t q_name_len = 0;
34     std::uint32_t flag = 0;
35     const char* t_name = nullptr;
36     std::uint32_t t_name_len = 0;
37     std::uint32_t t_begin = 0;
38     std::uint32_t map_quality = 0;
39     const char* cigar = nullptr;
40     std::uint32_t cigar_len = 0;
41     const char* t_next_name = nullptr;
42     std::uint32_t t_next_name_len = 0;
43     std::uint32_t t_next_begin = 0;
44     std::uint32_t template_len = 0;
45     const char* data = nullptr;
46     std::uint32_t data_len = 0;
47     const char* quality = nullptr;
48     std::uint32_t quality_len = 0;
49 
50     auto create_T = [&] () -> void {
51       if (this->storage()[0] == '@') {  // file header
52         this->Clear();
53         return;
54       }
55       auto storage_ptr = this->RightStrip(
56           this->storage().data(),
57           this->storage_ptr());
58       this->Terminate(storage_ptr);
59 
60       std::uint32_t num_values = 0;
61       std::uint32_t begin_ptr = 0;
62       while (true) {
63         auto end_ptr = begin_ptr;
64         while (end_ptr < storage_ptr && this->storage()[end_ptr] != '\t') {
65           ++end_ptr;
66         }
67         this->Terminate(end_ptr);
68 
69         switch (num_values) {
70           case 0:
71             q_name = this->storage().data() + begin_ptr;
72             q_name_len = end_ptr - begin_ptr;
73             break;
74           case 1: flag = std::atoi(this->storage().data() + begin_ptr); break;
75           case 2:
76             t_name = this->storage().data() + begin_ptr;
77             t_name_len = end_ptr - begin_ptr;
78             break;
79           case 3: t_begin = std::atoi(this->storage().data() + begin_ptr); break;  // NOLINT
80           case 4: map_quality = std::atoi(this->storage().data() + begin_ptr); break;  // NOLINT
81           case 5:
82             cigar = this->storage().data() + begin_ptr;
83             cigar_len = end_ptr - begin_ptr;
84             break;
85           case 6:
86             t_next_name = this->storage().data() + begin_ptr;
87             t_next_name_len = end_ptr - begin_ptr;
88             break;
89           case 7: t_next_begin = std::atoi(this->storage().data() + begin_ptr); break;  // NOLINT
90           case 8: template_len = std::atoi(this->storage().data() + begin_ptr); break;  // NOLINT
91           case 9:
92             data = this->storage().data() + begin_ptr;
93             data_len = end_ptr - begin_ptr;
94             break;
95           case 10:
96             quality = this->storage().data() + begin_ptr;
97             quality_len = end_ptr - begin_ptr;
98             break;
99           default: break;
100         }
101 
102         ++num_values;
103         if (end_ptr == storage_ptr || num_values == 11) {
104           break;
105         }
106         begin_ptr = end_ptr + 1;
107       }
108 
109       if (num_values != 11) {
110         throw std::invalid_argument(
111             "[bioparser::SamParser] error: invalid file format");
112       }
113 
114       q_name_len = shorten_names ?
115           this->Shorten(q_name, q_name_len) :
116           this->RightStrip(q_name, q_name_len);
117 
118       t_name_len = shorten_names ?
119           this->Shorten(t_name, t_name_len) :
120           this->RightStrip(t_name, t_name_len);
121 
122       cigar_len = this->RightStrip(cigar, cigar_len);
123 
124       t_next_name_len = shorten_names ?
125           this->Shorten(t_next_name, t_next_name_len) :
126           this->RightStrip(t_next_name, t_next_name_len);
127 
128       data_len = this->RightStrip(data, data_len);
129       quality_len = this->RightStrip(quality, quality_len);
130 
131       if (q_name_len == 0 || t_name_len == 0 || cigar_len == 0 ||
132           t_next_name_len == 0 || data_len == 0 || quality_len == 0 ||
133           (data_len > 1 && quality_len > 1 && data_len != quality_len)) {
134         throw std::invalid_argument(
135             "[bioparser::SamParser] error: invalid file format");
136       }
137 
138       dst.emplace_back(std::unique_ptr<T>(new T(
139           q_name, q_name_len,
140           flag,
141           t_name, t_name_len, t_begin,
142           map_quality,
143           cigar, cigar_len,
144           t_next_name, t_next_name_len, t_next_begin,
145           template_len,
146           data, data_len,
147           quality, quality_len)));
148 
149       parsed_bytes += this->storage_ptr();
150       this->Clear();
151     };
152 
153     bool is_eof = false;
154 
155     while (true) {
156       auto buffer_ptr = this->buffer_ptr();
157       for (; buffer_ptr < this->buffer_bytes(); ++buffer_ptr) {
158         auto c = this->buffer()[buffer_ptr];
159         if (c == '\n') {
160           this->Store(buffer_ptr - this->buffer_ptr());
161           create_T();
162           if (parsed_bytes >= bytes) {
163             return dst;
164           }
165         }
166       }
167       if (this->buffer_ptr() < buffer_ptr) {
168         this->Store(buffer_ptr - this->buffer_ptr());
169       }
170 
171       if (is_eof) {
172         break;
173       }
174       is_eof = this->Read();
175     }
176 
177     if (this->storage_ptr() != 0) {
178       create_T();
179     }
180 
181     return dst;
182   }
183 
184  private:
SamParser(gzFile file)185   explicit SamParser(gzFile file)
186       : Parser<T>(file, 65536) {}  // 64 kB
187 
188   friend Parser<T>;
189 };
190 
191 }  // namespace bioparser
192 
193 #endif  // BIOPARSER_SAM_PARSER_HPP_
194