1 // Copyright (c) 2020 Robert Vaser 2 3 #ifndef BIOPARSER_SAM_PARSER_HPP_ 4 #define BIOPARSER_SAM_PARSER_HPP_ 5 6 #include <cstdint> 7 #include <cstdlib> 8 #include <memory> 9 #include <vector> 10 #include <stdexcept> 11 12 #include "bioparser/parser.hpp" 13 14 namespace bioparser { 15 16 template<class T> 17 class SamParser: public Parser<T> { 18 public: 19 SamParser(const SamParser&) = delete; 20 SamParser& operator=(const SamParser&) = delete; 21 22 SamParser(SamParser&&) = delete; 23 SamParser& operator=(SamParser&&) = delete; 24 ~SamParser()25 ~SamParser() {} 26 Parse(std::uint64_t bytes,bool shorten_names=true)27 std::vector<std::unique_ptr<T>> Parse( 28 std::uint64_t bytes, bool shorten_names = true) override { 29 std::vector<std::unique_ptr<T>> dst; 30 std::uint64_t parsed_bytes = 0; 31 32 const char* q_name = nullptr; 33 std::uint32_t q_name_len = 0; 34 std::uint32_t flag = 0; 35 const char* t_name = nullptr; 36 std::uint32_t t_name_len = 0; 37 std::uint32_t t_begin = 0; 38 std::uint32_t map_quality = 0; 39 const char* cigar = nullptr; 40 std::uint32_t cigar_len = 0; 41 const char* t_next_name = nullptr; 42 std::uint32_t t_next_name_len = 0; 43 std::uint32_t t_next_begin = 0; 44 std::uint32_t template_len = 0; 45 const char* data = nullptr; 46 std::uint32_t data_len = 0; 47 const char* quality = nullptr; 48 std::uint32_t quality_len = 0; 49 50 auto create_T = [&] () -> void { 51 if (this->storage()[0] == '@') { // file header 52 this->Clear(); 53 return; 54 } 55 auto storage_ptr = this->RightStrip( 56 this->storage().data(), 57 this->storage_ptr()); 58 this->Terminate(storage_ptr); 59 60 std::uint32_t num_values = 0; 61 std::uint32_t begin_ptr = 0; 62 while (true) { 63 auto end_ptr = begin_ptr; 64 while (end_ptr < storage_ptr && this->storage()[end_ptr] != '\t') { 65 ++end_ptr; 66 } 67 this->Terminate(end_ptr); 68 69 switch (num_values) { 70 case 0: 71 q_name = this->storage().data() + begin_ptr; 72 q_name_len = end_ptr - begin_ptr; 73 break; 74 case 1: flag = std::atoi(this->storage().data() + begin_ptr); break; 75 case 2: 76 t_name = this->storage().data() + begin_ptr; 77 t_name_len = end_ptr - begin_ptr; 78 break; 79 case 3: t_begin = std::atoi(this->storage().data() + begin_ptr); break; // NOLINT 80 case 4: map_quality = std::atoi(this->storage().data() + begin_ptr); break; // NOLINT 81 case 5: 82 cigar = this->storage().data() + begin_ptr; 83 cigar_len = end_ptr - begin_ptr; 84 break; 85 case 6: 86 t_next_name = this->storage().data() + begin_ptr; 87 t_next_name_len = end_ptr - begin_ptr; 88 break; 89 case 7: t_next_begin = std::atoi(this->storage().data() + begin_ptr); break; // NOLINT 90 case 8: template_len = std::atoi(this->storage().data() + begin_ptr); break; // NOLINT 91 case 9: 92 data = this->storage().data() + begin_ptr; 93 data_len = end_ptr - begin_ptr; 94 break; 95 case 10: 96 quality = this->storage().data() + begin_ptr; 97 quality_len = end_ptr - begin_ptr; 98 break; 99 default: break; 100 } 101 102 ++num_values; 103 if (end_ptr == storage_ptr || num_values == 11) { 104 break; 105 } 106 begin_ptr = end_ptr + 1; 107 } 108 109 if (num_values != 11) { 110 throw std::invalid_argument( 111 "[bioparser::SamParser] error: invalid file format"); 112 } 113 114 q_name_len = shorten_names ? 115 this->Shorten(q_name, q_name_len) : 116 this->RightStrip(q_name, q_name_len); 117 118 t_name_len = shorten_names ? 119 this->Shorten(t_name, t_name_len) : 120 this->RightStrip(t_name, t_name_len); 121 122 cigar_len = this->RightStrip(cigar, cigar_len); 123 124 t_next_name_len = shorten_names ? 125 this->Shorten(t_next_name, t_next_name_len) : 126 this->RightStrip(t_next_name, t_next_name_len); 127 128 data_len = this->RightStrip(data, data_len); 129 quality_len = this->RightStrip(quality, quality_len); 130 131 if (q_name_len == 0 || t_name_len == 0 || cigar_len == 0 || 132 t_next_name_len == 0 || data_len == 0 || quality_len == 0 || 133 (data_len > 1 && quality_len > 1 && data_len != quality_len)) { 134 throw std::invalid_argument( 135 "[bioparser::SamParser] error: invalid file format"); 136 } 137 138 dst.emplace_back(std::unique_ptr<T>(new T( 139 q_name, q_name_len, 140 flag, 141 t_name, t_name_len, t_begin, 142 map_quality, 143 cigar, cigar_len, 144 t_next_name, t_next_name_len, t_next_begin, 145 template_len, 146 data, data_len, 147 quality, quality_len))); 148 149 parsed_bytes += this->storage_ptr(); 150 this->Clear(); 151 }; 152 153 bool is_eof = false; 154 155 while (true) { 156 auto buffer_ptr = this->buffer_ptr(); 157 for (; buffer_ptr < this->buffer_bytes(); ++buffer_ptr) { 158 auto c = this->buffer()[buffer_ptr]; 159 if (c == '\n') { 160 this->Store(buffer_ptr - this->buffer_ptr()); 161 create_T(); 162 if (parsed_bytes >= bytes) { 163 return dst; 164 } 165 } 166 } 167 if (this->buffer_ptr() < buffer_ptr) { 168 this->Store(buffer_ptr - this->buffer_ptr()); 169 } 170 171 if (is_eof) { 172 break; 173 } 174 is_eof = this->Read(); 175 } 176 177 if (this->storage_ptr() != 0) { 178 create_T(); 179 } 180 181 return dst; 182 } 183 184 private: SamParser(gzFile file)185 explicit SamParser(gzFile file) 186 : Parser<T>(file, 65536) {} // 64 kB 187 188 friend Parser<T>; 189 }; 190 191 } // namespace bioparser 192 193 #endif // BIOPARSER_SAM_PARSER_HPP_ 194