1 #ifndef UTIL_SEQUTIL___SEQUTIL_CONVERT_IMP__HPP 2 #define UTIL_SEQUTIL___SEQUTIL_CONVERT_IMP__HPP 3 4 /* $Id: sequtil_convert_imp.hpp 495056 2016-03-14 16:14:57Z ucko $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Mati Shomrat 30 * 31 * File Description: 32 * 33 */ 34 #include <corelib/ncbistd.hpp> 35 36 #include <util/sequtil/sequtil_convert.hpp> 37 #include "sequtil_shared.hpp" 38 39 BEGIN_NCBI_SCOPE 40 41 42 class CSeqConvert_imp 43 { 44 public: 45 typedef CSeqUtil::TCoding TCoding; 46 typedef CSeqConvert::IPackTarget IPackTarget; 47 48 // Conversion 49 50 template <typename SrcCont, typename DstCont> Convert(const SrcCont & src,TCoding src_coding,TSeqPos pos,TSeqPos length,DstCont & dst,TCoding dst_coding)51 static SIZE_TYPE Convert 52 (const SrcCont& src, 53 TCoding src_coding, 54 TSeqPos pos, 55 TSeqPos length, 56 DstCont& dst, 57 TCoding dst_coding) 58 { 59 _ASSERT(!OutOfRange(pos, src, src_coding)); 60 61 if ( src.empty() || (length == 0) ) { 62 return 0; 63 } 64 65 AdjustLength(src, src_coding, pos, length); 66 ResizeDst(dst, dst_coding, length); 67 68 return Convert(&*src.begin(), src_coding, pos, length, 69 &*dst.begin(), dst_coding); 70 } 71 72 73 static SIZE_TYPE Convert(const char* src, TCoding src_coding, 74 TSeqPos pos, TSeqPos length, 75 char* dst, TCoding dst_coding); 76 77 // Subseq 78 79 template <typename SrcCont, typename DstCont> Subseq(const SrcCont & src,TCoding coding,TSeqPos pos,TSeqPos length,DstCont & dst)80 static SIZE_TYPE Subseq 81 (const SrcCont& src, 82 TCoding coding, 83 TSeqPos pos, 84 TSeqPos length, 85 DstCont& dst) 86 { 87 _ASSERT(!OutOfRange(pos, src, coding)); 88 89 if ( src.empty() || (length == 0) ) { 90 return 0; 91 } 92 93 AdjustLength(src, coding, pos, length); 94 ResizeDst(dst, coding, length); 95 96 return Subseq(&*src.begin(), coding, pos, length, &*dst.begin()); 97 } 98 99 static SIZE_TYPE Subseq(const char* src, TCoding coding, 100 TSeqPos pos, TSeqPos length, 101 char* dst); 102 103 // Pack 104 105 template <typename SrcCont, typename DstCont> Pack(const SrcCont & src,TCoding src_coding,DstCont & dst,TCoding & dst_coding,TSeqPos length)106 static SIZE_TYPE Pack 107 (const SrcCont& src, 108 TCoding src_coding, 109 DstCont& dst, 110 TCoding& dst_coding, 111 TSeqPos length) 112 { 113 if ( src.empty() || (length == 0) ) { 114 return 0; 115 } 116 117 AdjustLength(src, src_coding, 0, length); 118 // we allocate enough memory for ncbi4na coding 119 // if the result will be ncbi2na coding we'll resize (see below) 120 ResizeDst(dst, CSeqUtil::e_Ncbi4na, length); 121 122 SIZE_TYPE res = Pack(&*src.begin(), length, src_coding, 123 &*dst.begin(), dst_coding); 124 if ( dst_coding == CSeqUtil::e_Ncbi2na ) { 125 size_t new_size = res / 4; 126 if ( (res % 4) != 0 ) { 127 ++new_size; 128 } 129 dst.resize(new_size); 130 } 131 return res; 132 } 133 134 static SIZE_TYPE Pack(const char* src, TSeqPos length, TCoding src_coding, 135 char* dst, TCoding& dst_coding); 136 137 template <typename SrcCont> Pack(const SrcCont & src,TCoding src_coding,IPackTarget & dst,TSeqPos length)138 static SIZE_TYPE Pack 139 (const SrcCont& src, 140 TCoding src_coding, 141 IPackTarget& dst, 142 TSeqPos length) 143 { 144 if ( src.empty() || (length == 0) ) { 145 return 0; 146 } 147 148 AdjustLength(src, src_coding, 0, length); 149 return Pack(&*src.begin(), length, src_coding, dst); 150 } 151 152 static SIZE_TYPE Pack(const char* src, TSeqPos length, TCoding src_coding, 153 IPackTarget& dst); 154 155 private: 156 157 // Conversion methods: 158 159 // --- NA conversions: 160 161 // iupacna -> ... 162 static SIZE_TYPE x_ConvertIupacnaToIupacna(const char* src, TSeqPos pos, 163 TSeqPos length, char* dst); 164 static SIZE_TYPE x_ConvertIupacnaTo2na(const char* src, TSeqPos pos, 165 TSeqPos length, char* dst); 166 static SIZE_TYPE x_ConvertIupacnaTo2naExpand(const char* src, TSeqPos pos, 167 TSeqPos length, char* dst); 168 static SIZE_TYPE x_ConvertIupacnaTo4na(const char* src, TSeqPos pos, 169 TSeqPos length, char* dst); 170 static SIZE_TYPE x_ConvertIupacnaTo8na(const char* src, TSeqPos pos, 171 TSeqPos length, char* dst); 172 173 // ncbi2na -> ... 174 static SIZE_TYPE x_Convert2naToIupacna(const char* src, TSeqPos pos, 175 TSeqPos length, char* dst); 176 static SIZE_TYPE x_Convert2naTo2naExpand(const char* src, TSeqPos pos, 177 TSeqPos length, char* dst); 178 static SIZE_TYPE x_Convert2naTo4na(const char* src, TSeqPos pos, 179 TSeqPos length, char* dst); 180 static SIZE_TYPE x_Convert2naTo8na(const char* src, TSeqPos pos, 181 TSeqPos length, char* dst); 182 183 // ncbi2na_expand -> ... 184 static SIZE_TYPE x_Convert2naExpandToIupacna(const char* src, TSeqPos pos, 185 TSeqPos length, char* dst); 186 static SIZE_TYPE x_Convert2naExpandTo2na(const char* src, TSeqPos pos, 187 TSeqPos length, char* dst); 188 static SIZE_TYPE x_Convert2naExpandTo4na(const char* src, TSeqPos pos, 189 TSeqPos length, char* dst); 190 static SIZE_TYPE x_Convert2naExpandTo8na(const char* src, TSeqPos pos, 191 TSeqPos length, char* dst); 192 193 // ncbi4na -> ... 194 static SIZE_TYPE x_Convert4naToIupacna(const char* src, TSeqPos pos, 195 TSeqPos length, char* dst); 196 static SIZE_TYPE x_Convert4naTo2na(const char* src, TSeqPos pos, 197 TSeqPos length, char* dst); 198 static SIZE_TYPE x_Convert4naTo2naExpand(const char* src, TSeqPos pos, 199 TSeqPos length, char* dst); 200 static SIZE_TYPE x_Convert4naTo8na(const char* src, TSeqPos pos, 201 TSeqPos length, char* dst); 202 203 // ncbi8na (ncbi4na_expand) -> ... 204 static SIZE_TYPE x_Convert8naToIupacna(const char* src, TSeqPos pos, 205 TSeqPos length, char* dst); 206 static SIZE_TYPE x_Convert8naTo2na(const char* src, TSeqPos pos, 207 TSeqPos length, char* dst); 208 static SIZE_TYPE x_Convert8naTo2naExpand(const char* src, TSeqPos pos, 209 TSeqPos length, char* dst); 210 static SIZE_TYPE x_Convert8naTo4na(const char* src, TSeqPos pos, 211 TSeqPos length, char* dst); 212 213 // --- AA conversions: 214 215 // iupacaa -> ... 216 static SIZE_TYPE x_ConvertIupacaaToEaa(const char* src, TSeqPos pos, 217 TSeqPos length, char* dst); 218 static SIZE_TYPE x_ConvertIupacaaToStdaa(const char* src, TSeqPos pos, 219 TSeqPos length, char* dst); 220 221 // ncbieaa -> ... 222 static SIZE_TYPE x_ConvertEaaToIupacaa(const char* src, TSeqPos pos, 223 TSeqPos length, char* dst); 224 static SIZE_TYPE x_ConvertEaaToStdaa(const char* src, TSeqPos pos, 225 TSeqPos length, char* dst); 226 227 // ncbistdaa (ncbi8aa) -> ... 228 static SIZE_TYPE x_ConvertStdaaToIupacaa(const char* src, TSeqPos pos, 229 TSeqPos length, char* dst); 230 static SIZE_TYPE x_ConvertStdaaToEaa(const char* src, TSeqPos pos, 231 TSeqPos length, char* dst); 232 233 // Test for amibiguous bases (not A,C,G or T) starting at position 0. 234 static bool x_HasAmbig(const char* src, TCoding src_coding, size_t length); 235 static bool x_HasAmbigNcbi8na(const char* src, size_t length); 236 static bool x_HasAmbigNcbi4na(const char* src, size_t length); 237 static bool x_HasAmbigIupacna(const char* src, size_t length); 238 239 // Advanced packing 240 241 // General approach: always keep track of the best option ending 242 // in a full-width chunk, which may prove to be useful if a 243 // following short region wouldn't be worth the overhead. 244 // (Also, try to keep partial nucleotide bytes to a minimum.) 245 246 class CPacker { 247 public: CPacker(TCoding src_coding,const TCoding * best_coding,bool gaps_ok,IPackTarget & dst)248 CPacker(TCoding src_coding, const TCoding* best_coding, bool gaps_ok, 249 IPackTarget& dst) 250 : m_SrcCoding(src_coding), m_BestCoding(best_coding), 251 m_Target(dst), m_SrcDensity(GetBasesPerByte(src_coding)), 252 m_GapsOK(gaps_ok), m_WideCoding(x_GetWideCoding(src_coding)) 253 { } 254 ~CPacker(); 255 256 SIZE_TYPE Pack(const char* src, TSeqPos length); 257 258 private: 259 void x_AddBoundary(TSeqPos pos, TCoding new_coding); 260 static TCoding x_GetWideCoding(const TCoding coding); 261 262 struct SCodings { 263 enum { 264 kBlockSize = 16 265 }; GetLastCSeqConvert_imp::CPacker::SCodings266 TCoding GetLast(void) const { return current[current_used - 1]; } 267 TCoding current[16 /* kBlockSize */]; 268 SCodings* previous; 269 unsigned current_used; 270 }; 271 struct SArrangement { SArrangementCSeqConvert_imp::CPacker::SArrangement272 SArrangement() 273 : codings(NULL), shared_codings(NULL), cost(0) 274 { } ~SArrangementCSeqConvert_imp::CPacker::SArrangement275 ~SArrangement() 276 { Reset(); } 277 SArrangement& operator= (SArrangement& arr); 278 void Reset(void); 279 void AddCoding(TCoding coding); 280 SCodings* codings; 281 SCodings* shared_codings; // last common ancestor 282 SIZE_TYPE cost; 283 }; 284 285 const TCoding m_SrcCoding; 286 const TCoding* const m_BestCoding; 287 IPackTarget& m_Target; 288 const size_t m_SrcDensity; 289 const bool m_GapsOK; 290 const TCoding m_WideCoding; 291 292 vector<TSeqPos> m_Boundaries; 293 SArrangement m_EndingNarrow; 294 SArrangement m_EndingWide; 295 296 static const TCoding kNoCoding; 297 }; 298 }; 299 300 301 302 END_NCBI_SCOPE 303 304 305 #endif /* UTIL_SEQUTIL___SEQUTIL_CONVERT_IMP__HPP */ 306