1 #ifndef UTIL_SEQUTIL___SEQUTIL_CONVERT_IMP__HPP
2 #define UTIL_SEQUTIL___SEQUTIL_CONVERT_IMP__HPP
3 
/*  $Id: sequtil_convert_imp.hpp 495056 2016-03-14 16:14:57Z ucko $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  Mati Shomrat
 *
 * File Description:
 *   Declaration of CSeqConvert_imp, a class of static Convert, Subseq and
 *   Pack routines for sequence data, and of its nested CPacker helper.
 *
 */
34 #include <corelib/ncbistd.hpp>
35 
36 #include <util/sequtil/sequtil_convert.hpp>
37 #include "sequtil_shared.hpp"
38 
39 BEGIN_NCBI_SCOPE
40 
41 
42 class CSeqConvert_imp
43 {
44 public:
45     typedef CSeqUtil::TCoding TCoding;
46     typedef CSeqConvert::IPackTarget IPackTarget;
47 
48     // Conversion
49 
50     template <typename SrcCont, typename DstCont>
Convert(const SrcCont & src,TCoding src_coding,TSeqPos pos,TSeqPos length,DstCont & dst,TCoding dst_coding)51     static SIZE_TYPE Convert
52     (const SrcCont& src,
53      TCoding src_coding,
54      TSeqPos pos,
55      TSeqPos length,
56      DstCont& dst,
57      TCoding dst_coding)
58     {
59         _ASSERT(!OutOfRange(pos, src, src_coding));
60 
61         if ( src.empty()  ||  (length == 0) ) {
62             return 0;
63         }
64 
65         AdjustLength(src, src_coding, pos, length);
66         ResizeDst(dst, dst_coding, length);
67 
68         return Convert(&*src.begin(), src_coding, pos, length,
69             &*dst.begin(), dst_coding);
70     }
71 
72 
73     static SIZE_TYPE Convert(const char* src, TCoding src_coding,
74                              TSeqPos pos, TSeqPos length,
75                              char* dst, TCoding dst_coding);
76 
77     // Subseq
78 
79     template <typename SrcCont, typename DstCont>
Subseq(const SrcCont & src,TCoding coding,TSeqPos pos,TSeqPos length,DstCont & dst)80     static SIZE_TYPE Subseq
81     (const SrcCont& src,
82      TCoding coding,
83      TSeqPos pos,
84      TSeqPos length,
85      DstCont& dst)
86     {
87         _ASSERT(!OutOfRange(pos, src, coding));
88 
89         if ( src.empty()  ||  (length == 0) ) {
90             return 0;
91         }
92 
93         AdjustLength(src, coding, pos, length);
94         ResizeDst(dst, coding, length);
95 
96         return Subseq(&*src.begin(), coding, pos, length, &*dst.begin());
97     }
98 
99     static SIZE_TYPE Subseq(const char* src, TCoding coding,
100                             TSeqPos pos, TSeqPos length,
101                             char* dst);
102 
103     // Pack
104 
105     template <typename SrcCont, typename DstCont>
Pack(const SrcCont & src,TCoding src_coding,DstCont & dst,TCoding & dst_coding,TSeqPos length)106     static SIZE_TYPE Pack
107     (const SrcCont& src,
108      TCoding src_coding,
109      DstCont& dst,
110      TCoding& dst_coding,
111      TSeqPos length)
112     {
113         if ( src.empty()  ||  (length == 0) ) {
114             return 0;
115         }
116 
117         AdjustLength(src, src_coding, 0, length);
118         // we allocate enough memory for ncbi4na coding
119         // if the result will be ncbi2na coding we'll resize (see below)
120         ResizeDst(dst, CSeqUtil::e_Ncbi4na, length);
121 
122         SIZE_TYPE res = Pack(&*src.begin(), length, src_coding,
123                              &*dst.begin(), dst_coding);
124         if ( dst_coding == CSeqUtil::e_Ncbi2na ) {
125             size_t new_size = res / 4;
126             if ( (res % 4) != 0 ) {
127                 ++new_size;
128             }
129             dst.resize(new_size);
130         }
131         return res;
132     }
133 
134     static SIZE_TYPE Pack(const char* src, TSeqPos length, TCoding src_coding,
135                           char* dst, TCoding& dst_coding);
136 
137     template <typename SrcCont>
Pack(const SrcCont & src,TCoding src_coding,IPackTarget & dst,TSeqPos length)138     static SIZE_TYPE Pack
139     (const SrcCont& src,
140      TCoding src_coding,
141      IPackTarget& dst,
142      TSeqPos length)
143     {
144         if ( src.empty()  ||  (length == 0) ) {
145             return 0;
146         }
147 
148         AdjustLength(src, src_coding, 0, length);
149         return Pack(&*src.begin(), length, src_coding, dst);
150     }
151 
152     static SIZE_TYPE Pack(const char* src, TSeqPos length, TCoding src_coding,
153                           IPackTarget& dst);
154 
155 private:
156 
157     // Conversion methods:
158 
159     // --- NA conversions:
160 
161     // iupacna -> ...
162     static SIZE_TYPE x_ConvertIupacnaToIupacna(const char* src, TSeqPos pos,
163         TSeqPos length, char* dst);
164     static SIZE_TYPE x_ConvertIupacnaTo2na(const char* src, TSeqPos pos,
165         TSeqPos length, char* dst);
166     static SIZE_TYPE x_ConvertIupacnaTo2naExpand(const char* src, TSeqPos pos,
167         TSeqPos length, char* dst);
168     static SIZE_TYPE x_ConvertIupacnaTo4na(const char* src, TSeqPos pos,
169         TSeqPos length, char* dst);
170     static SIZE_TYPE x_ConvertIupacnaTo8na(const char* src, TSeqPos pos,
171         TSeqPos length, char* dst);
172 
173     // ncbi2na -> ...
174     static SIZE_TYPE x_Convert2naToIupacna(const char* src, TSeqPos pos,
175         TSeqPos length, char* dst);
176     static SIZE_TYPE x_Convert2naTo2naExpand(const char* src, TSeqPos pos,
177         TSeqPos length, char* dst);
178     static SIZE_TYPE x_Convert2naTo4na(const char* src, TSeqPos pos,
179         TSeqPos length, char* dst);
180     static SIZE_TYPE x_Convert2naTo8na(const char* src, TSeqPos pos,
181         TSeqPos length, char* dst);
182 
183     // ncbi2na_expand -> ...
184     static SIZE_TYPE x_Convert2naExpandToIupacna(const char* src, TSeqPos pos,
185         TSeqPos length, char* dst);
186     static SIZE_TYPE x_Convert2naExpandTo2na(const char* src, TSeqPos pos,
187         TSeqPos length, char* dst);
188     static SIZE_TYPE x_Convert2naExpandTo4na(const char* src, TSeqPos pos,
189         TSeqPos length, char* dst);
190     static SIZE_TYPE x_Convert2naExpandTo8na(const char* src, TSeqPos pos,
191         TSeqPos length, char* dst);
192 
193     // ncbi4na -> ...
194     static SIZE_TYPE x_Convert4naToIupacna(const char* src, TSeqPos pos,
195         TSeqPos length, char* dst);
196     static SIZE_TYPE x_Convert4naTo2na(const char* src, TSeqPos pos,
197         TSeqPos length, char* dst);
198     static SIZE_TYPE x_Convert4naTo2naExpand(const char* src, TSeqPos pos,
199         TSeqPos length, char* dst);
200     static SIZE_TYPE x_Convert4naTo8na(const char* src, TSeqPos pos,
201         TSeqPos length, char* dst);
202 
203     // ncbi8na (ncbi4na_expand) -> ...
204     static SIZE_TYPE x_Convert8naToIupacna(const char* src, TSeqPos pos,
205         TSeqPos length, char* dst);
206     static SIZE_TYPE x_Convert8naTo2na(const char* src, TSeqPos pos,
207         TSeqPos length, char* dst);
208     static SIZE_TYPE x_Convert8naTo2naExpand(const char* src, TSeqPos pos,
209         TSeqPos length, char* dst);
210     static SIZE_TYPE x_Convert8naTo4na(const char* src, TSeqPos pos,
211         TSeqPos length, char* dst);
212 
213     // --- AA conversions:
214 
215     // iupacaa -> ...
216     static SIZE_TYPE x_ConvertIupacaaToEaa(const char* src, TSeqPos pos,
217         TSeqPos length, char* dst);
218     static SIZE_TYPE x_ConvertIupacaaToStdaa(const char* src, TSeqPos pos,
219         TSeqPos length, char* dst);
220 
221     // ncbieaa -> ...
222     static SIZE_TYPE x_ConvertEaaToIupacaa(const char* src, TSeqPos pos,
223         TSeqPos length, char* dst);
224     static SIZE_TYPE x_ConvertEaaToStdaa(const char* src, TSeqPos pos,
225         TSeqPos length, char* dst);
226 
227     // ncbistdaa (ncbi8aa) -> ...
228     static SIZE_TYPE x_ConvertStdaaToIupacaa(const char* src, TSeqPos pos,
229         TSeqPos length, char* dst);
230     static SIZE_TYPE x_ConvertStdaaToEaa(const char* src, TSeqPos pos,
231         TSeqPos length, char* dst);
232 
233     // Test for amibiguous bases (not A,C,G or T) starting at position 0.
234     static bool x_HasAmbig(const char* src, TCoding src_coding, size_t length);
235     static bool x_HasAmbigNcbi8na(const char* src, size_t length);
236     static bool x_HasAmbigNcbi4na(const char* src, size_t length);
237     static bool x_HasAmbigIupacna(const char* src, size_t length);
238 
239     // Advanced packing
240 
241     // General approach: always keep track of the best option ending
242     // in a full-width chunk, which may prove to be useful if a
243     // following short region wouldn't be worth the overhead.
244     // (Also, try to keep partial nucleotide bytes to a minimum.)
245 
246     class CPacker {
247     public:
CPacker(TCoding src_coding,const TCoding * best_coding,bool gaps_ok,IPackTarget & dst)248         CPacker(TCoding src_coding, const TCoding* best_coding, bool gaps_ok,
249                 IPackTarget& dst)
250             : m_SrcCoding(src_coding), m_BestCoding(best_coding),
251               m_Target(dst), m_SrcDensity(GetBasesPerByte(src_coding)),
252               m_GapsOK(gaps_ok), m_WideCoding(x_GetWideCoding(src_coding))
253             { }
254         ~CPacker();
255 
256         SIZE_TYPE Pack(const char* src, TSeqPos length);
257 
258     private:
259         void x_AddBoundary(TSeqPos pos, TCoding new_coding);
260         static TCoding x_GetWideCoding(const TCoding coding);
261 
262         struct SCodings {
263             enum {
264                 kBlockSize = 16
265             };
GetLastCSeqConvert_imp::CPacker::SCodings266             TCoding GetLast(void) const { return current[current_used - 1]; }
267             TCoding   current[16 /* kBlockSize */];
268             SCodings* previous;
269             unsigned  current_used;
270         };
271         struct SArrangement {
SArrangementCSeqConvert_imp::CPacker::SArrangement272             SArrangement()
273                 : codings(NULL), shared_codings(NULL), cost(0)
274                 { }
~SArrangementCSeqConvert_imp::CPacker::SArrangement275             ~SArrangement()
276                 { Reset(); }
277             SArrangement& operator= (SArrangement& arr);
278             void Reset(void);
279             void AddCoding(TCoding coding);
280             SCodings* codings;
281             SCodings* shared_codings; // last common ancestor
282             SIZE_TYPE cost;
283         };
284 
285         const TCoding        m_SrcCoding;
286         const TCoding* const m_BestCoding;
287         IPackTarget&         m_Target;
288         const size_t         m_SrcDensity;
289         const bool           m_GapsOK;
290         const TCoding        m_WideCoding;
291 
292         vector<TSeqPos> m_Boundaries;
293         SArrangement    m_EndingNarrow;
294         SArrangement    m_EndingWide;
295 
296         static const TCoding kNoCoding;
297     };
298 };
299 
300 
301 
302 END_NCBI_SCOPE
303 
304 
305 #endif  /* UTIL_SEQUTIL___SEQUTIL_CONVERT_IMP__HPP */
306