1  /*$Id: seqport_util.cpp 634628 2021-07-15 17:30:59Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Clifford Clausen
27  *          (also reviewed/fixed/groomed by Denis Vakatov and Aaron Ucko)
28  *
29  * File Description:
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <objects/seq/seqport_util.hpp>
34 
35 #include <serial/serial.hpp>
36 #include <serial/objostr.hpp>
37 #include <serial/objistr.hpp>
38 
39 #include <objects/seq/NCBI2na.hpp>
40 #include <objects/seq/NCBI4na.hpp>
41 #include <objects/seq/NCBI8na.hpp>
42 #include <objects/seq/NCBI8aa.hpp>
43 #include <objects/seq/IUPACna.hpp>
44 #include <objects/seq/IUPACaa.hpp>
45 #include <objects/seq/NCBIeaa.hpp>
46 #include <objects/seq/NCBIstdaa.hpp>
47 #include <objects/seq/NCBIpaa.hpp>
48 
49 #include <objects/seqcode/Seq_code_set.hpp>
50 #include <objects/seqcode/Seq_code_table.hpp>
51 #include <objects/seqcode/Seq_code_type.hpp>
52 #include <objects/seqcode/Seq_map_table.hpp>
53 
54 #include <util/sequtil/sequtil.hpp>
55 #include <util/sequtil/sequtil_convert.hpp>
56 #include <util/sequtil/sequtil_manip.hpp>
57 #include <util/random_gen.hpp>
58 #include <corelib/ncbi_safe_static.hpp>
59 
60 #include <algorithm>
61 #include <string.h>
62 
63 
64 BEGIN_NCBI_SCOPE
65 BEGIN_objects_SCOPE
66 
67 static const bool kSymbol = true;
68 static const bool kName = false;
69 static const unsigned int kNumCodes = 11;
70 
EChoiceToESeq(CSeq_data::E_Choice from_type)71 static inline ESeq_code_type EChoiceToESeq (CSeq_data::E_Choice from_type)
72 {
73     switch (from_type) {
74     case CSeq_data::e_Iupacaa:
75         return eSeq_code_type_iupacaa;
76     case CSeq_data::e_Ncbi2na:
77         return eSeq_code_type_ncbi2na;
78     case CSeq_data::e_Ncbi4na:
79         return eSeq_code_type_ncbi4na;
80     case CSeq_data::e_Iupacna:
81         return eSeq_code_type_iupacna;
82     case CSeq_data::e_Ncbieaa:
83         return eSeq_code_type_ncbieaa;
84     case CSeq_data::e_Ncbistdaa:
85         return eSeq_code_type_ncbistdaa;
86     case CSeq_data::e_Ncbi8na:
87         return eSeq_code_type_ncbi8na;
88     case CSeq_data::e_Ncbipna:
89         return eSeq_code_type_ncbipna;
90     case CSeq_data::e_Ncbi8aa:
91         return eSeq_code_type_ncbi8aa;
92     case CSeq_data::e_Ncbipaa:
93         return eSeq_code_type_ncbipaa;
94     default:
95         throw CSeqportUtil::CBadType("EChoiceToESeq");
96     }
97 }
98 
99 // CSeqportUtil_implementation is a singleton.
100 
101 class CSeqportUtil_implementation {
102 public:
103     CSeqportUtil_implementation();
104     ~CSeqportUtil_implementation();
105 
106     typedef CSeqportUtil::TIndex TIndex;
107     typedef CSeqportUtil::TPair  TPair;
108 
109     TSeqPos Convert
110     (const CSeq_data&      in_seq,
111      CSeq_data*            out_seq,
112      CSeq_data::E_Choice   to_code,
113      TSeqPos               uBeginIdx,
114      TSeqPos               uLength,
115      bool                  bAmbig,
116      CRandom::TValue       seed,
117      TSeqPos               total_length = 0,
118      TSeqPos*              out_seq_length = 0,
119      vector<Uint4>*        blast_ambig = 0)
120         const;
121 
122     TSeqPos Pack
123     (CSeq_data*   in_seq,
124      TSeqPos      uLength)
125         const;
126 
127     bool FastValidate
128     (const CSeq_data&   in_seq,
129      TSeqPos            uBeginIdx,
130      TSeqPos            uLength)
131         const;
132 
133     void Validate
134     (const CSeq_data&   in_seq,
135      vector<TSeqPos>*   badIdx,
136      TSeqPos            uBeginIdx,
137      TSeqPos            uLength)
138         const;
139 
140     TSeqPos GetAmbigs
141     (const CSeq_data&      in_seq,
142      CSeq_data*            out_seq,
143      vector<TSeqPos>*      out_indices,
144      CSeq_data::E_Choice   to_code,
145      TSeqPos               uBeginIdx,
146      TSeqPos               uLength)
147         const;
148 
149     TSeqPos GetCopy
150     (const CSeq_data&   in_seq,
151      CSeq_data*         out_seq,
152      TSeqPos            uBeginIdx,
153      TSeqPos            uLength)
154         const;
155 
156     TSeqPos Keep
157     (CSeq_data*   in_seq,
158      TSeqPos      uBeginIdx,
159      TSeqPos      uLength)
160         const;
161 
162     TSeqPos Append
163     (CSeq_data*         out_seq,
164      const CSeq_data&   in_seq1,
165      TSeqPos            uBeginIdx1,
166      TSeqPos            uLength1,
167      const CSeq_data&   in_seq2,
168      TSeqPos            uBeginIdx2,
169      TSeqPos            uLength2)
170         const;
171 
172     TSeqPos Complement
173     (CSeq_data*   in_seq,
174      TSeqPos      uBeginIdx,
175      TSeqPos      uLength)
176         const;
177 
178     TSeqPos Complement
179     (const CSeq_data&   in_seq,
180      CSeq_data*         out_seq,
181      TSeqPos            uBeginIdx,
182      TSeqPos            uLength)
183         const;
184 
185     TSeqPos Reverse
186     (CSeq_data*   in_seq,
187      TSeqPos      uBeginIdx,
188      TSeqPos      uLength)
189         const;
190 
191     TSeqPos Reverse
192     (const CSeq_data&  in_seq,
193      CSeq_data*        out_seq,
194      TSeqPos           uBeginIdx,
195      TSeqPos           uLength)
196         const;
197 
198     TSeqPos ReverseComplement
199     (CSeq_data*   in_seq,
200      TSeqPos      uBeginIdx,
201      TSeqPos      uLength)
202         const;
203 
204     TSeqPos ReverseComplement
205     (const CSeq_data&   in_seq,
206      CSeq_data*         out_seq,
207      TSeqPos            uBeginIdx,
208      TSeqPos            uLength)
209         const;
210 
211     const string& GetIupacaa3(TIndex ncbistdaa);
212 
213     bool IsCodeAvailable(CSeq_data::E_Choice code_type);
214 
215     bool IsCodeAvailable(ESeq_code_type code_type);
216 
217     TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type);
218 
219     TPair GetCodeIndexFromTo(ESeq_code_type code_type);
220 
221     const string& GetCodeOrName(CSeq_data::E_Choice code_type,
222                                 TIndex              idx,
223                                 bool                get_code);
224 
225     const string& GetCodeOrName(ESeq_code_type code_type,
226                                 TIndex         idx,
227                                 bool           get_code);
228 
229     TIndex GetIndex(CSeq_data::E_Choice code_type,
230                     const string&       code);
231 
232     TIndex GetIndex(ESeq_code_type code_type,
233                     const string&       code);
234 
235     TIndex GetIndexComplement(CSeq_data::E_Choice code_type,
236                               TIndex              idx);
237 
238     TIndex GetIndexComplement(ESeq_code_type code_type,
239                               TIndex         idx);
240 
241     TIndex GetMapToIndex(CSeq_data::E_Choice from_type,
242                          CSeq_data::E_Choice to_type,
243                          TIndex              from_idx);
244 
245     TIndex GetMapToIndex(ESeq_code_type from_type,
246                          ESeq_code_type to_type,
247                          TIndex         from_idx);
248     // Template wrapper class used to create data type specific
249     // classes to delete code tables on exit from main
250     template <class T>
251     class CWrapper_table : public CObject
252     {
253     public:
CWrapper_table(int size,int start)254         CWrapper_table(int size, int start)
255         {
256             m_Table   = new T[256];
257             m_StartAt = start;
258             m_Size    = size;
259         }
~CWrapper_table()260         ~CWrapper_table() {
261             drop_table();
262         }
drop_table()263         void drop_table()
264         {
265             delete[] m_Table;
266             m_Table = 0;
267         }
268 
269         T*  m_Table;
270         int m_StartAt;
271         int m_Size;
272     };
273 
274     // Template wrapper class used for two-dimensional arrays.
275     template <class T>
276     class CWrapper_2D : public CObject
277     {
278     public:
CWrapper_2D(int size1,int start1,int size2,int start2)279         CWrapper_2D(int size1, int start1, int size2, int start2)
280         {
281             m_Size_D1 = size1;
282             m_Size_D2 = size2;
283             m_StartAt_D1 = start1;
284             m_StartAt_D2 = start2;
285             m_Table = new T*[size1];
286             for(int i=0; i<size1; i++)
287                 {
288                     m_Table[i] = new T[size2] - start2;
289                 }
290             m_Table -= start1;
291         }
~CWrapper_2D()292         ~CWrapper_2D()
293         {
294             m_Table += m_StartAt_D1;
295             for(int i=0; i<m_Size_D1; i++)
296                 {
297                     delete[](m_Table[i] + m_StartAt_D2);
298                 }
299             delete[] m_Table;
300         }
301 
302         T** m_Table;
303         int m_Size_D1;
304         int m_Size_D2;
305         int m_StartAt_D1;
306         int m_StartAt_D2;
307     };
308 
309     // Typedefs making use of wrapper classes above.
310     typedef CWrapper_table<char>           CCode_table;
311     typedef CWrapper_table<string>         CCode_table_str;
312     typedef CWrapper_table<int>            CMap_table;
313     typedef CWrapper_table<unsigned int>   CFast_table4;
314     typedef CWrapper_table<unsigned short> CFast_table2;
315     typedef CWrapper_table<unsigned char>  CAmbig_detect;
316     typedef CWrapper_table<char>           CCode_comp;
317     typedef CWrapper_table<char>           CCode_rev;
318 
319     typedef CWrapper_2D<unsigned char>     CFast_4_1;
320     typedef CWrapper_2D<unsigned char>     CFast_2_1;
321 
322 private:
323     // String to initialize CSeq_code_set
324     // This string is initialized in seqport_util.h
325     static const char* sm_StrAsnData[];
326 
327     // CSeq_code_set member holding code and map table data
328     CRef<CSeq_code_set> m_SeqCodeSet;
329 
330     // Helper function used internally to initialize m_SeqCodeSet
331     CRef<CSeq_code_set> Init();
332 
333     // Member variables holding code tables
334     CRef<CCode_table> m_Iupacna;
335     CRef<CCode_table> m_Ncbieaa;
336     CRef<CCode_table> m_Ncbistdaa;
337     CRef<CCode_table> m_Iupacaa;
338 
339     // Helper function to initialize code tables
340     CRef<CCode_table> InitCodes(ESeq_code_type code_type);
341 
342     // Member variables holding na complement information
343     CRef<CCode_comp> m_Iupacna_complement;
344     CRef<CCode_comp> m_Ncbi2naComplement;
345     CRef<CCode_comp> m_Ncbi4naComplement;
346 
347     // Helper functions to initialize complement tables
348     CRef<CCode_comp> InitIupacnaComplement();
349     CRef<CCode_comp> InitNcbi2naComplement();
350     CRef<CCode_comp> InitNcbi4naComplement();
351 
352     // Member variables holding na reverse information
353     // Used to reverse residues packed within a byte.
354     CRef<CCode_rev> m_Ncbi2naRev;
355     CRef<CCode_rev> m_Ncbi4naRev;
356 
357     // Helper functions to initialize reverse tables
358     CRef<CCode_rev> InitNcbi2naRev();
359     CRef<CCode_rev> InitNcbi4naRev();
360 
361     // Member variables holding map tables
362 
363     CRef<CMap_table> m_Ncbi2naIupacna;
364     CRef<CMap_table> m_Ncbi2naNcbi4na;
365     CRef<CMap_table> m_Ncbi4naIupacna;
366     CRef<CMap_table> m_IupacnaNcbi2na;
367     CRef<CMap_table> m_IupacnaNcbi4na;
368     CRef<CMap_table> m_Ncbi4naNcbi2na;
369     CRef<CMap_table> m_IupacaaNcbieaa;
370     CRef<CMap_table> m_NcbieaaIupacaa;
371     CRef<CMap_table> m_IupacaaNcbistdaa;
372     CRef<CMap_table> m_NcbieaaNcbistdaa;
373     CRef<CMap_table> m_NcbistdaaNcbieaa;
374     CRef<CMap_table> m_NcbistdaaIupacaa;
375 
376 
377     TSeqPos x_ConvertAmbig
378     (const CSeq_data&      in_seq,
379      CSeq_data*            out_seq,
380      CSeq_data::E_Choice   to_code,
381      TSeqPos               uBeginIdx,
382      TSeqPos               uLength,
383      CRandom::TValue       seed,
384      TSeqPos               total_length = 0,
385      TSeqPos*              out_seq_length = 0,
386      vector<Uint4>*        blast_ambig = 0)
387         const;
388 
389     // Helper function to initialize map tables
390     CRef<CMap_table> InitMaps(ESeq_code_type from_type,
391                               ESeq_code_type to_type);
392 
393     // Member variables holding fast conversion tables
394 
395     // Takes a byte as an index and returns a unsigned int with
396     // 4 characters, each character being one of ATGC
397     //CRef<CFast_table4> m_FastNcbi2naIupacna;
398 
399     // Takes a byte (each byte with 4 Ncbi2na codes) as an index and
400     // returns a Unit2 with 2 bytes, each byte formated as 2 Ncbi4na codes
401     //CRef<CFast_table2> m_FastNcbi2naNcbi4na;
402 
403     // Takes a byte (each byte with 2 Ncbi4na codes) as an index and
404     // returns a 2 byte string, each byte with an Iupacna code.
405     //CRef<CFast_table2> m_FastNcbi4naIupacna;
406 
407     // Table used for fast compression from Iupacna to Ncbi2na (4 bytes to 1
408     // byte). This table is a 2 dimensional table. The first dimension
409     // corresponds to the iupacna position modulo 4 (0-3). The second dimension
410     //  is the value of the iupacna byte (0-255). The 4 resulting values from 4
411     // iupancna bytes are bitwise or'd to produce 1 byte.
412     CRef<CFast_4_1> m_FastIupacnaNcbi2na;
413 
414     // Table used for fast compression from Iupacna to Ncbi4na
415     // (2 bytes to 1 byte). Similar to m_FastIupacnaNcbi2na
416     CRef<CFast_2_1> m_FastIupacnaNcbi4na;
417 
418     // Table used for fast compression from Ncbi4na to Ncbi2na
419     // (2 bytes to 1 byte). Similar to m_FastIupacnaNcbi4na
420     CRef<CFast_2_1> m_FastNcbi4naNcbi2na;
421 
422     // Tables used to convert an index for a code type to a symbol or name
423     // for the same code type
424     vector<vector<string> > m_IndexString[2];
425     vector<vector<TIndex> > m_IndexComplement;
426     vector<map<string, TIndex> > m_StringIndex;
427     vector<TIndex> m_StartAt;
428 
429     // Helper function to initialize fast conversion tables
430     //CRef<CFast_table4> InitFastNcbi2naIupacna();
431     CRef<CFast_table2> InitFastNcbi2naNcbi4na();
432     CRef<CFast_table2> InitFastNcbi4naIupacna();
433     CRef<CFast_4_1>    InitFastIupacnaNcbi2na();
434     CRef<CFast_2_1>    InitFastIupacnaNcbi4na();
435     CRef<CFast_2_1>    InitFastNcbi4naNcbi2na();
436 
437     // Helper functions to initialize Index to/from code/name conversion tables
438     // and complement tables
439     void               InitIndexCodeName();
440 
441     // Data members and functions used for random disambiguation
442 
443     // structure used for ncbi4na --> ncbi2na
444     struct SMasksArray : public CObject
445     {
446         // Structure to hold all masks applicable to an input byte
447         struct SMasks {
448             int           nMasks;
449             unsigned char cMask[16];
450         };
451         SMasks m_Table[256];
452     };
453 
454     CRef<SMasksArray> m_Masks;
455 
456     // Helper function to initialize m_Masks
457     CRef<SMasksArray> InitMasks();
458 
459     // Data members used for detecting ambiguities
460 
461     // Data members used by GetAmbig methods to get a list of
462     // ambiguities resulting from alphabet conversions
463     CRef<CAmbig_detect> m_DetectAmbigNcbi4naNcbi2na;
464     CRef<CAmbig_detect> m_DetectAmbigIupacnaNcbi2na;
465 
466     // Helper functiond to initialize m_Detect_Ambig_ data members
467     CRef<CAmbig_detect> InitAmbigNcbi4naNcbi2na();
468     CRef<CAmbig_detect> InitAmbigIupacnaNcbi2na();
469 
470     // Alphabet conversion functions. Functions return
471     // the number of converted codes.
472 
473     /*
474     // Fuction to convert ncbi2na (1 byte) to iupacna (4 bytes)
475     TSeqPos MapNcbi2naToIupacna(const CSeq_data&  in_seq,
476                                 CSeq_data*        out_seq,
477                                 TSeqPos           uBeginIdx,
478                                 TSeqPos           uLength)
479         const;
480 
481     // Function to convert ncbi2na (1 byte) to ncbi4na (2 bytes)
482     TSeqPos MapNcbi2naToNcbi4na(const CSeq_data&  in_seq,
483                                 CSeq_data*        out_seq,
484                                 TSeqPos           uBeginIdx,
485                                 TSeqPos           uLength)
486         const;
487 
488     // Function to convert ncbi4na (1 byte) to iupacna (2 bytes)
489     TSeqPos MapNcbi4naToIupacna(const CSeq_data& in_seq,
490                                 CSeq_data*       out_seq,
491                                 TSeqPos          uBeginIdx,
492                                 TSeqPos          uLength)
493         const;
494     */
495     // Function to convert iupacna (4 bytes) to ncbi2na (1 byte)
496     TSeqPos MapIupacnaToNcbi2na(const CSeq_data& in_seq,
497                                 CSeq_data*       out_seq,
498                                 TSeqPos          uBeginIdx,
499                                 TSeqPos          uLength,
500                                 bool             bAmbig,
501                                 CRandom::TValue  seed,
502                                 TSeqPos          total_length,
503                                 TSeqPos*         out_seq_length,
504                                 vector<Uint4>*   blast_ambig)
505         const;
506     /*
507 
508     // Function to convert iupacna (2 bytes) to ncbi4na (1 byte)
509     TSeqPos MapIupacnaToNcbi4na(const CSeq_data& in_seq,
510                                 CSeq_data*       out_seq,
511                                 TSeqPos          uBeginIdx,
512                                 TSeqPos          uLength)
513         const;
514     */
515     // Function to convert ncbi4na (2 bytes) to ncbi2na (1 byte)
516     TSeqPos MapNcbi4naToNcbi2na(const CSeq_data& in_seq,
517                                 CSeq_data*      out_seq,
518                                 TSeqPos         uBeginIdx,
519                                 TSeqPos         uLength,
520                                 bool            bAmbig,
521                                 CRandom::TValue seed,
522                                 TSeqPos         total_length,
523                                 TSeqPos*        out_seq_length,
524                                 vector<Uint4>*  blast_ambig)
525         const;
526     /*
527 
528     // Function to convert iupacaa (byte) to ncbieaa (byte)
529     TSeqPos MapIupacaaToNcbieaa(const CSeq_data& in_seq,
530                                 CSeq_data*       out_seq,
531                                 TSeqPos          uBeginIdx,
532                                 TSeqPos          uLength) const;
533 
534     // Function to convert ncbieaa (byte) to iupacaa (byte)
535     TSeqPos MapNcbieaaToIupacaa(const CSeq_data& in_seq,
536                                 CSeq_data*       out_seq,
537                                 TSeqPos          uBeginIdx,
538                                 TSeqPos          uLength)
539         const;
540 
541     // Function to convert iupacaa (byte) to ncbistdaa (byte)
542     TSeqPos MapIupacaaToNcbistdaa(const CSeq_data& in_seq,
543                                   CSeq_data*       out_seq,
544                                   TSeqPos          uBeginIdx,
545                                   TSeqPos          uLength)
546         const;
547 
548     // Function to convert ncbieaa (byte) to ncbistdaa (byte)
549     TSeqPos MapNcbieaaToNcbistdaa(const CSeq_data& in_seq,
550                                   CSeq_data*       out_seq,
551                                   TSeqPos          uBeginIdx,
552                                   TSeqPos          uLength)
553         const;
554 
555     // Function to convert ncbistdaa (byte) to ncbieaa (byte)
556     TSeqPos MapNcbistdaaToNcbieaa(const CSeq_data& in_seq,
557                                   CSeq_data*       out_seq,
558                                   TSeqPos          uBeginIdx,
559                                   TSeqPos          uLength)
560         const;
561 
562     // Function to convert ncbistdaa (byte) to iupacaa (byte)
563     TSeqPos MapNcbistdaaToIupacaa(const CSeq_data& in_seq,
564                                   CSeq_data*       out_seq,
565                                   TSeqPos          uBeginIdx,
566                                   TSeqPos          uLength)
567         const;
568         */
569 
570     // Fast Validation functions
571     bool FastValidateIupacna(const CSeq_data& in_seq,
572                              TSeqPos          uBeginIdx,
573                              TSeqPos          uLength)
574         const;
575 
576     bool FastValidateNcbieaa(const CSeq_data& in_seq,
577                              TSeqPos          uBeginIdx,
578                              TSeqPos          uLength)
579         const;
580 
581 
582     bool FastValidateNcbistdaa(const CSeq_data& in_seq,
583                                TSeqPos          uBeginIdx,
584                                TSeqPos          uLength)
585         const;
586 
587 
588     bool FastValidateIupacaa(const CSeq_data& in_seq,
589                              TSeqPos          uBeginIdx,
590                              TSeqPos          uLength)
591         const;
592 
593     // Full Validation functions
594     void ValidateIupacna(const CSeq_data&       in_seq,
595                          vector<TSeqPos>*       badIdx,
596                          TSeqPos                uBeginIdx,
597                          TSeqPos                uLength)
598         const;
599 
600     void ValidateNcbieaa(const CSeq_data&       in_seq,
601                          vector<TSeqPos>*       badIdx,
602                          TSeqPos                uBeginIdx,
603                          TSeqPos                uLength)
604         const;
605 
606     void ValidateNcbistdaa(const CSeq_data&       in_seq,
607                            vector<TSeqPos>*       badIdx,
608                            TSeqPos                uBeginIdx,
609                            TSeqPos                uLength)
610         const;
611 
612     void ValidateIupacaa(const CSeq_data&       in_seq,
613                          vector<TSeqPos>*       badIdx,
614                          TSeqPos                uBeginIdx,
615                          TSeqPos                uLength)
616         const;
617 
618     // Functions to make copies of the different types of sequences
619     TSeqPos GetNcbi2naCopy(const CSeq_data& in_seq,
620                            CSeq_data*       out_seq,
621                            TSeqPos          uBeginIdx,
622                            TSeqPos          uLength)
623         const;
624 
625     TSeqPos GetNcbi4naCopy(const CSeq_data& in_seq,
626                            CSeq_data*       out_seq,
627                            TSeqPos          uBeginIdx,
628                            TSeqPos          uLength)
629         const;
630 
631     TSeqPos GetIupacnaCopy(const CSeq_data& in_seq,
632                            CSeq_data*       out_seq,
633                            TSeqPos          uBeginIdx,
634                            TSeqPos          uLength)
635         const;
636 
637     TSeqPos GetNcbieaaCopy(const CSeq_data& in_seq,
638                            CSeq_data*       out_seq,
639                            TSeqPos          uBeginIdx,
640                            TSeqPos          uLength)
641         const;
642 
643     TSeqPos GetNcbistdaaCopy(const CSeq_data& in_seq,
644                              CSeq_data*       out_seq,
645                              TSeqPos          uBeginIdx,
646                              TSeqPos          uLength)
647         const;
648 
649     TSeqPos GetIupacaaCopy(const CSeq_data& in_seq,
650                            CSeq_data*       out_seq,
651                            TSeqPos          uBeginIdx,
652                            TSeqPos          uLength)
653         const;
654 
655     // Function to adjust uBeginIdx to lie on an in_seq byte boundary
656     // and uLength to lie on on an out_seq byte boundary. Returns
657     // overhang, the number of out seqs beyond byte boundary determined
658     // by uBeginIdx + uLength
659     TSeqPos Adjust(TSeqPos* uBeginIdx,
660                    TSeqPos* uLength,
661                    TSeqPos  uInSeqBytes,
662                    TSeqPos  uInSeqsPerByte,
663                    TSeqPos  uOutSeqsPerByte)
664         const;
665 
666     // GetAmbig methods
667 
668     // Loops through an ncbi4na input sequence and determines
669     // the ambiguities that would result from conversion to an ncbi2na sequence
670     // On return, out_seq contains the ncbi4na bases that become ambiguous and
671     // out_indices contains the indices of the abiguous bases in in_seq
672     TSeqPos GetAmbigs_ncbi4na_ncbi2na(const CSeq_data&  in_seq,
673                                       CSeq_data*        out_seq,
674                                       vector<TSeqPos>*  out_indices,
675                                       TSeqPos           uBeginIdx,
676                                       TSeqPos           uLength)
677         const;
678 
679     // Loops through an iupacna input sequence and determines
680     // the ambiguities that would result from conversion to an ncbi2na sequence
681     // On return, out_seq contains the iupacna bases that become ambiguous and
682     // out_indices contains the indices of the abiguous bases in in_seq. The
683     // return is the number of ambiguities found.
684     TSeqPos GetAmbigs_iupacna_ncbi2na(const CSeq_data&  in_seq,
685                                       CSeq_data*        out_seq,
686                                       vector<TSeqPos>*  out_indices,
687                                       TSeqPos           uBeginIdx,
688                                       TSeqPos           uLength)
689         const;
690 
691     // Methods to perform Keep on specific seq types. Methods
692     // return length of kept sequence.
693     TSeqPos KeepNcbi2na(CSeq_data*  in_seq,
694                         TSeqPos     uBeginIdx,
695                         TSeqPos     uLength)
696         const;
697 
698     TSeqPos KeepNcbi4na(CSeq_data*  in_seq,
699                         TSeqPos     uBeginIdx,
700                         TSeqPos     uLength)
701         const;
702 
703     TSeqPos KeepIupacna(CSeq_data*  in_seq,
704                         TSeqPos     uBeginIdx,
705                         TSeqPos     uLength)
706         const;
707 
708     TSeqPos KeepNcbieaa(CSeq_data*  in_seq,
709                         TSeqPos     uBeginIdx,
710                         TSeqPos     uLength)
711         const;
712 
713     TSeqPos KeepNcbistdaa(CSeq_data*  in_seq,
714                           TSeqPos     uBeginIdx,
715                           TSeqPos     uLength)
716         const;
717 
718     TSeqPos KeepIupacaa(CSeq_data*  in_seq,
719                         TSeqPos     uBeginIdx,
720                         TSeqPos     uLength)
721         const;
722 
723     // Methods to complement na sequences
724 
725     // In place methods. Return number of complemented residues.
726     TSeqPos ComplementIupacna(CSeq_data*  in_seq,
727                               TSeqPos     uBeginIdx,
728                               TSeqPos    uLength)
729         const;
730 
731     TSeqPos ComplementNcbi2na(CSeq_data*  in_seq,
732                               TSeqPos     uBeginIdx,
733                               TSeqPos     uLength)
734         const;
735 
736     TSeqPos ComplementNcbi4na(CSeq_data*  in_seq,
737                               TSeqPos     uBeginIdx,
738                               TSeqPos     uLength)
739         const;
740 
741 
742     // Complement in copy methods
743     TSeqPos ComplementIupacna(const CSeq_data&  in_seq,
744                               CSeq_data*        out_seq,
745                               TSeqPos           uBeginIdx,
746                               TSeqPos           uLength)
747         const;
748 
749     TSeqPos ComplementNcbi2na(const CSeq_data&  in_seq,
750                               CSeq_data*        out_seq,
751                               TSeqPos           uBeginIdx,
752                               TSeqPos           uLength)
753         const;
754 
755     TSeqPos ComplementNcbi4na(const CSeq_data&  in_seq,
756                               CSeq_data*        out_seq,
757                               TSeqPos           uBeginIdx,
758                               TSeqPos           uLength)
759         const;
760 
761 
762     // Methods to reverse na sequences
763 
764     // In place methods
765     TSeqPos ReverseIupacna(CSeq_data*  in_seq,
766                            TSeqPos     uBeginIdx,
767                            TSeqPos     uLength)
768         const;
769 
770     TSeqPos ReverseNcbi2na(CSeq_data*  in_seq,
771                            TSeqPos     uBeginIdx,
772                            TSeqPos     uLength)
773         const;
774 
775     TSeqPos ReverseNcbi4na(CSeq_data*  in_seq,
776                            TSeqPos     uBeginIdx,
777                            TSeqPos     uLength)
778         const;
779 
780     // Reverse in copy methods
781     TSeqPos ReverseIupacna(const CSeq_data&  in_seq,
782                            CSeq_data*        out_seq,
783                            TSeqPos           uBeginIdx,
784                            TSeqPos           uLength)
785         const;
786 
787     TSeqPos ReverseNcbi2na(const CSeq_data&  in_seq,
788                            CSeq_data*        out_seq,
789                            TSeqPos           uBeginIdx,
790                            TSeqPos           uLength)
791         const;
792 
793     TSeqPos ReverseNcbi4na(const CSeq_data&  in_seq,
794                            CSeq_data*        out_seq,
795                            TSeqPos           uBeginIdx,
796                            TSeqPos           uLength)
797         const;
798 
799     // Methods to reverse-complement an na sequences
800 
801     // In place methods
802     TSeqPos ReverseComplementIupacna(CSeq_data*  in_seq,
803                                      TSeqPos     uBeginIdx,
804                                      TSeqPos     uLength)
805         const;
806 
807     TSeqPos ReverseComplementNcbi2na(CSeq_data*  in_seq,
808                                      TSeqPos     uBeginIdx,
809                                      TSeqPos     uLength)
810         const;
811 
812     TSeqPos ReverseComplementNcbi4na(CSeq_data*  in_seq,
813                                      TSeqPos     uBeginIdx,
814                                      TSeqPos     uLength)
815         const;
816 
817     // Reverse in copy methods
818     TSeqPos ReverseComplementIupacna(const CSeq_data& in_seq,
819                                      CSeq_data*       out_seq,
820                                      TSeqPos          uBeginIdx,
821                                      TSeqPos          uLength)
822         const;
823 
824     TSeqPos ReverseComplementNcbi2na(const CSeq_data& in_seq,
825                                      CSeq_data*       out_seq,
826                                      TSeqPos          uBeginIdx,
827                                      TSeqPos          uLength)
828         const;
829 
830     TSeqPos ReverseComplementNcbi4na(const CSeq_data& in_seq,
831                                      CSeq_data*       out_seq,
832                                      TSeqPos          uBeginIdx,
833                                      TSeqPos          uLength)
834         const;
835 
836     // Append methods
837     TSeqPos AppendIupacna(CSeq_data*        out_seq,
838                           const CSeq_data&  in_seq1,
839                           TSeqPos           uBeginIdx1,
840                           TSeqPos           uLength1,
841                           const CSeq_data&  in_seq2,
842                           TSeqPos           uBeginIdx2,
843                           TSeqPos           uLength2)
844         const;
845 
846     TSeqPos AppendNcbi2na(CSeq_data*          out_seq,
847                           const CSeq_data&  in_seq1,
848                           TSeqPos           uBeginIdx1,
849                           TSeqPos           uLength1,
850                           const CSeq_data&  in_seq2,
851                           TSeqPos           uBeginIdx2,
852                           TSeqPos           uLength2)
853         const;
854 
855     TSeqPos AppendNcbi4na(CSeq_data*          out_seq,
856                           const CSeq_data&  in_seq1,
857                           TSeqPos           uBeginIdx1,
858                           TSeqPos           uLength1,
859                           const CSeq_data&  in_seq2,
860                           TSeqPos           uBeginIdx2,
861                           TSeqPos           uLength2)
862         const;
863 
864     TSeqPos AppendNcbieaa(CSeq_data*          out_seq,
865                           const CSeq_data&  in_seq1,
866                           TSeqPos           uBeginIdx1,
867                           TSeqPos           uLength1,
868                           const CSeq_data&  in_seq2,
869                           TSeqPos           uBeginIdx2,
870                           TSeqPos           uLength2)
871         const;
872 
873     TSeqPos AppendNcbistdaa(CSeq_data*        out_seq,
874                           const CSeq_data&  in_seq1,
875                           TSeqPos           uBeginIdx1,
876                           TSeqPos           uLength1,
877                           const CSeq_data&  in_seq2,
878                           TSeqPos           uBeginIdx2,
879                           TSeqPos           uLength2)
880         const;
881 
882     TSeqPos AppendIupacaa(CSeq_data*          out_seq,
883                           const CSeq_data&  in_seq1,
884                           TSeqPos           uBeginIdx1,
885                           TSeqPos           uLength1,
886                           const CSeq_data&  in_seq2,
887                           TSeqPos           uBeginIdx2,
888                           TSeqPos           uLength2)
889         const;
890 
891     void x_GetSeqFromSeqData(const CSeq_data& data,
892                              const string** str,
893                              const vector<char>** vec)
894         const;
895     void x_GetSeqFromSeqData(CSeq_data& data,
896                              string** str,
897                              vector<char>** vec)
898         const;
899 };
900 
901 
902 static CSafeStatic<CSeqportUtil_implementation> sx_Implementation;
903 
x_GetImplementation(void)904 CSeqportUtil_implementation& CSeqportUtil::x_GetImplementation(void)
905 {
906     return *sx_Implementation;
907 }
908 
909 
910 
911 
912 /////////////////////////////////////////////////////////////////////////////
913 //  PUBLIC (static wrappers to CSeqportUtil_implementation public methods)::
914 //
915 
916 
Convert(const CSeq_data & in_seq,CSeq_data * out_seq,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength,bool bAmbig,CRandom::TValue seed)917 TSeqPos CSeqportUtil::Convert
918 (const CSeq_data&     in_seq,
919  CSeq_data*           out_seq,
920  CSeq_data::E_Choice  to_code,
921  TSeqPos              uBeginIdx,
922  TSeqPos              uLength,
923  bool                 bAmbig,
924  CRandom::TValue      seed)
925 {
926     return x_GetImplementation().Convert
927         (in_seq, out_seq, to_code, uBeginIdx, uLength, bAmbig, seed,
928          0, 0, 0);
929 }
930 
ConvertWithBlastAmbig(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig)931 TSeqPos CSeqportUtil::ConvertWithBlastAmbig
932 (const CSeq_data&     in_seq,
933  CSeq_data*           out_seq,
934  TSeqPos              uBeginIdx,
935  TSeqPos              uLength,
936  TSeqPos              total_length,
937  TSeqPos*             out_seq_length,
938  vector<Uint4>*       blast_ambig)
939 {
940     return x_GetImplementation().Convert
941         (in_seq, out_seq, CSeq_data::e_Ncbi2na, uBeginIdx, uLength, true,
942          17734276, total_length, out_seq_length, blast_ambig);
943 }
944 
Pack(CSeq_data * in_seq,TSeqPos uLength)945 TSeqPos CSeqportUtil::Pack
946 (CSeq_data*   in_seq,
947  TSeqPos uLength)
948 {
949     return x_GetImplementation().Pack
950         (in_seq, uLength);
951 }
952 
953 
FastValidate(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength)954 bool CSeqportUtil::FastValidate
955 (const CSeq_data&   in_seq,
956  TSeqPos            uBeginIdx,
957  TSeqPos            uLength)
958 {
959     return x_GetImplementation().FastValidate
960         (in_seq, uBeginIdx, uLength);
961 }
962 
963 
Validate(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength)964 void CSeqportUtil::Validate
965 (const CSeq_data&   in_seq,
966  vector<TSeqPos>*   badIdx,
967  TSeqPos            uBeginIdx,
968  TSeqPos            uLength)
969 {
970     x_GetImplementation().Validate
971         (in_seq, badIdx, uBeginIdx, uLength);
972 }
973 
974 
GetAmbigs(const CSeq_data & in_seq,CSeq_data * out_seq,vector<TSeqPos> * out_indices,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength)975 TSeqPos CSeqportUtil::GetAmbigs
976 (const CSeq_data&     in_seq,
977  CSeq_data*           out_seq,
978  vector<TSeqPos>*     out_indices,
979  CSeq_data::E_Choice  to_code,
980  TSeqPos              uBeginIdx,
981  TSeqPos              uLength)
982 {
983     return x_GetImplementation().GetAmbigs
984         (in_seq, out_seq, out_indices, to_code, uBeginIdx, uLength);
985 }
986 
987 
GetCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength)988 TSeqPos CSeqportUtil::GetCopy
989 (const CSeq_data&   in_seq,
990  CSeq_data*         out_seq,
991  TSeqPos            uBeginIdx,
992  TSeqPos            uLength)
993 {
994     return x_GetImplementation().GetCopy
995         (in_seq, out_seq, uBeginIdx, uLength);
996 }
997 
998 
999 
Keep(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength)1000 TSeqPos CSeqportUtil::Keep
1001 (CSeq_data*   in_seq,
1002  TSeqPos      uBeginIdx,
1003  TSeqPos      uLength)
1004 {
1005     return x_GetImplementation().Keep
1006         (in_seq, uBeginIdx, uLength);
1007 }
1008 
1009 
Append(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2)1010 TSeqPos CSeqportUtil::Append
1011 (CSeq_data*         out_seq,
1012  const CSeq_data&   in_seq1,
1013  TSeqPos            uBeginIdx1,
1014  TSeqPos            uLength1,
1015  const CSeq_data&   in_seq2,
1016  TSeqPos            uBeginIdx2,
1017  TSeqPos            uLength2)
1018 {
1019     return x_GetImplementation().Append
1020         (out_seq,
1021          in_seq1, uBeginIdx1, uLength1, in_seq2, uBeginIdx2, uLength2);
1022 }
1023 
1024 
Complement(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength)1025 TSeqPos CSeqportUtil::Complement
1026 (CSeq_data*   in_seq,
1027  TSeqPos      uBeginIdx,
1028  TSeqPos      uLength)
1029 {
1030     return x_GetImplementation().Complement
1031         (in_seq, uBeginIdx, uLength);
1032 }
1033 
1034 
Complement(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength)1035 TSeqPos CSeqportUtil::Complement
1036 (const CSeq_data&   in_seq,
1037  CSeq_data*         out_seq,
1038  TSeqPos            uBeginIdx,
1039  TSeqPos            uLength)
1040 {
1041     return x_GetImplementation().Complement
1042         (in_seq, out_seq, uBeginIdx, uLength);
1043 }
1044 
1045 
Reverse(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength)1046 TSeqPos CSeqportUtil::Reverse
1047 (CSeq_data*   in_seq,
1048  TSeqPos      uBeginIdx,
1049  TSeqPos      uLength)
1050 {
1051     return x_GetImplementation().Reverse
1052         (in_seq, uBeginIdx, uLength);
1053 }
1054 
1055 
Reverse(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength)1056 TSeqPos CSeqportUtil::Reverse
1057 (const CSeq_data&  in_seq,
1058  CSeq_data*        out_seq,
1059  TSeqPos           uBeginIdx,
1060  TSeqPos           uLength)
1061 {
1062     return x_GetImplementation().Reverse
1063         (in_seq, out_seq, uBeginIdx, uLength);
1064 }
1065 
1066 
ReverseComplement(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength)1067 TSeqPos CSeqportUtil::ReverseComplement
1068 (CSeq_data*  in_seq,
1069  TSeqPos     uBeginIdx,
1070  TSeqPos     uLength)
1071 {
1072     return x_GetImplementation().ReverseComplement
1073         (in_seq, uBeginIdx, uLength);
1074 }
1075 
1076 
ReverseComplement(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength)1077 TSeqPos CSeqportUtil::ReverseComplement
1078 (const CSeq_data&  in_seq,
1079  CSeq_data*        out_seq,
1080  TSeqPos           uBeginIdx,
1081  TSeqPos           uLength)
1082 {
1083     return x_GetImplementation().ReverseComplement
1084         (in_seq, out_seq, uBeginIdx, uLength);
1085 }
1086 
1087 
GetIupacaa3(TIndex ncbistdaa)1088 const string& CSeqportUtil::GetIupacaa3(TIndex ncbistdaa)
1089 {
1090     return x_GetImplementation().GetIupacaa3(ncbistdaa);
1091 }
1092 
IsCodeAvailable(CSeq_data::E_Choice code_type)1093 bool CSeqportUtil::IsCodeAvailable(CSeq_data::E_Choice code_type)
1094 {
1095     return x_GetImplementation().IsCodeAvailable(code_type);
1096 }
1097 
IsCodeAvailable(ESeq_code_type code_type)1098 bool CSeqportUtil::IsCodeAvailable(ESeq_code_type code_type)
1099 {
1100     return x_GetImplementation().IsCodeAvailable(code_type);
1101 }
1102 
GetCodeIndexFromTo(CSeq_data::E_Choice code_type)1103 CSeqportUtil::TPair CSeqportUtil::GetCodeIndexFromTo
1104 (CSeq_data::E_Choice code_type)
1105 {
1106     return x_GetImplementation().GetCodeIndexFromTo(code_type);
1107 }
1108 
GetCodeIndexFromTo(ESeq_code_type code_type)1109 CSeqportUtil::TPair CSeqportUtil::GetCodeIndexFromTo
1110 (ESeq_code_type code_type)
1111 {
1112     return x_GetImplementation().GetCodeIndexFromTo(code_type);
1113 }
1114 
GetCode(CSeq_data::E_Choice code_type,TIndex idx)1115 const string& CSeqportUtil::GetCode
1116 (CSeq_data::E_Choice code_type,
1117  TIndex              idx)
1118 {
1119     return x_GetImplementation().GetCodeOrName(code_type, idx, true);
1120 }
1121 
GetCode(ESeq_code_type code_type,TIndex idx)1122 const string& CSeqportUtil::GetCode
1123 (ESeq_code_type code_type,
1124  TIndex         idx)
1125 {
1126     return x_GetImplementation().GetCodeOrName(code_type, idx, true);
1127 }
1128 
GetName(CSeq_data::E_Choice code_type,TIndex idx)1129 const string& CSeqportUtil::GetName
1130 (CSeq_data::E_Choice code_type,
1131  TIndex              idx)
1132 {
1133     return x_GetImplementation().GetCodeOrName(code_type, idx, false);
1134 }
1135 
GetName(ESeq_code_type code_type,TIndex idx)1136 const string& CSeqportUtil::GetName
1137 (ESeq_code_type code_type,
1138  TIndex         idx)
1139 {
1140     return x_GetImplementation().GetCodeOrName(code_type, idx, false);
1141 }
1142 
GetIndex(CSeq_data::E_Choice code_type,const string & code)1143 CSeqportUtil::TIndex CSeqportUtil::GetIndex
1144 (CSeq_data::E_Choice code_type,
1145  const string&       code)
1146 {
1147     return x_GetImplementation().GetIndex(code_type, code);
1148 }
1149 
GetIndex(ESeq_code_type code_type,const string & code)1150 CSeqportUtil::TIndex CSeqportUtil::GetIndex
1151 (ESeq_code_type code_type,
1152  const string&  code)
1153 {
1154     return x_GetImplementation().GetIndex(code_type, code);
1155 }
1156 
GetIndexComplement(CSeq_data::E_Choice code_type,TIndex idx)1157 CSeqportUtil::TIndex CSeqportUtil::GetIndexComplement
1158 (CSeq_data::E_Choice code_type,
1159  TIndex        idx)
1160 {
1161     return x_GetImplementation().GetIndexComplement(code_type, idx);
1162 }
1163 
GetIndexComplement(ESeq_code_type code_type,TIndex idx)1164 CSeqportUtil::TIndex CSeqportUtil::GetIndexComplement
1165 (ESeq_code_type code_type,
1166  TIndex         idx)
1167 {
1168     return x_GetImplementation().GetIndexComplement(code_type, idx);
1169 }
1170 
GetMapToIndex(CSeq_data::E_Choice from_type,CSeq_data::E_Choice to_type,TIndex from_idx)1171 CSeqportUtil::TIndex CSeqportUtil::GetMapToIndex
1172 (CSeq_data::E_Choice from_type,
1173  CSeq_data::E_Choice to_type,
1174  TIndex              from_idx)
1175 {
1176     return x_GetImplementation().GetMapToIndex(from_type, to_type, from_idx);
1177 }
1178 
GetMapToIndex(ESeq_code_type from_type,ESeq_code_type to_type,TIndex from_idx)1179 CSeqportUtil::TIndex CSeqportUtil::GetMapToIndex
1180 (ESeq_code_type from_type,
1181  ESeq_code_type to_type,
1182  TIndex         from_idx)
1183 {
1184     return x_GetImplementation().GetMapToIndex(from_type, to_type, from_idx);
1185 }
1186 
CSeqportUtil_implementation()1187 CSeqportUtil_implementation::CSeqportUtil_implementation()
1188 {
1189 
1190     // Initialize m_SeqCodeSet
1191     m_SeqCodeSet = Init();
1192 
1193     // Initialize code tables
1194     m_Iupacna = InitCodes(eSeq_code_type_iupacna);
1195 
1196     m_Ncbieaa = InitCodes(eSeq_code_type_ncbieaa);
1197 
1198     m_Ncbistdaa = InitCodes(eSeq_code_type_ncbistdaa);
1199 
1200     m_Iupacaa = InitCodes(eSeq_code_type_iupacaa);
1201 
1202 
1203     // Initialize na complement tables
1204     m_Iupacna_complement = InitIupacnaComplement();
1205 
1206     m_Ncbi2naComplement = InitNcbi2naComplement();
1207 
1208     m_Ncbi4naComplement = InitNcbi4naComplement();
1209 
1210 
1211 
1212     // Initialize na reverse tables
1213     m_Ncbi2naRev = InitNcbi2naRev();
1214 
1215     m_Ncbi4naRev = InitNcbi4naRev();
1216 
1217 
1218     // Initialize map tables
1219 
1220     m_Ncbi2naIupacna = InitMaps(eSeq_code_type_ncbi2na,
1221                                 eSeq_code_type_iupacna);
1222 
1223     m_Ncbi2naNcbi4na = InitMaps(eSeq_code_type_ncbi2na,
1224                                 eSeq_code_type_ncbi4na);
1225 
1226     m_Ncbi4naIupacna = InitMaps(eSeq_code_type_ncbi4na,
1227                                 eSeq_code_type_iupacna);
1228 
1229     m_IupacnaNcbi2na = InitMaps(eSeq_code_type_iupacna,
1230                                 eSeq_code_type_ncbi2na);
1231 
1232     m_IupacnaNcbi4na = InitMaps(eSeq_code_type_iupacna,
1233                                 eSeq_code_type_ncbi4na);
1234 
1235     m_Ncbi4naNcbi2na = InitMaps(eSeq_code_type_ncbi4na,
1236                                 eSeq_code_type_ncbi2na);
1237 
1238     m_IupacaaNcbieaa = InitMaps(eSeq_code_type_iupacaa,
1239                                 eSeq_code_type_ncbieaa);
1240 
1241     m_NcbieaaIupacaa = InitMaps(eSeq_code_type_ncbieaa,
1242                                 eSeq_code_type_iupacaa);
1243 
1244     m_IupacaaNcbistdaa = InitMaps(eSeq_code_type_iupacaa,
1245                                   eSeq_code_type_ncbistdaa);
1246 
1247     m_NcbieaaNcbistdaa = InitMaps(eSeq_code_type_ncbieaa,
1248                                   eSeq_code_type_ncbistdaa);
1249 
1250     m_NcbistdaaNcbieaa = InitMaps(eSeq_code_type_ncbistdaa,
1251                                   eSeq_code_type_ncbieaa);
1252 
1253     m_NcbistdaaIupacaa = InitMaps(eSeq_code_type_ncbistdaa,
1254                                   eSeq_code_type_iupacaa);
1255 
1256     // Initialize fast conversion tables
1257     //m_FastNcbi2naIupacna = InitFastNcbi2naIupacna();
1258     //m_FastNcbi2naNcbi4na = InitFastNcbi2naNcbi4na();
1259     //m_FastNcbi4naIupacna = InitFastNcbi4naIupacna();
1260     m_FastIupacnaNcbi2na = InitFastIupacnaNcbi2na();
1261     m_FastIupacnaNcbi4na = InitFastIupacnaNcbi4na();
1262     m_FastNcbi4naNcbi2na = InitFastNcbi4naNcbi2na();
1263 
1264     // Initialize tables for conversion of index to codes or names
1265     InitIndexCodeName();
1266 
1267     // Initialize m_Masks used for random ambiguity resolution
1268     m_Masks = CSeqportUtil_implementation::InitMasks();
1269 
1270     // Initialize m_DetectAmbigNcbi4naNcbi2na used for ambiguity
1271     // detection and reporting
1272     m_DetectAmbigNcbi4naNcbi2na = InitAmbigNcbi4naNcbi2na();
1273 
1274     // Initialize m_DetectAmbigIupacnaNcbi2na used for ambiguity detection
1275     // and reporting
1276     m_DetectAmbigIupacnaNcbi2na = InitAmbigIupacnaNcbi2na();
1277 
1278 }
1279 
1280 // Destructor. All memory allocated on the
1281 // free store is wrapped in smart pointers.
1282 // Therefore, the destructor does not need
1283 // to deallocate memory.
~CSeqportUtil_implementation()1284 CSeqportUtil_implementation::~CSeqportUtil_implementation()
1285 {
1286     return;
1287 }
1288 
1289 
1290 /////////////////////////////////////////////////////////////////////////////
1291 //  PRIVATE::
1292 //
1293 
1294 
1295 // Helper function to initialize m_SeqCodeSet from sm_StrAsnData
Init()1296 CRef<CSeq_code_set> CSeqportUtil_implementation::Init()
1297 {
1298     // Compose a long-long string
1299     string str;
1300     for (size_t i = 0;  sm_StrAsnData[i];  i++) {
1301         str += sm_StrAsnData[i];
1302     }
1303 
1304     // Create an in memory stream on sm_StrAsnData
1305     CNcbiIstrstream is(str);
1306 
1307     auto_ptr<CObjectIStream>
1308         asn_codes_in(CObjectIStream::Open(eSerial_AsnText, is));
1309 
1310     // Create a CSeq_code_set
1311     CRef<CSeq_code_set> ptr_seq_code_set(new CSeq_code_set());
1312 
1313     // Initialize the newly created CSeq_code_set
1314     *asn_codes_in >> *ptr_seq_code_set;
1315 
1316     // Return a newly created CSeq_code_set
1317     return ptr_seq_code_set;
1318 }
1319 
1320 
1321 // Function to initialize code tables
1322 CRef<CSeqportUtil_implementation::CCode_table>
InitCodes(ESeq_code_type code_type)1323 CSeqportUtil_implementation::InitCodes(ESeq_code_type code_type)
1324 {
1325     // Get list of code tables
1326     const list<CRef<CSeq_code_table> >& code_list = m_SeqCodeSet->GetCodes();
1327 
1328     // Get table for code_type
1329     list<CRef<CSeq_code_table> >::const_iterator i_ct;
1330     for(i_ct = code_list.begin(); i_ct != code_list.end(); ++i_ct)
1331         if((*i_ct)->GetCode() == code_type)
1332             break;
1333 
1334 
1335     if(i_ct == code_list.end())
1336         throw runtime_error("Requested code table not found");
1337 
1338     // Get table data
1339     const list<CRef<CSeq_code_table::C_E> >& table_data = (*i_ct)->GetTable();
1340     SIZE_TYPE size = table_data.size();
1341     int start_at = (*i_ct)->GetStart_at();
1342     CRef<CCode_table> codeTable(new CCode_table(size, start_at));
1343 
1344     // Initialize codeTable to 255
1345     for(int i=0; i<256; i++)
1346         codeTable->m_Table[i] = '\xff';
1347 
1348     // Copy table data to codeTable
1349     int nIdx = start_at;
1350     list<CRef<CSeq_code_table::C_E> >::const_iterator i_td;
1351     for(i_td = table_data.begin(); i_td != table_data.end(); ++i_td) {
1352         codeTable->m_Table[nIdx] =  *((*i_td)->GetSymbol().c_str());
1353         if(codeTable->m_Table[nIdx] == '\x00')
1354             codeTable->m_Table[nIdx++] = '\xff';
1355         else
1356             nIdx++;
1357     }
1358 
1359     // Return codeTable
1360     return codeTable;
1361 }
1362 
1363 
1364 // Function to initialize iupacna complement table
1365 CRef<CSeqportUtil_implementation::CCode_comp>
InitIupacnaComplement()1366 CSeqportUtil_implementation::InitIupacnaComplement()
1367 {
1368 
1369     // Get list of code tables
1370     const list<CRef<CSeq_code_table> >& code_list = m_SeqCodeSet->GetCodes();
1371 
1372     // Get table for code_type iupacna
1373     list<CRef<CSeq_code_table> >::const_iterator i_ct;
1374     for(i_ct = code_list.begin(); i_ct != code_list.end(); ++i_ct)
1375         if((*i_ct)->GetCode() == eSeq_code_type_iupacna)
1376             break;
1377 
1378 
1379     if(i_ct == code_list.end())
1380         throw runtime_error("Code table for Iupacna not found");
1381 
1382     // Check that complements are set
1383     if(!(*i_ct)->IsSetComps())
1384         throw runtime_error("Complement data is not set for iupacna table");
1385 
1386     // Get complement data, start at and size of complement data
1387     const list<int>& comp_data = (*i_ct)->GetComps();
1388     int start_at = (*i_ct)->GetStart_at();
1389 
1390     // Allocate memory for complement data
1391     CRef<CCode_comp> compTable(new CCode_comp(256, start_at));
1392 
1393     // Initialize compTable to 255 for illegal codes
1394     for(unsigned int i = 0; i<256; i++)
1395         compTable->m_Table[i] = (char) 255;
1396 
1397     // Loop trhough the complement data and set compTable
1398     list<int>::const_iterator i_comp;
1399     unsigned int nIdx = start_at;
1400     for(i_comp = comp_data.begin(); i_comp != comp_data.end(); ++i_comp)
1401         compTable->m_Table[nIdx++] = (*i_comp);
1402 
1403     // Return the complement data
1404     return compTable;
1405 
1406 }
1407 
1408 
1409 // Function to initialize ncbi2na complement table
1410 CRef<CSeqportUtil_implementation::CCode_comp>
InitNcbi2naComplement()1411 CSeqportUtil_implementation::InitNcbi2naComplement()
1412 {
1413 
1414     // Get list of code tables
1415     const list<CRef<CSeq_code_table> >& code_list = m_SeqCodeSet->GetCodes();
1416 
1417     // Get table for code_type ncbi2na
1418     list<CRef<CSeq_code_table> >::const_iterator i_ct;
1419     for(i_ct = code_list.begin(); i_ct != code_list.end(); ++i_ct)
1420         if((*i_ct)->GetCode() == eSeq_code_type_ncbi2na)
1421             break;
1422 
1423     if(i_ct == code_list.end())
1424         throw runtime_error("Code table for Iupacna not found");
1425 
1426     // Check that complements are set
1427     if(!(*i_ct)->IsSetComps())
1428         throw runtime_error("Complement data is not set for ncbi2na table");
1429 
1430     // Get complement data, start at and size of complement data
1431     const list<int>& comp_data = (*i_ct)->GetComps();
1432     int start_at = (*i_ct)->GetStart_at();
1433 
1434     // Allocate memory for complement data
1435     CRef<CCode_comp> compTable(new CCode_comp(256, start_at));
1436 
1437     // Put complement data in an array
1438     char compArray[4];
1439     int nIdx = start_at;
1440     list<int>::const_iterator i_comp;
1441     for(i_comp = comp_data.begin(); i_comp != comp_data.end(); ++i_comp)
1442         compArray[nIdx++] = (*i_comp);
1443 
1444     // Set compTable
1445     for(unsigned int i = 0; i < 4; i++)
1446         for(unsigned int j = 0; j < 4; j++)
1447             for(unsigned int k = 0; k < 4; k++)
1448                 for(unsigned int l = 0; l < 4; l++)
1449                     {
1450                         nIdx = i<<6 | j<<4 | k<<2 | l;
1451                         char c1 = compArray[i] << 6;
1452                         char c2 = compArray[j] << 4;
1453                         char c3 = compArray[k] << 2;
1454                         char c4 = compArray[l];
1455                         compTable->m_Table[nIdx] = c1 | c2 | c3 | c4;
1456                     }
1457 
1458     // Return complement data
1459     return compTable;
1460 
1461 }
1462 
1463 
1464 // Function to initialize ncbi4na complement table
1465 CRef<CSeqportUtil_implementation::CCode_comp>
InitNcbi4naComplement()1466 CSeqportUtil_implementation::InitNcbi4naComplement()
1467 {
1468 
1469     // Get list of code tables
1470     const list<CRef<CSeq_code_table> >& code_list = m_SeqCodeSet->GetCodes();
1471 
1472     // Get table for code_type ncbi2na
1473     list<CRef<CSeq_code_table> >::const_iterator i_ct;
1474     for(i_ct = code_list.begin(); i_ct != code_list.end(); ++i_ct)
1475         if((*i_ct)->GetCode() == eSeq_code_type_ncbi4na)
1476             break;
1477 
1478     if(i_ct == code_list.end())
1479         throw runtime_error("Code table for Iupacna not found");
1480 
1481     // Check that complements are set
1482     if(!(*i_ct)->IsSetComps())
1483         throw runtime_error("Complement data is not set for iupacna table");
1484 
1485     // Get complement data, start at and size of complement data
1486     const list<int>& comp_data = (*i_ct)->GetComps();
1487     int start_at = (*i_ct)->GetStart_at();
1488 
1489     // Allocate memory for complement data
1490     CRef<CCode_comp> compTable(new CCode_comp(256, start_at));
1491 
1492 
1493     // Put complement data in an array
1494     char compArray[16];
1495     int nIdx = start_at;
1496     list<int>::const_iterator i_comp;
1497     for(i_comp = comp_data.begin(); i_comp != comp_data.end(); ++i_comp)
1498         compArray[nIdx++] = (*i_comp);
1499 
1500     // Set compTable
1501     for(unsigned int i = 0; i<16; i++)
1502         for(unsigned int j = 0; j < 16; j++)
1503             {
1504                 nIdx = i<<4 | j;
1505                 char c1 = compArray[i] << 4;
1506                 char c2 = compArray[j];
1507                 compTable->m_Table[nIdx] = c1 | c2;
1508             }
1509 
1510     // Return complement data
1511     return compTable;
1512 
1513 }
1514 
1515 
1516 // Function to initialize m_Ncbi2naRev
InitNcbi2naRev()1517 CRef<CSeqportUtil_implementation::CCode_rev> CSeqportUtil_implementation::InitNcbi2naRev()
1518 {
1519 
1520     // Allocate memory for reverse table
1521     CRef<CCode_rev> revTable(new CCode_rev(256, 0));
1522 
1523     // Initialize table used to reverse a byte.
1524     for(unsigned int i = 0; i < 4; i++)
1525         for(unsigned int j = 0; j < 4; j++)
1526             for(unsigned int k = 0; k < 4; k++)
1527                 for(unsigned int l = 0; l < 4; l++)
1528                     revTable->m_Table[64*i + 16*j + 4*k + l] =
1529                         64*l + 16*k + 4*j +i;
1530 
1531     // Return the reverse table
1532     return revTable;
1533 }
1534 
1535 
1536 // Function to initialize m_Ncbi4naRev
InitNcbi4naRev()1537 CRef<CSeqportUtil_implementation::CCode_rev> CSeqportUtil_implementation::InitNcbi4naRev()
1538 {
1539 
1540     // Allocate memory for reverse table
1541     CRef<CCode_rev> revTable(new CCode_rev(256, 0));
1542 
1543     // Initialize table used to reverse a byte.
1544     for(unsigned int i = 0; i < 16; i++)
1545         for(unsigned int j = 0; j < 16; j++)
1546             revTable->m_Table[16*i + j] = 16*j + i;
1547 
1548     // Return the reverse table
1549     return revTable;
1550 }
1551 
1552 
1553 
1554 // Function to initialize map tables
1555 CRef<CSeqportUtil_implementation::CMap_table>
InitMaps(ESeq_code_type from_type,ESeq_code_type to_type)1556 CSeqportUtil_implementation::InitMaps
1557 (ESeq_code_type from_type,
1558  ESeq_code_type to_type)
1559 {
1560 
1561     // Get list of map tables
1562     const list< CRef< CSeq_map_table > >& map_list = m_SeqCodeSet->GetMaps();
1563 
1564     // Get requested map table
1565     list<CRef<CSeq_map_table> >::const_iterator i_mt;
1566     for(i_mt = map_list.begin(); i_mt != map_list.end(); ++i_mt)
1567         if((*i_mt)->GetFrom() == from_type && (*i_mt)->GetTo() == to_type)
1568             break;
1569 
1570     if(i_mt == map_list.end())
1571         throw runtime_error("Requested map table not found");
1572 
1573     // Get the map table
1574     const list<int>& table_data = (*i_mt)->GetTable();
1575 
1576     // Create a map table reference
1577     SIZE_TYPE size = table_data.size();
1578     int start_at = (*i_mt)->GetStart_at();
1579     CRef<CMap_table> mapTable(new CMap_table(size,start_at));
1580 
1581     // Copy the table data to mapTable
1582     int nIdx = start_at;
1583     list<int>::const_iterator i_td;
1584     for(i_td = table_data.begin(); i_td != table_data.end(); ++i_td)
1585         {
1586             mapTable->m_Table[nIdx++] = *i_td;
1587         }
1588 
1589     return mapTable;
1590 }
1591 
1592 
1593 // Functions to initialize fast conversion tables
1594 // Function to initialize FastNcib2naIupacna
1595 /*
1596 CRef<CSeqportUtil_implementation::CFast_table4> CSeqportUtil_implementation::InitFastNcbi2naIupacna()
1597 {
1598 
1599     CRef<CFast_table4> fastTable(new CFast_table4(256,0));
1600     unsigned char i,j,k,l;
1601     for(i = 0; i < 4; i++)
1602         for(j = 0; j < 4; j++)
1603             for(k = 0; k < 4; k++)
1604                 for(l = 0; l < 4; l++)
1605                     {
1606                         unsigned char aByte = (i<<6) | (j<<4) | (k<<2) | l;
1607                         char chi = m_Ncbi2naIupacna->m_Table[i];
1608                         char chj = m_Ncbi2naIupacna->m_Table[j];
1609                         char chk = m_Ncbi2naIupacna->m_Table[k];
1610                         char chl = m_Ncbi2naIupacna->m_Table[l];
1611 
1612                         // Note high order bit pair corresponds to low order
1613                         // byte etc., on Unix machines.
1614                         char *pt =
1615                             reinterpret_cast<char*>(&fastTable->m_Table[aByte]);
1616                         *(pt++) = chi;
1617                         *(pt++) = chj;
1618                         *(pt++) = chk;
1619                         *(pt) = chl;
1620                      }
1621     return fastTable;
1622 }
1623 */
1624 
1625 // Function to initialize FastNcib2naNcbi4na
InitFastNcbi2naNcbi4na()1626 CRef<CSeqportUtil_implementation::CFast_table2> CSeqportUtil_implementation::InitFastNcbi2naNcbi4na()
1627 {
1628 
1629     CRef<CFast_table2> fastTable(new CFast_table2(256,0));
1630     unsigned char i, j, k, l;
1631 
1632     for(i = 0; i < 4; i++)
1633         for(j = 0; j < 4; j++)
1634             for(k = 0; k < 4; k++)
1635                 for(l = 0; l < 4; l++) {
1636                     unsigned char aByte = (i<<6) | (j<<4) | (k<<2) | l;
1637                     unsigned char chi = m_Ncbi2naNcbi4na->m_Table[i];
1638                     unsigned char chj = m_Ncbi2naNcbi4na->m_Table[j];
1639                     unsigned char chk = m_Ncbi2naNcbi4na->m_Table[k];
1640                     unsigned char chl = m_Ncbi2naNcbi4na->m_Table[l];
1641                     char *pt =
1642 
1643                         reinterpret_cast<char*>(&fastTable->m_Table[aByte]);
1644                     *(pt++) = (chi << 4) | chj;
1645                     *pt = (chk << 4) | chl;
1646                 }
1647     return fastTable;
1648 }
1649 
1650 
1651 // Function to initialize FastNcib4naIupacna
InitFastNcbi4naIupacna()1652 CRef<CSeqportUtil_implementation::CFast_table2> CSeqportUtil_implementation::InitFastNcbi4naIupacna()
1653 {
1654 
1655     CRef<CFast_table2> fastTable(new CFast_table2(256,0));
1656     unsigned char i,j;
1657     for(i = 0; i < 16; i++)
1658         for(j = 0; j < 16; j++) {
1659             unsigned char aByte = (i<<4) | j;
1660             unsigned char chi = m_Ncbi4naIupacna->m_Table[i];
1661             unsigned char chj = m_Ncbi4naIupacna->m_Table[j];
1662 
1663             // Note high order nible corresponds to low order byte
1664             // etc., on Unix machines.
1665             char *pt = reinterpret_cast<char*>(&fastTable->m_Table[aByte]);
1666             *(pt++) = chi;
1667             *pt = chj;
1668         }
1669     return fastTable;
1670 }
1671 
1672 
1673 // Function to initialize m_FastIupacnancbi2na
InitFastIupacnaNcbi2na()1674 CRef<CSeqportUtil_implementation::CFast_4_1> CSeqportUtil_implementation::InitFastIupacnaNcbi2na()
1675 {
1676 
1677     int start_at = m_IupacnaNcbi2na->m_StartAt;
1678     int size = m_IupacnaNcbi2na->m_Size;
1679     CRef<CFast_4_1> fastTable(new CFast_4_1(4,0,256,0));
1680     for(int ch = 0; ch < 256; ch++) {
1681         if((ch >= start_at) && (ch < (start_at + size)))
1682             {
1683                 unsigned char uch = m_IupacnaNcbi2na->m_Table[ch];
1684                 uch &= '\x03';
1685                 for(unsigned int pos = 0; pos < 4; pos++)
1686                     fastTable->m_Table[pos][ch] = uch << (6-2*pos);
1687             }
1688         else
1689             for(unsigned int pos = 0; pos < 4; pos++)
1690                 fastTable->m_Table[pos][ch] = '\x00';
1691     }
1692     return fastTable;
1693 }
1694 
1695 
1696 // Function to initialize m_FastIupacnancbi4na
InitFastIupacnaNcbi4na()1697 CRef<CSeqportUtil_implementation::CFast_2_1> CSeqportUtil_implementation::InitFastIupacnaNcbi4na()
1698 {
1699 
1700     int start_at = m_IupacnaNcbi4na->m_StartAt;
1701     int size = m_IupacnaNcbi4na->m_Size;
1702     CRef<CFast_2_1> fastTable(new CFast_2_1(2,0,256,0));
1703     for(int ch = 0; ch < 256; ch++) {
1704         if((ch >= start_at) && (ch < (start_at + size)))
1705             {
1706                 unsigned char uch = m_IupacnaNcbi4na->m_Table[ch];
1707                 for(unsigned int pos = 0; pos < 2; pos++)
1708                     fastTable->m_Table[pos][ch] = uch << (4-4*pos);
1709             }
1710         else
1711             {
1712                 fastTable->m_Table[0][ch] = 0xF0;
1713                 fastTable->m_Table[1][ch] = 0x0F;
1714             }
1715     }
1716     return fastTable;
1717 }
1718 
1719 
1720 // Function to initialize m_FastNcbi4naNcbi2na
InitFastNcbi4naNcbi2na()1721 CRef<CSeqportUtil_implementation::CFast_2_1> CSeqportUtil_implementation::InitFastNcbi4naNcbi2na()
1722 {
1723 
1724     int start_at = m_Ncbi4naNcbi2na->m_StartAt;
1725     int size = m_Ncbi4naNcbi2na->m_Size;
1726     CRef<CFast_2_1> fastTable(new CFast_2_1(2,0,256,0));
1727     for(int n1 = 0; n1 < 16; n1++)
1728         for(int n2 = 0; n2 < 16; n2++) {
1729             int nIdx = 16*n1 + n2;
1730             unsigned char u1, u2;
1731             if((n1 >= start_at) && (n1 < start_at + size))
1732                 u1 = m_Ncbi4naNcbi2na->m_Table[n1] & 3;
1733             else
1734                 u1 = '\x00';
1735             if((n2 >= start_at) && (n2 < start_at + size))
1736                 u2 = m_Ncbi4naNcbi2na->m_Table[n2] & 3;
1737             else
1738                 u2 = '\x00';
1739             fastTable->m_Table[0][nIdx] = (u1<<6) | (u2<<4);
1740             fastTable->m_Table[1][nIdx] = (u1<<2) | u2;
1741         }
1742 
1743     return fastTable;
1744 }
1745 
1746 
1747 // Function to initialize m_IndexString and m_StringIndex
InitIndexCodeName()1748 void CSeqportUtil_implementation::InitIndexCodeName()
1749 {
1750     m_IndexString[kName].resize(kNumCodes);
1751     m_IndexString[kSymbol].resize(kNumCodes);
1752     m_IndexComplement.resize(kNumCodes);
1753     m_StringIndex.resize(kNumCodes);
1754     m_StartAt.resize(kNumCodes);
1755 
1756     bool found[kNumCodes];
1757     for (unsigned int ii = 0; ii < kNumCodes; ii++) {
1758         found[ii] = false;
1759     }
1760     ITERATE (CSeq_code_set::TCodes, it, m_SeqCodeSet->GetCodes()) {
1761         const ESeq_code_type& code = (*it)->GetCode();
1762         if (!found[code-1]) {
1763             found[code-1] = true;
1764             m_StartAt[code-1] = (*it)->IsSetStart_at() ?
1765                 (*it)->GetStart_at() : 0;
1766             TIndex i = m_StartAt[code-1];
1767             ITERATE(CSeq_code_table::TTable, is, (*it)->GetTable()) {
1768                 m_IndexString[kSymbol][code-1].push_back((*is)->GetSymbol());
1769                 m_IndexString[kName][code-1].push_back((*is)->GetName());
1770                 m_StringIndex[code-1].insert
1771                     (make_pair((*is)->GetSymbol(), i++));
1772             }
1773             if ( (*it)->IsSetComps() ) {
1774                 ITERATE (list<int>, ic, (*it)->GetComps()) {
1775                     m_IndexComplement[code-1].push_back(*ic);
1776                 }
1777             }
1778         }
1779     }
1780 
1781 
1782 }
1783 
1784 
1785 // Function to initialize m_Masks
InitMasks()1786 CRef<CSeqportUtil_implementation::SMasksArray> CSeqportUtil_implementation::InitMasks()
1787 {
1788 
1789     unsigned int i, j, uCnt;
1790     unsigned char cVal, cRslt;
1791     CRef<SMasksArray> aMask(new SMasksArray);
1792 
1793     // Initialize possible masks for converting ambiguous
1794     // ncbi4na bytes to unambiguous bytes
1795     static const unsigned char mask[16] = {
1796         0x11, 0x12, 0x14, 0x18,
1797         0x21, 0x22, 0x24, 0x28,
1798         0x41, 0x42, 0x44, 0x48,
1799         0x81, 0x82, 0x84, 0x88
1800     };
1801 
1802     static const unsigned char maskUpper[4] = { 0x10, 0x20, 0x40, 0x80 };
1803     static const unsigned char maskLower[4] = { 0x01, 0x02, 0x04, 0x08 };
1804 
1805     // Loop through possible ncbi4na bytes and
1806     // build masks that convert it to unambiguous na
1807     for(i = 0; i < 256; i++) {
1808         cVal = i;
1809         uCnt = 0;
1810 
1811         // Case where both upper and lower nible > 0
1812         if(((cVal & '\x0f') != 0) && ((cVal & '\xf0') != 0))
1813             for(j = 0; j < 16; j++) {
1814                 cRslt = cVal & mask[j];
1815                 if(cRslt == mask[j])
1816                     aMask->m_Table[i].cMask[uCnt++] = mask[j];
1817             }
1818 
1819         // Case where upper nible = 0 and lower nible > 0
1820         else if((cVal & '\x0f') != 0)
1821             for(j = 0; j < 4; j++)
1822                 {
1823                     cRslt = cVal & maskLower[j];
1824                     if(cRslt == maskLower[j])
1825                         aMask->m_Table[i].cMask[uCnt++] = maskLower[j];
1826                 }
1827 
1828 
1829         // Case where lower nible = 0 and upper nible > 0
1830         else if((cVal & '\xf0') != 0)
1831             for(j = 0; j < 4; j++)
1832                 {
1833                     cRslt = cVal & maskUpper[j];
1834                     if(cRslt == maskUpper[j])
1835                         aMask->m_Table[i].cMask[uCnt++] = maskUpper[j];
1836                 }
1837 
1838         // Both upper and lower nibles = 0
1839         else
1840             aMask->m_Table[i].cMask[uCnt++] = '\x00';
1841 
1842         // Number of distict masks for ncbi4na byte i
1843         aMask->m_Table[i].nMasks = uCnt;
1844 
1845         // Fill out the remainder of cMask array with copies
1846         // of first uCnt masks
1847         for(j = uCnt; j < 16 && uCnt > 0; j++)
1848             aMask->m_Table[i].cMask[j] = aMask->m_Table[i].cMask[j % uCnt];
1849 
1850     }
1851 
1852     return aMask;
1853 }
1854 
1855 
1856 // Function to initialize m_DetectAmbigNcbi4naNcbi2na used for
1857 // ambiguity detection
InitAmbigNcbi4naNcbi2na()1858 CRef<CSeqportUtil_implementation::CAmbig_detect> CSeqportUtil_implementation::InitAmbigNcbi4naNcbi2na()
1859 {
1860 
1861     unsigned char low, high, ambig;
1862 
1863     // Create am new CAmbig_detect object
1864     CRef<CAmbig_detect> ambig_detect(new CAmbig_detect(256,0));
1865 
1866     // Loop through low and high order nibles and assign
1867     // values as follows: 0 - no ambiguity, 1 - low order nible ambigiguous
1868     // 2 - high order ambiguous, 3 -- both high and low ambiguous.
1869 
1870     // Loop for low order nible
1871     for(low = 0; low < 16; low++) {
1872         // Determine if low order nible is ambiguous
1873         if((low == 1) || (low ==2) || (low == 4) || (low == 8))
1874             ambig = 0;  // Not ambiguous
1875         else
1876             ambig = 1;  // Ambiguous
1877 
1878         // Loop for high order nible
1879         for(high = 0; high < 16; high++) {
1880 
1881             // Determine if high order nible is ambiguous
1882             if((high != 1) && (high != 2) && (high != 4) && (high != 8))
1883                 ambig += 2;  // Ambiguous
1884 
1885             // Set ambiguity value
1886             ambig_detect->m_Table[16*high + low] = ambig;
1887 
1888             // Reset ambig
1889             ambig &= '\xfd';  // Set second bit to 0
1890         }
1891     }
1892 
1893     return ambig_detect;
1894 }
1895 
1896 
1897 // Function to initialize m_DetectAmbigIupacnaNcbi2na used for ambiguity
1898 // detection
InitAmbigIupacnaNcbi2na()1899 CRef<CSeqportUtil_implementation::CAmbig_detect> CSeqportUtil_implementation::InitAmbigIupacnaNcbi2na()
1900 {
1901 
1902     // Create am new CAmbig_detect object
1903     CRef<CAmbig_detect> ambig_detect(new CAmbig_detect(256,0));
1904 
1905     // 0 implies no ambiguity. 1 implies ambiguity
1906     // Initialize to 0
1907     for(unsigned int i = 0; i<256; i++)
1908         ambig_detect->m_Table[i] = 0;
1909 
1910     // Set iupacna characters that are ambiguous when converted
1911     // to ncib2na
1912     ambig_detect->m_Table[66] = 1; // B
1913     ambig_detect->m_Table[68] = 1; // D
1914     ambig_detect->m_Table[72] = 1; // H
1915     ambig_detect->m_Table[75] = 1; // K
1916     ambig_detect->m_Table[77] = 1; // M
1917     ambig_detect->m_Table[78] = 1; // N
1918     ambig_detect->m_Table[82] = 1; // R
1919     ambig_detect->m_Table[83] = 1; // S
1920     ambig_detect->m_Table[86] = 1; // V
1921     ambig_detect->m_Table[87] = 1; // W
1922     ambig_detect->m_Table[89] = 1; // Y
1923 
1924     return ambig_detect;
1925 }
1926 
1927 /*
1928 struct SSeqDataToSeqUtil
1929 {
1930     CSeq_data::E_Choice  seq_data_coding;
1931     CSeqConvert::TCoding seq_convert_coding;
1932 };
1933 
1934 
1935 static SSeqDataToSeqUtil s_SeqDataToSeqUtilMap[] = {
1936     { CSeq_data::e_Iupacna,   CSeqUtil::e_Iupacna },
1937     { CSeq_data::e_Iupacaa,   CSeqUtil::e_Iupacna },
1938     { CSeq_data::e_Ncbi2na,   CSeqUtil::e_Ncbi2na },
1939     { CSeq_data::e_Ncbi4na,   CSeqUtil::e_Ncbi4na },
1940     { CSeq_data::e_Ncbi8na,   CSeqUtil::e_Ncbi8na },
1941     { CSeq_data::e_Ncbi8aa,   CSeqUtil::e_Ncbi8aa },
1942     { CSeq_data::e_Ncbieaa,   CSeqUtil::e_Ncbieaa },
1943     { CSeq_data::e_Ncbistdaa, CSeqUtil::e_Ncbistdaa }
1944 };
1945 */
1946 
1947 static CSeqUtil::TCoding s_SeqDataToSeqUtil[] = {
1948     CSeqUtil::e_not_set,
1949     CSeqUtil::e_Iupacna,
1950     CSeqUtil::e_Iupacaa,
1951     CSeqUtil::e_Ncbi2na,
1952     CSeqUtil::e_Ncbi4na,
1953     CSeqUtil::e_Ncbi8na,
1954     CSeqUtil::e_not_set,
1955     CSeqUtil::e_Ncbi8aa,
1956     CSeqUtil::e_Ncbieaa,
1957     CSeqUtil::e_not_set,
1958     CSeqUtil::e_Ncbistdaa
1959 };
1960 
1961 
1962 // Convert from one coding scheme to another. The following
1963 // 12 conversions are supported: ncbi2na<=>ncbi4na;
1964 // ncbi2na<=>iupacna; ncbi4na<=>iupacna; ncbieaa<=>ncbistdaa;
1965 // ncbieaa<=>iupacaa; ncbistdaa<=>iupacaa. Convert is
1966 // really just a dispatch function--it calls the appropriate
1967 // priviate conversion function.
x_ConvertAmbig(const CSeq_data & in_seq,CSeq_data * out_seq,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength,CRandom::TValue seed,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig) const1968 TSeqPos CSeqportUtil_implementation::x_ConvertAmbig
1969 (const CSeq_data&      in_seq,
1970  CSeq_data*            out_seq,
1971  CSeq_data::E_Choice   to_code,
1972  TSeqPos               uBeginIdx,
1973  TSeqPos               uLength,
1974  CRandom::TValue       seed,
1975  TSeqPos               total_length,
1976  TSeqPos*              out_seq_length,
1977  vector<Uint4>*        blast_ambig)
1978     const
1979 {
1980     CSeq_data::E_Choice from_code = in_seq.Which();
1981 
1982     if(to_code == CSeq_data::e_not_set || from_code == CSeq_data::e_not_set)
1983         throw std::runtime_error("to_code or from_code not set");
1984 
1985     if ( to_code != CSeq_data::e_Ncbi2na ) {
1986         throw std::runtime_error("to_code is not Ncbi2na");
1987     }
1988 
1989     switch (from_code) {
1990     case CSeq_data::e_Iupacna:
1991         return MapIupacnaToNcbi2na(in_seq, out_seq, uBeginIdx, uLength, true,
1992                                    seed, total_length, out_seq_length,
1993                                    blast_ambig);
1994     case CSeq_data::e_Ncbi4na:
1995         return MapNcbi4naToNcbi2na(in_seq, out_seq, uBeginIdx, uLength, true,
1996                                    seed, total_length, out_seq_length,
1997                                    blast_ambig);
1998     default:
1999         throw runtime_error("Requested conversion not implemented");
2000     }
2001 }
2002 
2003 // Convert from one coding scheme to another. The following
2004 // 12 conversions are supported: ncbi2na<=>ncbi4na;
2005 // ncbi2na<=>iupacna; ncbi4na<=>iupacna; ncbieaa<=>ncbistdaa;
2006 // ncbieaa<=>iupacaa; ncbistdaa<=>iupacaa. Convert is
2007 // really just a dispatch function--it calls the appropriate
2008 // priviate conversion function.
Convert(const CSeq_data & in_seq,CSeq_data * out_seq,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength,bool bAmbig,CRandom::TValue seed,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig) const2009 TSeqPos CSeqportUtil_implementation::Convert
2010 (const CSeq_data&      in_seq,
2011  CSeq_data*            out_seq,
2012  CSeq_data::E_Choice   to_code,
2013  TSeqPos               uBeginIdx,
2014  TSeqPos               uLength,
2015  bool                  bAmbig,
2016  CRandom::TValue       seed,
2017  TSeqPos               total_length,
2018  TSeqPos*              out_seq_length,
2019  vector<Uint4>*        blast_ambig)
2020     const
2021 {
2022     CSeq_data::E_Choice from_code = in_seq.Which();
2023 
2024     // adjust uLength
2025     if ( uLength == 0 ) {
2026         uLength = numeric_limits<TSeqPos>::max();
2027     }
2028 
2029     if(to_code == CSeq_data::e_not_set || from_code == CSeq_data::e_not_set) {
2030         throw std::runtime_error("to_code or from_code not set");
2031     }
2032     if ( s_SeqDataToSeqUtil[to_code]  == CSeqUtil::e_not_set  ||
2033          s_SeqDataToSeqUtil[from_code] == CSeqUtil::e_not_set ) {
2034         throw runtime_error("Requested conversion not implemented");
2035     }
2036 
2037     // Note: for now use old code to convert to ncbi2na with random
2038     // conversion of ambiguous characters.
2039     if ( (to_code == CSeq_data::e_Ncbi2na)  &&  (bAmbig == true) ) {
2040         return x_ConvertAmbig(in_seq, out_seq, to_code, uBeginIdx, uLength,
2041                               seed, total_length, out_seq_length, blast_ambig);
2042     }
2043 
2044     const string* in_str = 0;
2045     const vector<char>* in_vec = 0;
2046 
2047     x_GetSeqFromSeqData(in_seq, &in_str, &in_vec);
2048 
2049     TSeqPos retval = 0;
2050     if ( in_str != 0 ) {
2051         string result;
2052         retval = CSeqConvert::Convert(*in_str, s_SeqDataToSeqUtil[from_code],
2053                                       uBeginIdx, uLength,
2054                                       result, s_SeqDataToSeqUtil[to_code]);
2055         CSeq_data temp(result, to_code);
2056         out_seq->Assign(temp);
2057     } else if ( in_vec != 0 ) {
2058         vector<char> result;
2059         retval = CSeqConvert::Convert(*in_vec, s_SeqDataToSeqUtil[from_code],
2060                                       uBeginIdx, uLength,
2061                                       result, s_SeqDataToSeqUtil[to_code]);
2062         CSeq_data temp(result, to_code);
2063         out_seq->Assign(temp);
2064     }
2065     return retval;
2066 }
2067 
2068 
2069 // Provide maximum packing without loss of information
Pack(CSeq_data * in_seq,TSeqPos uLength) const2070 TSeqPos CSeqportUtil_implementation::Pack
2071 (CSeq_data*   in_seq,
2072  TSeqPos      uLength)
2073     const
2074 {
2075     _ASSERT(in_seq != 0);
2076 
2077     CSeq_data::E_Choice from_code = in_seq->Which();
2078     _ASSERT(from_code != CSeq_data::e_not_set);
2079 
2080     if ( s_SeqDataToSeqUtil[from_code] == CSeqUtil::e_not_set ) {
2081         throw runtime_error("Unable tp pack requested coding");
2082     }
2083 
2084 
2085     // nothing to pack for proteins
2086     switch ( from_code ) {
2087     case CSeq_data::e_Iupacaa:
2088         return in_seq->GetIupacaa().Get().size();
2089     case CSeq_data::e_Ncbi8aa:
2090         return in_seq->GetNcbi8aa().Get().size();
2091     case CSeq_data::e_Ncbieaa:
2092         return in_seq->GetNcbieaa().Get().size();
2093     case CSeq_data::e_Ncbipaa:
2094         return in_seq->GetNcbipaa().Get().size();
2095     case CSeq_data::e_Ncbistdaa:
2096         return in_seq->GetNcbistdaa().Get().size();
2097     default:
2098         break;
2099     }
2100     // nothing to convert
2101     if ( from_code == CSeq_data::e_Ncbi2na  &&
2102          in_seq->GetNcbi2na().Get().size() * 4 <= uLength ) {
2103         return in_seq->GetNcbi2na().Get().size() * 4;
2104     }
2105 
2106     const string*       in_str = 0;
2107     const vector<char>* in_vec = 0;
2108 
2109     x_GetSeqFromSeqData(*in_seq, &in_str, &in_vec);
2110 
2111     vector<char> out_vec;
2112     CSeqUtil::TCoding coding = CSeqUtil::e_not_set;
2113 
2114     TSeqPos retval = 0;
2115     if ( in_str != 0 ) {
2116         retval =
2117             CSeqConvert::Pack(*in_str, s_SeqDataToSeqUtil[from_code],
2118                               out_vec, coding, uLength);
2119     } else if ( in_vec != 0 ) {
2120         retval =
2121             CSeqConvert::Pack(*in_vec, s_SeqDataToSeqUtil[from_code],
2122                               out_vec, coding, uLength);
2123     }
2124 
2125     switch (coding) {
2126     case CSeqUtil::e_Ncbi2na:
2127         in_seq->SetNcbi2na().Set(out_vec);
2128         break;
2129     case CSeqUtil::e_Ncbi4na:
2130         in_seq->SetNcbi4na().Set(out_vec);
2131         break;
2132     default:
2133         _TROUBLE;
2134     }
2135 
2136     return retval;
2137 }
2138 
2139 
2140 // Method to quickly validate that a CSeq_data object contains valid data.
2141 // FastValidate is a dispatch function that calls the appropriate
2142 // private fast validation function.
FastValidate(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2143 bool CSeqportUtil_implementation::FastValidate
2144 (const CSeq_data&   in_seq,
2145  TSeqPos            uBeginIdx,
2146  TSeqPos            uLength)
2147     const
2148 {
2149     switch (in_seq.Which()) {
2150     case CSeq_data::e_Ncbi2na:
2151         return true; // ncbi2na sequences are always valid
2152     case CSeq_data::e_Ncbi4na:
2153         return true; // ncbi4na sequences are always valid
2154     case CSeq_data::e_Iupacna:
2155         return FastValidateIupacna(in_seq, uBeginIdx, uLength);
2156     case CSeq_data::e_Ncbieaa:
2157         return FastValidateNcbieaa(in_seq, uBeginIdx, uLength);
2158     case CSeq_data::e_Ncbistdaa:
2159         return FastValidateNcbistdaa(in_seq, uBeginIdx, uLength);
2160     case CSeq_data::e_Iupacaa:
2161         return FastValidateIupacaa(in_seq, uBeginIdx, uLength);
2162     default:
2163         throw runtime_error("Sequence could not be validated");
2164     }
2165 }
2166 
2167 
2168 // Function to perform full validation. Validate is a
2169 // dispatch function that calls the appropriate private
2170 // validation function.
Validate(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const2171 void CSeqportUtil_implementation::Validate
2172 (const CSeq_data&   in_seq,
2173  vector<TSeqPos>*   badIdx,
2174  TSeqPos            uBeginIdx,
2175  TSeqPos            uLength)
2176     const
2177 {
2178     switch (in_seq.Which()) {
2179     case CSeq_data::e_Ncbi2na:
2180         return; // ncbi2na sequences are always valid
2181     case CSeq_data::e_Ncbi4na:
2182         return; // ncbi4na sequences are always valid
2183     case CSeq_data::e_Iupacna:
2184         ValidateIupacna(in_seq, badIdx, uBeginIdx, uLength);
2185         break;
2186     case CSeq_data::e_Ncbieaa:
2187         ValidateNcbieaa(in_seq, badIdx, uBeginIdx, uLength);
2188         break;
2189     case CSeq_data::e_Ncbistdaa:
2190         ValidateNcbistdaa(in_seq, badIdx, uBeginIdx, uLength);
2191         break;
2192     case CSeq_data::e_Iupacaa:
2193         ValidateIupacaa(in_seq, badIdx, uBeginIdx, uLength);
2194         break;
2195     default:
2196         throw runtime_error("Sequence could not be validated");
2197     }
2198 }
2199 
2200 
2201 // Function to find ambiguous bases and vector of indices of
2202 // ambiguous bases in CSeq_data objects. GetAmbigs is a
2203 // dispatch function that calls the appropriate private get
2204 // ambigs function.
GetAmbigs(const CSeq_data & in_seq,CSeq_data * out_seq,vector<TSeqPos> * out_indices,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength) const2205 TSeqPos CSeqportUtil_implementation::GetAmbigs
2206 (const CSeq_data&     in_seq,
2207  CSeq_data*           out_seq,
2208  vector<TSeqPos>*     out_indices,
2209  CSeq_data::E_Choice  to_code,
2210  TSeqPos              uBeginIdx,
2211  TSeqPos              uLength)
2212     const
2213 {
2214 
2215     // Determine and call applicable GetAmbig method.
2216     switch (in_seq.Which()) {
2217     case CSeq_data::e_Ncbi4na:
2218         switch (to_code) {
2219         case CSeq_data::e_Ncbi2na:
2220             return GetAmbigs_ncbi4na_ncbi2na(in_seq, out_seq, out_indices,
2221                                              uBeginIdx, uLength);
2222         default:
2223             return 0;
2224         }
2225     case CSeq_data::e_Iupacna:
2226         switch (to_code) {
2227         case CSeq_data::e_Ncbi2na:
2228             return GetAmbigs_iupacna_ncbi2na(in_seq, out_seq, out_indices,
2229                                              uBeginIdx, uLength);
2230         default:
2231             return 0;
2232         }
2233     default:
2234         return 0;
2235     }
2236 }
2237 
2238 
2239 // Get a copy of in_seq from uBeginIdx through uBeginIdx + uLength-1
2240 // and put in out_seq. See comments in alphabet.hpp for more information.
2241 // GetCopy is a dispatch function.
GetCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2242 TSeqPos CSeqportUtil_implementation::GetCopy
2243 (const CSeq_data&   in_seq,
2244  CSeq_data*         out_seq,
2245  TSeqPos            uBeginIdx,
2246  TSeqPos            uLength)
2247     const
2248 {
2249     // Do processing based on in_seq type
2250     switch (in_seq.Which()) {
2251     case CSeq_data::e_Ncbi2na:
2252         return GetNcbi2naCopy(in_seq, out_seq, uBeginIdx, uLength);
2253     case CSeq_data::e_Ncbi4na:
2254         return GetNcbi4naCopy(in_seq, out_seq, uBeginIdx, uLength);
2255     case CSeq_data::e_Iupacna:
2256         return GetIupacnaCopy(in_seq, out_seq, uBeginIdx, uLength);
2257     case CSeq_data::e_Ncbieaa:
2258         return GetNcbieaaCopy(in_seq, out_seq, uBeginIdx, uLength);
2259     case CSeq_data::e_Ncbistdaa:
2260         return GetNcbistdaaCopy(in_seq, out_seq, uBeginIdx, uLength);
2261     case CSeq_data::e_Iupacaa:
2262         return GetIupacaaCopy(in_seq, out_seq, uBeginIdx, uLength);
2263     default:
2264         throw runtime_error
2265             ("GetCopy() is not implemented for the requested sequence type");
2266     }
2267 }
2268 
2269 
2270 
2271 
2272 // Method to keep only a contiguous piece of a sequence beginning
2273 // at uBeginIdx and uLength residues long. Keep is a
2274 // dispatch function.
Keep(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2275 TSeqPos CSeqportUtil_implementation::Keep
2276 (CSeq_data*   in_seq,
2277  TSeqPos      uBeginIdx,
2278  TSeqPos      uLength)
2279     const
2280 {
2281     // Do proceessing based upon in_seq type
2282     switch (in_seq->Which()) {
2283     case CSeq_data::e_Ncbi2na:
2284         return KeepNcbi2na(in_seq, uBeginIdx, uLength);
2285     case CSeq_data::e_Ncbi4na:
2286         return KeepNcbi4na(in_seq, uBeginIdx, uLength);
2287     case CSeq_data::e_Iupacna:
2288         return KeepIupacna(in_seq, uBeginIdx, uLength);
2289     case CSeq_data::e_Ncbieaa:
2290         return KeepNcbieaa(in_seq, uBeginIdx, uLength);
2291     case CSeq_data::e_Ncbistdaa:
2292         return KeepNcbistdaa(in_seq, uBeginIdx, uLength);
2293     case CSeq_data::e_Iupacaa:
2294         return KeepIupacaa(in_seq, uBeginIdx, uLength);
2295     default:
2296         throw runtime_error("Cannot perform Keep on in_seq type.");
2297     }
2298 }
2299 
2300 
2301 // Append in_seq2 to in_seq1 and put result in out_seq. This
2302 // is a dispatch function.
Append(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const2303 TSeqPos CSeqportUtil_implementation::Append
2304 (CSeq_data*         out_seq,
2305  const CSeq_data&   in_seq1,
2306  TSeqPos            uBeginIdx1,
2307  TSeqPos            uLength1,
2308  const CSeq_data&   in_seq2,
2309  TSeqPos            uBeginIdx2,
2310  TSeqPos            uLength2)
2311     const
2312 {
2313     // Check that in_seqs or of same type
2314     if(in_seq1.Which() != in_seq2.Which())
2315         throw runtime_error("Append in_seq types do not match.");
2316 
2317     // Check that out_seq is not null
2318     if(!out_seq) {
2319         return 0;
2320     }
2321 
2322     // Call applicable append method base on in_seq types
2323     switch (in_seq1.Which()) {
2324     case CSeq_data::e_Iupacna:
2325         return AppendIupacna(out_seq, in_seq1, uBeginIdx1, uLength1,
2326                              in_seq2, uBeginIdx2, uLength2);
2327     case CSeq_data::e_Ncbi2na:
2328         return AppendNcbi2na(out_seq, in_seq1, uBeginIdx1, uLength1,
2329                              in_seq2, uBeginIdx2, uLength2);
2330     case CSeq_data::e_Ncbi4na:
2331         return AppendNcbi4na(out_seq, in_seq1, uBeginIdx1, uLength1,
2332                              in_seq2, uBeginIdx2, uLength2);
2333     case CSeq_data::e_Ncbieaa:
2334         return AppendNcbieaa(out_seq, in_seq1, uBeginIdx1, uLength1,
2335                              in_seq2, uBeginIdx2, uLength2);
2336     case CSeq_data::e_Ncbistdaa:
2337         return AppendNcbistdaa(out_seq, in_seq1, uBeginIdx1, uLength1,
2338                                in_seq2, uBeginIdx2, uLength2);
2339     case CSeq_data::e_Iupacaa:
2340         return AppendIupacaa(out_seq, in_seq1, uBeginIdx1, uLength1,
2341                              in_seq2, uBeginIdx2, uLength2);
2342     default:
2343         throw runtime_error("Append for in_seq type not supported.");
2344     }
2345 }
2346 
2347 
2348 // Methods to complement na sequences. These are
2349 // dispatch functions.
2350 
2351 // Method to complement na sequence in place
Complement(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2352 TSeqPos CSeqportUtil_implementation::Complement
2353 (CSeq_data*   in_seq,
2354  TSeqPos      uBeginIdx,
2355  TSeqPos      uLength)
2356     const
2357 {
2358     _ASSERT(in_seq != 0);
2359 
2360     CSeq_data complement;
2361     TSeqPos retval = Complement(*in_seq, &complement, uBeginIdx, uLength);
2362     in_seq->Assign(complement);
2363 
2364     return retval;
2365 }
2366 
2367 
2368 // Method to complement na sequence in a copy out_seq
Complement(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2369 TSeqPos CSeqportUtil_implementation::Complement
2370 (const CSeq_data&   in_seq,
2371  CSeq_data*         out_seq,
2372  TSeqPos            uBeginIdx,
2373  TSeqPos            uLength)
2374     const
2375 {
2376     _ASSERT(out_seq != 0);
2377 
2378     if ( uLength == 0 ) {
2379         uLength = numeric_limits<TSeqPos>::max();
2380     }
2381     CSeq_data::E_Choice in_code = in_seq.Which();
2382     _ASSERT(in_code != CSeq_data::e_not_set);
2383 
2384     const string* in_str = 0;
2385     const vector<char>* in_vec = 0;
2386     x_GetSeqFromSeqData(in_seq, &in_str, &in_vec);
2387 
2388     TSeqPos retval = 0;
2389     if ( in_str ) {
2390         string out_str;
2391         retval = CSeqManip::Complement(*in_str, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_str);
2392         CSeq_data temp(out_str, in_code);
2393         out_seq->Assign(temp);
2394     } else if (in_vec != 0) {
2395         vector<char> out_vec;
2396         retval = CSeqManip::Complement(*in_vec, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_vec);
2397         CSeq_data temp(out_vec, in_code);
2398         out_seq->Assign(temp);
2399     }
2400     return retval;
2401 }
2402 
2403 
2404 // Methods to reverse na sequences. These are
2405 // dispatch functions.
2406 
2407 // Method to reverse na sequence in place
Reverse(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2408 TSeqPos CSeqportUtil_implementation::Reverse
2409 (CSeq_data*   in_seq,
2410  TSeqPos      uBeginIdx,
2411  TSeqPos      uLength)
2412     const
2413 {
2414     CSeq_data temp;
2415     TSeqPos retval = Reverse(*in_seq, &temp, uBeginIdx, uLength);
2416     in_seq->Assign(temp);
2417 
2418     return retval;
2419 }
2420 
2421 
2422 // Method to reverse na sequence in a copy out_seq
Reverse(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2423 TSeqPos CSeqportUtil_implementation::Reverse
2424 (const CSeq_data&  in_seq,
2425  CSeq_data*        out_seq,
2426  TSeqPos           uBeginIdx,
2427  TSeqPos           uLength)
2428     const
2429 {
2430     _ASSERT(out_seq != 0);
2431 
2432     if ( uLength == 0 ) {
2433         uLength = numeric_limits<TSeqPos>::max();
2434     }
2435 
2436     CSeq_data::E_Choice in_code = in_seq.Which();
2437     _ASSERT(in_code != CSeq_data::e_not_set);
2438 
2439     const string* in_str = 0;
2440     const vector<char>* in_vec = 0;
2441     x_GetSeqFromSeqData(in_seq, &in_str, &in_vec);
2442 
2443     TSeqPos retval = 0;
2444     if ( in_str ) {
2445         string out_str;
2446         retval = CSeqManip::Reverse(*in_str, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_str);
2447         CSeq_data temp(out_str, in_code);
2448         out_seq->Assign(temp);
2449     } else if (in_vec != NULL) {
2450         vector<char> out_vec;
2451         retval = CSeqManip::Reverse(*in_vec, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_vec);
2452         CSeq_data temp(out_vec, in_code);
2453         out_seq->Assign(temp);
2454     }
2455     return retval;
2456 }
2457 
2458 
2459 
2460 // Methods to reverse-complement a sequence. These are
2461 // dispatch functions.
2462 
2463 // Method to reverse-complement na sequence in place
ReverseComplement(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2464 TSeqPos CSeqportUtil_implementation::ReverseComplement
2465 (CSeq_data*  in_seq,
2466  TSeqPos     uBeginIdx,
2467  TSeqPos     uLength)
2468     const
2469 {
2470     _ASSERT(in_seq != 0);
2471 
2472     CSeq_data::E_Choice in_code = in_seq->Which();
2473     _ASSERT(in_code != CSeq_data::e_not_set);
2474 
2475     string* in_str = 0;
2476     vector<char>* in_vec = 0;
2477     x_GetSeqFromSeqData(*in_seq, &in_str, &in_vec);
2478 
2479     if ( in_str ) {
2480         return CSeqManip::ReverseComplement(*in_str, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength);
2481     } else if (in_vec != NULL) {
2482         return CSeqManip::ReverseComplement(*in_vec, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength);
2483     } else {
2484         return 0;
2485     }
2486 }
2487 
2488 
2489 // Method to reverse-complement na sequence in a copy out_seq
ReverseComplement(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2490 TSeqPos CSeqportUtil_implementation::ReverseComplement
2491 (const CSeq_data&  in_seq,
2492  CSeq_data*        out_seq,
2493  TSeqPos           uBeginIdx,
2494  TSeqPos           uLength)
2495     const
2496 {
2497     _ASSERT(out_seq != 0);
2498 
2499     if ( uLength == 0 ) {
2500         uLength = numeric_limits<TSeqPos>::max();
2501     }
2502 
2503     CSeq_data::E_Choice in_code = in_seq.Which();
2504     _ASSERT(in_code != CSeq_data::e_not_set);
2505 
2506     const string* in_str = 0;
2507     const vector<char>* in_vec = 0;
2508     x_GetSeqFromSeqData(in_seq, &in_str, &in_vec);
2509 
2510     TSeqPos retval = 0;
2511     if ( in_str ) {
2512         string out_str;
2513         retval = CSeqManip::ReverseComplement(*in_str, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_str);
2514         CSeq_data temp(out_str, in_code);
2515         out_seq->Assign(temp);
2516     } else if (in_vec != NULL) {
2517         vector<char> out_vec;
2518         retval = CSeqManip::ReverseComplement(*in_vec, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_vec);
2519         CSeq_data temp(out_vec, in_code);
2520         out_seq->Assign(temp);
2521     }
2522 
2523     return retval;
2524 }
2525 
2526 
2527 // Implement private worker functions called by public
2528 // dispatch functions.
2529 
2530 // Methods to convert between coding schemes
2531 
2532 /*
2533 // Convert in_seq from ncbi2na (1 byte) to iupacna (4 bytes)
2534 // and put result in out_seq
2535 TSeqPos CSeqportUtil_implementation::MapNcbi2naToIupacna
2536 (const CSeq_data&   in_seq,
2537  CSeq_data*         out_seq,
2538  TSeqPos            uBeginIdx,
2539  TSeqPos            uLength)
2540     const
2541 {
2542     // Save uBeginIdx and uLength for later use
2543     TSeqPos uBeginSav = uBeginIdx;
2544     TSeqPos uLenSav = uLength;
2545 
2546     // Get vector holding the in sequence
2547     const vector<char>& in_seq_data = in_seq.GetNcbi2na().Get();
2548 
2549     // Get string where the out sequence will go
2550     out_seq->Reset();
2551     string& out_seq_data = out_seq->SetIupacna().Set();
2552 
2553     // Validate uBeginSav
2554     if(uBeginSav >= 4*in_seq_data.size())
2555         return 0;
2556 
2557     // Adjust uLenSav
2558     if((uLenSav == 0 )|| ((uLenSav + uBeginSav )> 4*in_seq_data.size()))
2559         uLenSav = 4*in_seq_data.size() - uBeginSav;
2560 
2561 
2562     // Adjust uBeginIdx and uLength, if necessary
2563     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 4, 1);
2564 
2565     // Declare iterator for in_seq
2566     vector<char>::const_iterator i_in;
2567 
2568     // Allocate string memory for result of conversion
2569     out_seq_data.resize(uLenSav);
2570 
2571     // Get pointer to data of out_seq_data (a string)
2572     string::iterator i_out = out_seq_data.begin()-1;
2573 
2574     // Determine begin and end bytes of in_seq_data
2575     vector<char>::const_iterator i_in_begin =
2576         in_seq_data.begin() + uBeginIdx/4;
2577     vector<char>::const_iterator i_in_end = i_in_begin + uLength/4;
2578     if((uLength % 4) != 0) ++i_in_end;
2579     --i_in_end;
2580 
2581     // Handle first input sequence byte
2582     unsigned int uVal =
2583         m_FastNcbi2naIupacna->m_Table[static_cast<unsigned char>(*i_in_begin)];
2584     char *pchar, *pval;
2585     pval = reinterpret_cast<char*>(&uVal);
2586     for(pchar = pval + uBeginSav - uBeginIdx; pchar < pval + 4; ++pchar)
2587         *(++i_out) = *pchar;
2588 
2589     if(i_in_begin == i_in_end)
2590         return uLenSav;
2591     ++i_in_begin;
2592 
2593     // Loop through in_seq_data and convert to out_seq
2594     for(i_in = i_in_begin; i_in != i_in_end; ++i_in) {
2595         uVal =
2596             m_FastNcbi2naIupacna->m_Table[static_cast<unsigned char>(*i_in)];
2597         pchar = reinterpret_cast<char*>(&uVal);
2598         (*(++i_out)) = (*(pchar++));
2599         (*(++i_out)) = (*(pchar++));
2600         (*(++i_out)) = (*(pchar++));
2601         (*(++i_out)) = (*(pchar++));
2602     }
2603 
2604     // Handle last byte of input data
2605     uVal =
2606         m_FastNcbi2naIupacna->m_Table[static_cast<unsigned char>(*i_in_end)];
2607     pval = reinterpret_cast<char*>(&uVal);
2608     TSeqPos uOverhang = (uBeginSav + uLenSav) % 4;
2609     uOverhang = (uOverhang ==0) ? 4 : uOverhang;
2610     for(pchar = pval; pchar < pval + uOverhang; ++pchar) {
2611         (*(++i_out)) = *pchar;
2612     }
2613 
2614     return uLenSav;
2615 }
2616 
2617 
2618 // Convert in_seq from ncbi2na (1 byte) to ncbi4na (2 bytes)
2619 // and put result in out_seq
2620 TSeqPos CSeqportUtil_implementation::MapNcbi2naToNcbi4na
2621 (const CSeq_data&  in_seq,
2622  CSeq_data*        out_seq,
2623  TSeqPos           uBeginIdx,
2624  TSeqPos           uLength)
2625     const
2626 {
2627     // Get vector holding the in sequence
2628     const vector<char>& in_seq_data = in_seq.GetNcbi2na().Get();
2629 
2630     // Get vector where out sequence will go
2631     out_seq->Reset();
2632     vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
2633 
2634     // Save uBeginIdx and uLength for later use as they
2635     // are modified below
2636     TSeqPos uBeginSav = uBeginIdx;
2637     TSeqPos uLenSav = uLength;
2638 
2639     // Check that uBeginSav is not beyond end of in_seq_data
2640     if(uBeginSav >= 4*in_seq_data.size())
2641         return 0;
2642 
2643     // Adjust uLenSav
2644     if((uLenSav == 0) || ((uBeginSav + uLenSav) > 4*in_seq_data.size()))
2645         uLenSav = 4*in_seq_data.size() - uBeginSav;
2646 
2647 
2648     // Adjust uBeginIdx and uLength, if necessary
2649     TSeqPos uOverhang =
2650         Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 4, 2);
2651 
2652     // Declare iterator for in_seq
2653     vector<char>::const_iterator i_in;
2654 
2655     // Allocate memory for out_seq_data
2656     TSeqPos uInBytes = (uLength + uOverhang)/4;
2657     if(((uLength + uOverhang) % 4) != 0) uInBytes++;
2658     vector<char>::size_type nOutBytes = 2*uInBytes;
2659     out_seq_data.resize(nOutBytes);
2660 
2661     // Get an iterator of out_seq_data
2662     vector<char>::iterator i_out = out_seq_data.begin()-1;
2663 
2664     // Determine begin and end bytes of in_seq_data
2665     vector<char>::const_iterator i_in_begin =
2666         in_seq_data.begin() + uBeginIdx/4;
2667     vector<char>::const_iterator i_in_end = i_in_begin + uInBytes;
2668 
2669     // Loop through in_seq_data and convert to out_seq_data
2670     for(i_in = i_in_begin; i_in != i_in_end; ++i_in) {
2671         unsigned short uVal =
2672             m_FastNcbi2naNcbi4na->m_Table[static_cast<unsigned char>(*i_in)];
2673         char* pch = reinterpret_cast<char*>(&uVal);
2674         (*(++i_out)) = (*(pch++));
2675         (*(++i_out)) = (*(pch++));
2676     }
2677     TSeqPos keepidx = uBeginSav - uBeginIdx;
2678     KeepNcbi4na(out_seq, keepidx, uLenSav);
2679 
2680     return uLenSav;
2681 }
2682 
2683 
2684 // Convert in_seq from ncbi4na (1 byte) to iupacna (2 bytes)
2685 // and put result in out_seq
2686 TSeqPos CSeqportUtil_implementation::MapNcbi4naToIupacna
2687 (const CSeq_data& in_seq,
2688  CSeq_data*       out_seq,
2689  TSeqPos          uBeginIdx,
2690  TSeqPos          uLength)
2691     const
2692 {
2693     // Save uBeginIdx and uLength for later use
2694     TSeqPos uBeginSav = uBeginIdx;
2695     TSeqPos uLenSav = uLength;
2696 
2697     // Get vector holding the in sequence
2698     const vector<char>& in_seq_data = in_seq.GetNcbi4na().Get();
2699 
2700     // Get string where the out sequence will go
2701     out_seq->Reset();
2702     string& out_seq_data = out_seq->SetIupacna().Set();
2703 
2704     // Validate uBeginSav
2705     if(uBeginSav >= 2*in_seq_data.size())
2706         return 0;
2707 
2708     // Adjust uLenSav
2709     if((uLenSav == 0 )|| ((uLenSav + uBeginSav )> 2*in_seq_data.size()))
2710         uLenSav = 2*in_seq_data.size() - uBeginSav;
2711 
2712 
2713     // Adjust uBeginIdx and uLength, if necessary
2714     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 2, 1);
2715 
2716     // Declare iterator for in_seq
2717     vector<char>::const_iterator i_in;
2718 
2719     // Allocate string memory for result of conversion
2720     out_seq_data.resize(uLenSav);
2721 
2722     // Get pointer to data of out_seq_data (a string)
2723     string::iterator i_out = out_seq_data.begin() - 1;
2724 
2725     // Determine begin and end bytes of in_seq_data
2726     vector<char>::const_iterator i_in_begin =
2727         in_seq_data.begin() + uBeginIdx/2;
2728     vector<char>::const_iterator i_in_end = i_in_begin + uLength/2;
2729     if((uLength % 2) != 0) ++i_in_end;
2730     --i_in_end;
2731 
2732     // Handle first input sequence byte
2733     unsigned short uVal =
2734         m_FastNcbi4naIupacna->m_Table[static_cast<unsigned char>(*i_in_begin)];
2735     char *pchar, *pval;
2736     pval = reinterpret_cast<char*>(&uVal);
2737     for(pchar = pval + uBeginSav - uBeginIdx; pchar < pval + 2; ++pchar)
2738         *(++i_out) = *pchar;
2739 
2740     if(i_in_begin == i_in_end)
2741         return uLenSav;
2742     ++i_in_begin;
2743 
2744     // Loop through in_seq_data and convert to out_seq
2745     for(i_in = i_in_begin; i_in != i_in_end; ++i_in) {
2746         uVal =
2747             m_FastNcbi4naIupacna->m_Table[static_cast<unsigned char>(*i_in)];
2748         pchar = reinterpret_cast<char*>(&uVal);
2749         (*(++i_out)) = (*(pchar++));
2750         (*(++i_out)) = (*(pchar++));
2751     }
2752 
2753     // Handle last byte of input data
2754     uVal =
2755         m_FastNcbi4naIupacna->m_Table[static_cast<unsigned char>(*i_in_end)];
2756     pval = reinterpret_cast<char*>(&uVal);
2757     TSeqPos uOverhang = (uBeginSav + uLenSav) % 2;
2758     uOverhang = (uOverhang ==0) ? 2 : uOverhang;
2759     for(pchar = pval; pchar < pval + uOverhang; ++pchar)
2760         (*(++i_out)) = *pchar;
2761 
2762     return uLenSav;
2763 }
2764 */
2765 
2766 // Table for quick check of whether an ncbi4na residue represents an ambiguity.
2767 // The 0 value is not considered an ambiguity, as it represents the end of
2768 // buffer.
2769 static const char kAmbig4na[16] =
2770     { 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 };
2771 
2772 class CAmbiguityContext {
2773 public:
2774     CAmbiguityContext(vector<Uint4>& amb_buff, int seq_length);
2775     // Make sure the vector is not freed in the destructor
~CAmbiguityContext()2776     ~CAmbiguityContext() {}
2777     void UpdateBuffer();
2778     void AddAmbiguity(char in_byte, TSeqPos& seq_pos);
2779     void Finish();
2780 private:
2781     vector<Uint4>& m_vAmbBuf; ///< Ambiguity buffer to fill
2782     char m_LastAmbChar; ///< Last previous ambiguity character
2783     TSeqPos m_AmbCount;
2784     TSeqPos m_AmbStart;
2785     int m_BuffPos;
2786     bool m_bLongFormat;
2787     TSeqPos m_MaxAmbCount;
2788 };
2789 
CAmbiguityContext(vector<Uint4> & amb_buff,int seq_length)2790 CAmbiguityContext::CAmbiguityContext(vector<Uint4>& amb_buff, int seq_length)
2791     : m_vAmbBuf(amb_buff)
2792 {
2793     m_AmbCount = 0;
2794     m_AmbStart = 0;
2795     m_BuffPos = 0;
2796     m_LastAmbChar = 0;
2797     m_bLongFormat = (seq_length >= 0x00ffffff);
2798     m_MaxAmbCount = (m_bLongFormat ? 0x00000fff : 0x0000000f);
2799     // If "long format", set the top bit in the length element of the
2800     // ambiguity vector, but only if the input vector is empty. Otherwise,
2801     // assume that this initialization has already been done in the previous
2802     // invocation.
2803     if (m_vAmbBuf.size() == 0) {
2804         Uint4 amb_len = (m_bLongFormat ? 0x80000000 : 0);
2805         m_vAmbBuf.push_back(amb_len);
2806     }
2807 }
2808 
2809 
UpdateBuffer()2810 void CAmbiguityContext::UpdateBuffer()
2811 {
2812     // If there are no more unprocessed ambiguities, return.
2813     if (!m_LastAmbChar)
2814         return;
2815 
2816     Uint4 amb_element = m_LastAmbChar << 28;
2817     // In long format length occupies bits 16-27, and sequence position is stored
2818     // in the next integer element. In short format length occupies bits 24-27,
2819     // and sequence position is stored in the same integer element.
2820     if (m_bLongFormat) {
2821         amb_element |= (m_AmbCount << 16);
2822         m_vAmbBuf.push_back(amb_element);
2823         m_vAmbBuf.push_back(m_AmbStart);
2824     } else {
2825         amb_element |= (m_AmbCount << 24);
2826         amb_element |= m_AmbStart;
2827         m_vAmbBuf.push_back(amb_element);
2828     }
2829 }
2830 
AddAmbiguity(char in_byte,TSeqPos & seq_pos)2831 void CAmbiguityContext::AddAmbiguity(char in_byte, TSeqPos& seq_pos)
2832 {
2833     char res[2];
2834 
2835     res[0] = (in_byte >> 4) & 0x0f;
2836     res[1] = in_byte & 0x0f;
2837 
2838     for (int i = 0; i < 2; ++i, ++seq_pos) {
2839         if (kAmbig4na[(int)res[i]]) {
2840             if ((res[i] != m_LastAmbChar) || (m_AmbCount >= m_MaxAmbCount)) {
2841                 // Finish the previous ambiguity element, start new;
2842                 UpdateBuffer();
2843                 m_LastAmbChar = res[i];
2844                 m_AmbCount = 0;
2845                 m_AmbStart = seq_pos;
2846             } else {
2847                 // Just increment the count for the last ambiguity
2848                 ++m_AmbCount;
2849             }
2850         } else {
2851             // No ambiguity: finish the previous ambiguity element, if any,
2852             // reset the m_LastAmbChar and count.
2853             UpdateBuffer();
2854             m_LastAmbChar = 0;
2855             m_AmbCount = 0;
2856         }
2857     }
2858 }
2859 
Finish()2860 void CAmbiguityContext::Finish()
2861 {
2862     UpdateBuffer();
2863     // In the first element of the vector, preserve the top bit, and reset the
2864     // remainder to the number of ambiguity entries.
2865     m_vAmbBuf[0] =
2866         (m_vAmbBuf[0] & 0x80000000) | ((m_vAmbBuf.size() - 1) & 0x7fffffff);
2867 }
2868 
2869 // Function to convert iupacna (4 bytes) to ncbi2na (1 byte)
MapIupacnaToNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength,bool bAmbig,CRandom::TValue seed,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig) const2870 TSeqPos CSeqportUtil_implementation::MapIupacnaToNcbi2na
2871 (const CSeq_data& in_seq,
2872  CSeq_data*       out_seq,
2873  TSeqPos          uBeginIdx,
2874  TSeqPos          uLength,
2875  bool             bAmbig,
2876  CRandom::TValue  seed,
2877  TSeqPos          total_length,
2878  TSeqPos*         out_seq_length,
2879  vector<Uint4>*   blast_ambig)
2880     const
2881 {
2882     // Get string holding the in_seq
2883     const string& in_seq_data = in_seq.GetIupacna().Get();
2884 
2885     // Out sequence may contain unfinished byte from the previous segment
2886     if (out_seq_length != nullptr  &&  *out_seq_length == 0)
2887        out_seq->Reset();
2888     // Get vector where the out sequence will go
2889     vector<char>& out_seq_data = out_seq->SetNcbi2na().Set();
2890 
2891     // If uBeginIdx is after end of in_seq, return
2892     if(uBeginIdx >= in_seq_data.size())
2893         return 0;
2894 
2895     // Determine return value
2896     TSeqPos uLenSav = uLength;
2897     if((uLenSav == 0) || ((uLenSav + uBeginIdx)) > in_seq_data.size())
2898         uLenSav = in_seq_data.size() - uBeginIdx;
2899 
2900 
2901     // Adjust uBeginIdx and uLength, if necessary and get uOverhang
2902     TSeqPos uOverhang =
2903         Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 4);
2904 
2905     // Check if the output sequence data has already been filled
2906     // with some previous data, e.g. previous segment of a delta
2907     // sequence.
2908     TSeqPos out_seq_pos = 0;
2909     if (out_seq_length) {
2910         out_seq_pos = *out_seq_length;
2911         *out_seq_length += uLenSav;
2912     }
2913     TSeqPos rbit = 2*(out_seq_pos % 4);
2914     TSeqPos lbit = 8 - rbit;
2915 
2916     // Allocate vector memory for result of conversion
2917     // Note memory for overhang is added below.
2918     vector<char>::size_type nBytes = (out_seq_pos + uLenSav + 3) / 4;
2919     out_seq_data.resize(nBytes);
2920 
2921     // Instantiate an ambiguity context object, if BLAST-style
2922     // ambiguity output is requested.
2923     auto_ptr<CAmbiguityContext> amb_context(NULL);
2924     if (blast_ambig) {
2925         amb_context.reset(new CAmbiguityContext(*blast_ambig, total_length));
2926     }
2927 
2928     // Declare iterator for out_seq_data and determine begin and end
2929     vector<char>::iterator i_out;
2930     vector<char>::iterator i_out_begin = out_seq_data.begin() + out_seq_pos/4;
2931     vector<char>::iterator i_out_end = i_out_begin + uLength/4;
2932 
2933     // Determine begin of in_seq_data
2934     string::const_iterator i_in = in_seq_data.begin() + uBeginIdx;
2935 
2936     char new_byte;
2937     const int kOneByteMask = 0xff;
2938 
2939     if(bAmbig)
2940         {
2941             // Do random disambiguation
2942             unsigned char c1, c2;
2943             CRandom::TValue rv;
2944 
2945             // Declare a random number generator and set seed
2946             CRandom rg;
2947             rg.SetSeed(seed);
2948 
2949 	    // Do disambiguation by converting Iupacna to Ncbi4na
2950 	    // deterministically and then converting from Ncbi4na to Ncbi2na
2951 	    // with random disambiguation
2952 
2953 	    // Loop through the out_seq_data converting 4 Iupacna bytes to
2954 	    // one Ncbi2na byte. in_seq_data.size() % 4 bytes at end of
2955 	    // input handled separately below.
2956             for(i_out = i_out_begin; i_out != i_out_end; )
2957                 {
2958                     // Determine first Ncbi4na byte from 1st two Iupacna bytes
2959                     c1 =
2960                         m_FastIupacnaNcbi4na->m_Table
2961                         [0][static_cast<unsigned char>(*i_in)] |
2962                         m_FastIupacnaNcbi4na->m_Table
2963                         [1][static_cast<unsigned char>(*(i_in+1))];
2964 
2965 		    // Determine second Ncbi4na byte from 2nd two Iupacna bytes
2966                     c2 =
2967                         m_FastIupacnaNcbi4na->m_Table
2968                         [0][static_cast<unsigned char>(*(i_in+2))]|
2969                         m_FastIupacnaNcbi4na->m_Table
2970                         [1][static_cast<unsigned char>(*(i_in+3))];
2971 
2972                     if (blast_ambig) {
2973                         amb_context->AddAmbiguity(c1, out_seq_pos);
2974                         amb_context->AddAmbiguity(c2, out_seq_pos);
2975                     }
2976 
2977 		    // Randomly pick disambiguated Ncbi4na bytes
2978                     rv = rg.GetRand() % 16;
2979                     c1 &= m_Masks->m_Table[c1].cMask[rv];
2980                     rv = rg.GetRand() % 16;
2981                     c2 &= m_Masks->m_Table[c2].cMask[rv];
2982 
2983 		    // Convert from Ncbi4na to Ncbi2na
2984                     // Calculate the new byte. Assign parts of it to the
2985                     // remainder of the current output byte, and the
2986                     // front part of the next output byte, advancing the
2987                     // output iterator in the process.
2988                     new_byte = m_FastNcbi4naNcbi2na->m_Table[0][c1] |
2989                         m_FastNcbi4naNcbi2na->m_Table[1][c2];
2990                     (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
2991                     ++i_out;
2992                     // Fill part of next byte only if it is necessary, i.e. when
2993                     // rbit is not 0.
2994                     if (rbit)
2995                         (*i_out) = ((new_byte & kOneByteMask) << lbit);
2996 
2997 		    // Increment input sequence iterator.
2998                     i_in+=4;
2999                 }
3000 
3001             // Handle overhang at end of in_seq
3002             switch (uOverhang) {
3003             case 1:
3004                 c1 =
3005                     m_FastIupacnaNcbi4na->m_Table
3006                     [0][static_cast<unsigned char>(*i_in)];
3007                 if (blast_ambig)
3008                     amb_context->AddAmbiguity(c1, out_seq_pos);
3009                 rv = rg.GetRand() % 16;
3010                 c1 &= m_Masks->m_Table[c1].cMask[rv];
3011                 new_byte = m_FastNcbi4naNcbi2na->m_Table[0][c1] & 0xC0;
3012                 break;
3013             case 2:
3014                 c1 =
3015                     m_FastIupacnaNcbi4na->m_Table
3016                     [0][static_cast<unsigned char>(*i_in)] |
3017                     m_FastIupacnaNcbi4na->m_Table
3018                     [1][static_cast<unsigned char>(*(i_in+1))];
3019                 if (blast_ambig)
3020                     amb_context->AddAmbiguity(c1, out_seq_pos);
3021                 rv = rg.GetRand() % 16;
3022                 c1 &= m_Masks->m_Table[c1].cMask[rv];
3023                 new_byte = m_FastNcbi4naNcbi2na->m_Table[0][c1] & 0xF0;
3024                 break;
3025             case 3:
3026                 c1 =
3027                     m_FastIupacnaNcbi4na->m_Table
3028                     [0][static_cast<unsigned char>(*i_in)] |
3029                     m_FastIupacnaNcbi4na->m_Table
3030                     [1][static_cast<unsigned char>(*(i_in+1))];
3031                 c2 =
3032                     m_FastIupacnaNcbi4na->m_Table
3033                     [0][static_cast<unsigned char>(*(i_in+2))];
3034                 if (blast_ambig) {
3035                     amb_context->AddAmbiguity(c1, out_seq_pos);
3036                     amb_context->AddAmbiguity(c2, out_seq_pos);
3037                 }
3038                 rv = rg.GetRand() % 16;
3039                 c1 &= m_Masks->m_Table[c1].cMask[rv];
3040                 rv = rg.GetRand() % 16;
3041                 c2 &= m_Masks->m_Table[c2].cMask[rv];
3042                 new_byte = (m_FastNcbi4naNcbi2na->m_Table[0][c1] & 0xF0) |
3043                     (m_FastNcbi4naNcbi2na->m_Table[1][c2] & 0x0C);
3044                 break;
3045             default:
3046                 // This is a bogus assignment, just to suppress a
3047                 // compiler warning. The value will not actually be
3048                 // used (see the "uOverhang > 0" condition below)
3049                 new_byte = 0;
3050                 break;
3051             }
3052 
3053             // Assign respective parts of the new byte to the remaining parts
3054             // of the output sequence. Output iterator only needs to be
3055             // incremented if the overhang is greater than the unfilled
3056             // remainder  of the last output byte.
3057             if (uOverhang > 0) {
3058                 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3059                 if (2*uOverhang > lbit) {
3060                     ++i_out;
3061                     (*i_out) = ((new_byte & kOneByteMask) << lbit);
3062                 }
3063             }
3064 
3065             if (blast_ambig)
3066                 amb_context->Finish();
3067         }
3068     else
3069         {
3070             // Pack uLength input characters into out_seq_data
3071             for(i_out = i_out_begin; i_out != i_out_end; )
3072                 {
3073                     new_byte =
3074                         m_FastIupacnaNcbi2na->m_Table
3075                         [0][static_cast<unsigned char>(*(i_in))] |
3076                         m_FastIupacnaNcbi2na->m_Table
3077                         [1][static_cast<unsigned char>(*(i_in+1))] |
3078                         m_FastIupacnaNcbi2na->m_Table
3079                         [2][static_cast<unsigned char>(*(i_in+2))] |
3080                         m_FastIupacnaNcbi2na->m_Table
3081                         [3][static_cast<unsigned char>(*(i_in+3))];
3082                     (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3083                     ++i_out;
3084                     // Fill part of next byte only if it is necessary, i.e. when
3085                     // rbit is not 0.
3086                     if (rbit)
3087                         (*i_out) = ((new_byte & kOneByteMask) << lbit);
3088                     i_in+=4;
3089                 }
3090 
3091             // Handle overhang
3092             if(uOverhang > 0) {
3093                 new_byte = '\x00';
3094                 for(TSeqPos i = 0; i < uOverhang; i++) {
3095                     new_byte |=
3096                         m_FastIupacnaNcbi2na->m_Table
3097                         [i][static_cast<unsigned char>(*(i_in+i))];
3098                 }
3099                 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3100                 if (2*uOverhang > lbit) {
3101                     ++i_out;
3102                     (*i_out) = ((new_byte & kOneByteMask) << lbit);
3103                 }
3104             }
3105         }
3106     return uLenSav;
3107 }
3108 /*
3109 
3110 // Function to convert iupacna (2 bytes) to ncbi4na (1 byte)
3111 TSeqPos CSeqportUtil_implementation::MapIupacnaToNcbi4na
3112 (const CSeq_data&  in_seq,
3113  CSeq_data*        out_seq,
3114  TSeqPos           uBeginIdx,
3115  TSeqPos           uLength)
3116     const
3117 {
3118     // Get string holding the in_seq
3119     const string& in_seq_data = in_seq.GetIupacna().Get();
3120 
3121     // Get vector where the out sequence will go
3122     out_seq->Reset();
3123     vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
3124 
3125     // If uBeginIdx beyond end of in_seq, return
3126     if(uBeginIdx >= in_seq_data.size())
3127         return 0;
3128 
3129     // Determine return value
3130     TSeqPos uLenSav = uLength;
3131     if((uLenSav == 0) || (uLenSav + uBeginIdx) > in_seq_data.size())
3132         uLenSav = in_seq_data.size() - uBeginIdx;
3133 
3134     // Adjust uBeginIdx and uLength and get uOverhang
3135     TSeqPos uOverhang =
3136         Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 2);
3137 
3138     // Allocate vector memory for result of conversion
3139     // Note memory for overhang is added below.
3140     vector<char>::size_type nBytes = uLength/2;
3141     out_seq_data.resize(nBytes);
3142 
3143     // Declare iterator for out_seq_data and determine begin and end
3144     vector<char>::iterator i_out;
3145     vector<char>::iterator i_out_begin = out_seq_data.begin();
3146     vector<char>::iterator i_out_end = i_out_begin + uLength/2;
3147 
3148     // Determine begin of in_seq_data offset by 1
3149     string::const_iterator i_in = in_seq_data.begin() + uBeginIdx;
3150 
3151     // Pack uLength input characters into out_seq_data
3152     for(i_out = i_out_begin; i_out != i_out_end; ++i_out) {
3153         (*i_out) =
3154             m_FastIupacnaNcbi4na->m_Table
3155             [0][static_cast<unsigned char>(*(i_in))] |
3156             m_FastIupacnaNcbi4na->m_Table
3157             [1][static_cast<unsigned char>(*(i_in+1))];
3158         i_in+=2;
3159     }
3160 
3161     // Handle overhang
3162     char ch = '\x00';
3163     if (uOverhang > 0) {
3164         ch |=
3165             m_FastIupacnaNcbi4na->
3166             m_Table[0][static_cast<unsigned char>(*i_in)];
3167         out_seq_data.push_back(ch);
3168     }
3169 
3170     return uLenSav;
3171 }
3172 */
3173 
3174 
3175 // Function to convert ncbi4na (2 bytes) to ncbi2na (1 byte)
MapNcbi4naToNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength,bool bAmbig,CRandom::TValue seed,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig) const3176 TSeqPos CSeqportUtil_implementation::MapNcbi4naToNcbi2na(
3177              const CSeq_data&  in_seq, CSeq_data* out_seq,
3178              TSeqPos uBeginIdx, TSeqPos uLength,
3179              bool bAmbig, CRandom::TValue seed,
3180              TSeqPos total_length,
3181              TSeqPos* out_seq_length, vector<Uint4>* blast_ambig)
3182     const
3183 {
3184     // Get vector holding the in_seq
3185     const vector<char>& in_seq_data = in_seq.GetNcbi4na().Get();
3186 
3187     // Out sequence may contain unfinished byte from a previous segment.
3188     if (out_seq_length != nullptr  &&  *out_seq_length == 0)
3189        out_seq->Reset();
3190     // Get vector where the out sequence will go
3191     vector<char>& out_seq_data = out_seq->SetNcbi2na().Set();
3192 
3193     // Save uBeginIdx and uLength as they will be modified below
3194     TSeqPos uBeginSav = uBeginIdx;
3195     TSeqPos uLenSav = uLength;
3196 
3197 
3198     // Check that uBeginSav is not beyond end of in_seq
3199     if(uBeginSav >= 2*in_seq_data.size())
3200         return 0;
3201 
3202     // Adjust uLenSav if needed
3203     if((uLenSav == 0) || ((uBeginSav + uLenSav) > 2*in_seq_data.size()))
3204         uLenSav = 2*in_seq_data.size() - uBeginSav;
3205 
3206     // Adjust uBeginIdx and uLength and get uOverhang
3207     TSeqPos uOverhang =
3208         Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 2, 4);
3209 
3210     // Check if the output sequence data has already been filled
3211     // with some previous data, e.g. previous segment of a delta
3212     // sequence.
3213     TSeqPos out_seq_pos = 0;
3214     if (out_seq_length) {
3215         out_seq_pos = *out_seq_length;
3216         *out_seq_length += uLenSav;
3217     }
3218     TSeqPos rbit = 2*(out_seq_pos % 4);
3219     TSeqPos lbit = 8 - rbit;
3220 
3221     // Allocate vector memory for result of conversion
3222     // Note memory for overhang is added below.
3223     vector<char>::size_type nBytes = (out_seq_pos + uLenSav + 3) / 4;
3224     out_seq_data.resize(nBytes);
3225 
3226     // Instantiate an ambiguity context object, if BLAST-style
3227     // ambiguity output is requested.
3228     auto_ptr<CAmbiguityContext> amb_context(NULL);
3229     if (blast_ambig) {
3230         amb_context.reset(new CAmbiguityContext(*blast_ambig, total_length));
3231     }
3232 
3233     // Declare iterator for out_seq_data and determine begin and end
3234     vector<char>::iterator i_out;
3235     vector<char>::iterator i_out_begin = out_seq_data.begin() + out_seq_pos/4;
3236     vector<char>::iterator i_out_end = i_out_begin + uLength/4;
3237 
3238     // Make sure that the first byte of out_seq_data does not contain garbage
3239     // in the bits that are not yet supposed to be filled.
3240     *i_out_begin &= (0xff << lbit);
3241 
3242     // Determine begin of in_seq_data
3243     vector<char>::const_iterator i_in = in_seq_data.begin() + uBeginIdx/2;
3244 
3245     char new_byte;
3246     const int kOneByteMask = 0xff;
3247 
3248     if(bAmbig) {         // Do random disambiguation
3249         // Declare a random number generator and set seed
3250         CRandom rg;
3251         rg.SetSeed(seed);
3252 
3253         // Pack uLength input bytes into out_seq_data
3254         for(i_out = i_out_begin; i_out != i_out_end; ) {
3255             // Disambiguate
3256             unsigned char c1 = static_cast<unsigned char>(*i_in);
3257             unsigned char c2 = static_cast<unsigned char>(*(i_in+1));
3258 
3259             if (blast_ambig) {
3260                 amb_context->AddAmbiguity(c1, out_seq_pos);
3261                 amb_context->AddAmbiguity(c2, out_seq_pos);
3262             }
3263             CRandom::TValue rv = rg.GetRand() % 16;
3264             c1 &= m_Masks->m_Table[c1].cMask[rv];
3265             rv = rg.GetRand() % 16;
3266             c2 &= m_Masks->m_Table[c2].cMask[rv];
3267 
3268             // Convert
3269             new_byte = m_FastNcbi4naNcbi2na->m_Table[0][c1] |
3270                 m_FastNcbi4naNcbi2na->m_Table[1][c2];
3271             (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3272             ++i_out;
3273             // Fill part of next byte only if it is necessary, i.e. when
3274             // rbit is not 0.
3275             if (rbit)
3276                 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3277             i_in+=2;
3278         }
3279 
3280         // Handle overhang
3281         new_byte = '\x00';
3282 
3283         if(uOverhang > 0) {
3284             // Disambiguate
3285             unsigned char c1 = static_cast<unsigned char>(*i_in);
3286             // If only one residue, make sure that the second half of the byte
3287             // is 0.
3288             if (uOverhang == 1)
3289                 c1 &= 0xf0;
3290             if (blast_ambig)
3291                 amb_context->AddAmbiguity(c1, out_seq_pos);
3292             CRandom::TValue rv = rg.GetRand() % 16;
3293             c1 &= m_Masks->m_Table[c1].cMask[rv];
3294 
3295             // Convert
3296             new_byte |= m_FastNcbi4naNcbi2na->m_Table[0][c1];
3297         }
3298 
3299         if(uOverhang == 3) {
3300             // Disambiguate; make sure that the second half of the byte
3301             // is 0.
3302             unsigned char c1 = static_cast<unsigned char>(*(++i_in)) & 0xf0;
3303             if (blast_ambig)
3304                 amb_context->AddAmbiguity(c1, out_seq_pos);
3305             CRandom::TValue rv = rg.GetRand() % 16;
3306             c1 &= m_Masks->m_Table[c1].cMask[rv];
3307 
3308             // Convert
3309             new_byte |= m_FastNcbi4naNcbi2na->m_Table[1][c1];
3310         }
3311 
3312         if(uOverhang > 0) {
3313             (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3314             if (2*uOverhang > lbit) {
3315                 ++i_out;
3316                 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3317             }
3318         }
3319 
3320         if (blast_ambig)
3321             amb_context->Finish();
3322     } else { // Do not do random disambiguation
3323 
3324         // Pack uLength input bytes into out_seq_data
3325         for(i_out = i_out_begin; i_out != i_out_end; ) {
3326             new_byte =
3327                 m_FastNcbi4naNcbi2na->m_Table
3328                 [0][static_cast<unsigned char>(*i_in)] |
3329                 m_FastNcbi4naNcbi2na->m_Table
3330                 [1][static_cast<unsigned char>(*(i_in+1))];
3331             (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3332             ++i_out;
3333             // Fill part of next byte only if it is necessary, i.e. when
3334             // rbit is not 0.
3335             if (rbit)
3336                 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3337             i_in+=2;
3338         }
3339 
3340         // Handle overhang
3341         if(uOverhang > 0) {
3342             new_byte = '\x00';
3343             new_byte |= m_FastNcbi4naNcbi2na->m_Table
3344                 [0][static_cast<unsigned char>(*i_in)];
3345 
3346             if(uOverhang == 3)
3347                 new_byte |= m_FastNcbi4naNcbi2na->m_Table
3348                     [1][static_cast<unsigned char>(*(++i_in))];
3349 
3350             (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3351             if (2*uOverhang > lbit) {
3352                 ++i_out;
3353                 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3354             }
3355         }
3356     }
3357 
3358     TSeqPos keepidx = uBeginSav - uBeginIdx;
3359     KeepNcbi2na(out_seq, keepidx, uLenSav);
3360 
3361     return uLenSav;
3362 }
3363 
3364 /*
3365 // Function to convert iupacaa (byte) to ncbieaa (byte)
3366 TSeqPos CSeqportUtil_implementation::MapIupacaaToNcbieaa
3367 (const CSeq_data&  in_seq,
3368  CSeq_data*        out_seq,
3369  TSeqPos           uBeginIdx,
3370  TSeqPos           uLength)
3371     const
3372 {
3373     // Get read-only reference to in_seq data
3374     const string& in_seq_data = in_seq.GetIupacaa().Get();
3375 
3376     // Get read & write reference to out_seq data
3377     out_seq->Reset();
3378     string& out_seq_data = out_seq->SetNcbieaa().Set();
3379 
3380     // If uBeginIdx beyond end of in_seq, return
3381     if(uBeginIdx >= in_seq_data.size())
3382         return 0;
3383 
3384     // Adjust uBeginIdx and uLength, if necessary
3385     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3386 
3387     // Allocate memory for out_seq
3388     out_seq_data.resize(uLength);
3389 
3390     // Declare iterator for out_seq_data
3391     string::iterator i_out = out_seq_data.begin();
3392 
3393     // Declare iterator for in_seq_data and determine begin and end
3394     string::const_iterator i_in;
3395     string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3396     string::const_iterator i_in_end = i_in_begin + uLength;
3397 
3398     // Loop through input and convert to output
3399     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3400         (*(i_out++)) =
3401             m_IupacaaNcbieaa->m_Table[static_cast<unsigned char>(*i_in)];
3402 
3403     return uLength;
3404 }
3405 
3406 
3407 // Function to convert ncbieaa (byte) to iupacaa (byte)
3408 TSeqPos CSeqportUtil_implementation::MapNcbieaaToIupacaa
3409 (const CSeq_data&   in_seq,
3410  CSeq_data*         out_seq,
3411  TSeqPos            uBeginIdx,
3412  TSeqPos            uLength)
3413     const
3414 {
3415     // Get read-only reference to in_seq data
3416     const string& in_seq_data = in_seq.GetNcbieaa().Get();
3417 
3418     // Get read & write reference to out_seq data
3419     out_seq->Reset();
3420     string& out_seq_data = out_seq->SetIupacaa().Set();
3421 
3422     // If uBeginIdx beyond end of in_seq, return
3423     if(uBeginIdx >= in_seq_data.size())
3424         return 0;
3425 
3426     // Adjust uBeginIdx and uLength, if necessary
3427     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3428 
3429     // Allocate memory for out_seq
3430     out_seq_data.resize(uLength);
3431 
3432     // Declare iterator for out_seq_data
3433     string::iterator i_out = out_seq_data.begin();
3434 
3435     // Declare iterator for in_seq_data and determine begin and end
3436     string::const_iterator i_in;
3437     string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3438     string::const_iterator i_in_end = i_in_begin + uLength;
3439 
3440     // Loop through input and convert to output
3441     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3442         (*(i_out++)) =
3443             m_NcbieaaIupacaa->m_Table[static_cast<unsigned char>(*i_in)];
3444 
3445     return uLength;
3446 }
3447 
3448 
3449 // Function to convert iupacaa (byte) to ncbistdaa (byte)
3450 TSeqPos CSeqportUtil_implementation::MapIupacaaToNcbistdaa
3451 (const CSeq_data&  in_seq,
3452  CSeq_data*        out_seq,
3453  TSeqPos           uBeginIdx,
3454  TSeqPos           uLength)
3455     const
3456 {
3457     // Get read-only reference to in_seq data
3458     const string& in_seq_data = in_seq.GetIupacaa().Get();
3459 
3460     // Get read & write reference to out_seq data
3461     out_seq->Reset();
3462     vector<char>& out_seq_data = out_seq->SetNcbistdaa().Set();
3463 
3464     // If uBeginIdx beyond end of in_seq, return
3465     if(uBeginIdx >= in_seq_data.size())
3466         return 0;
3467 
3468     // Adjust uBeginIdx and uLength, if necessary
3469     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3470 
3471     // Allocate memory for out_seq
3472     out_seq_data.resize(uLength);
3473 
3474     // Declare iterator for out_seq_data
3475     vector<char>::iterator i_out = out_seq_data.begin();
3476 
3477     // Declare iterator for in_seq_data and determine begin and end
3478     string::const_iterator i_in;
3479     string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3480     string::const_iterator i_in_end = i_in_begin + uLength;
3481 
3482     // Loop through input and convert to output
3483     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3484         (*(i_out++)) =
3485             m_IupacaaNcbistdaa->m_Table[static_cast<unsigned char>(*i_in)];
3486 
3487     return uLength;
3488 }
3489 
3490 
3491 
3492 
3493 
3494 // Function to convert ncbieaa (byte) to ncbistdaa (byte)
3495 TSeqPos CSeqportUtil_implementation::MapNcbieaaToNcbistdaa
3496 (const CSeq_data&   in_seq,
3497  CSeq_data*         out_seq,
3498  TSeqPos            uBeginIdx,
3499  TSeqPos            uLength)
3500     const
3501 {
3502     // Get read-only reference to in_seq data
3503     const string& in_seq_data = in_seq.GetNcbieaa().Get();
3504 
3505     // Get read & write reference to out_seq data
3506     out_seq->Reset();
3507     vector<char>& out_seq_data = out_seq->SetNcbistdaa().Set();
3508 
3509     // If uBeginIdx beyond end of in_seq, return
3510     if(uBeginIdx >= in_seq_data.size())
3511         return 0;
3512 
3513     // Adjust uBeginIdx and uLength, if necessary
3514     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3515 
3516     // Allocate memory for out_seq
3517     out_seq_data.resize(uLength);
3518 
3519     // Declare iterator for out_seq_data
3520     vector<char>::iterator i_out = out_seq_data.begin();
3521 
3522     // Declare iterator for in_seq_data and determine begin and end
3523     string::const_iterator i_in;
3524     string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3525     string::const_iterator i_in_end = i_in_begin + uLength;
3526 
3527     // Loop through input and convert to output
3528     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3529         (*(i_out++)) =
3530             m_NcbieaaNcbistdaa->m_Table[static_cast<unsigned char>(*i_in)];
3531 
3532     return uLength;
3533 }
3534 
3535 
3536 // Function to convert ncbistdaa (byte) to ncbieaa (byte)
3537 TSeqPos CSeqportUtil_implementation::MapNcbistdaaToNcbieaa
3538 (const CSeq_data&  in_seq,
3539  CSeq_data*        out_seq,
3540  TSeqPos           uBeginIdx,
3541  TSeqPos           uLength)
3542     const
3543 {
3544     // Get read-only reference to in_seq data
3545     const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
3546 
3547     // Get read & write reference to out_seq data
3548     out_seq->Reset();
3549     string& out_seq_data = out_seq->SetNcbieaa().Set();
3550 
3551     // If uBeginIdx beyond end of in_seq, return
3552     if(uBeginIdx >= in_seq_data.size())
3553         return 0;
3554 
3555     // Adjust uBeginIdx and uLength if necessary
3556     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3557 
3558     // Allocate memory for out_seq
3559     out_seq_data.resize(uLength);
3560 
3561     // Get iterator for out_seq_data
3562     string::iterator i_out = out_seq_data.begin();
3563 
3564     // Declare iterator for in_seq_data and determine begin and end
3565     vector<char>::const_iterator i_in;
3566     vector<char>::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3567     vector<char>::const_iterator i_in_end = i_in_begin + uLength;
3568 
3569     // Loop through input and convert to output
3570     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3571         *(i_out++) =
3572             m_NcbistdaaNcbieaa->m_Table[static_cast<unsigned char>(*i_in)];
3573 
3574     return uLength;
3575 }
3576 
3577 
3578 // Function to convert ncbistdaa (byte) to iupacaa (byte)
3579 TSeqPos CSeqportUtil_implementation::MapNcbistdaaToIupacaa
3580 (const CSeq_data&  in_seq,
3581  CSeq_data*        out_seq,
3582  TSeqPos           uBeginIdx,
3583  TSeqPos           uLength)
3584     const
3585 {
3586     // Get read-only reference to in_seq data
3587     const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
3588 
3589     // Get read & write reference to out_seq data
3590     out_seq->Reset();
3591     string& out_seq_data = out_seq->SetIupacaa().Set();
3592 
3593     // If uBeginIdx beyond end of in_seq, return
3594     if(uBeginIdx >= in_seq_data.size())
3595         return 0;
3596 
3597     // Adjust uBeginIdx and uLength
3598     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3599 
3600     // Allocate memory for out_seq
3601     out_seq_data.resize(uLength);
3602 
3603     // Get iterator for out_seq_data
3604     string::iterator i_out = out_seq_data.begin();
3605 
3606     // Declare iterator for in_seq_data and determine begin and end
3607     vector<char>::const_iterator i_in;
3608     vector<char>::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3609     vector<char>::const_iterator i_in_end = i_in_begin + uLength;
3610 
3611     // Loop through input and convert to output
3612     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3613         (*(i_out++)) =
3614             m_NcbistdaaIupacaa->m_Table[static_cast<unsigned char>(*i_in)];
3615 
3616     return uLength;
3617 }
3618 */
3619 
3620 // Fast validation of iupacna sequence
FastValidateIupacna(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3621 bool CSeqportUtil_implementation::FastValidateIupacna
3622 (const CSeq_data&  in_seq,
3623  TSeqPos           uBeginIdx,
3624  TSeqPos           uLength)
3625     const
3626 {
3627     // Get read-only reference to in_seq data
3628     const string& in_seq_data = in_seq.GetIupacna().Get();
3629 
3630     // Check that uBeginIdx is not beyond end of in_seq
3631     if(uBeginIdx >= in_seq_data.size())
3632         return true;
3633 
3634     // Adjust uBeginIdx, uLength
3635     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3636 
3637     // Declare in iterator on in_seq and determine begin and end
3638     string::const_iterator itor;
3639     string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3640     string::const_iterator e_itor = b_itor + uLength;
3641 
3642     // Perform Fast Validation
3643     unsigned char ch = '\x00';
3644     for(itor = b_itor; itor != e_itor; ++itor)
3645         ch |= m_Iupacna->m_Table[static_cast<unsigned char>(*itor)];
3646 
3647     // Return true if valid, otherwise false
3648     return (ch != 255);
3649 }
3650 
3651 
FastValidateNcbieaa(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3652 bool CSeqportUtil_implementation::FastValidateNcbieaa
3653 (const CSeq_data&  in_seq,
3654  TSeqPos           uBeginIdx,
3655  TSeqPos           uLength)
3656     const
3657 {
3658     // Get read-only reference to in_seq data
3659     const string& in_seq_data = in_seq.GetNcbieaa().Get();
3660 
3661     // Check that uBeginIdx is not beyond end of in_seq
3662     if(uBeginIdx >= in_seq_data.size())
3663         return true;
3664 
3665     // Adjust uBeginIdx, uLength
3666     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3667 
3668     // Declare in iterator on in_seq and determine begin and end
3669     string::const_iterator itor;
3670     string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3671     string::const_iterator e_itor = b_itor + uLength;
3672 
3673     // Perform Fast Validation
3674     unsigned char ch = '\x00';
3675     for(itor = b_itor; itor != e_itor; ++itor)
3676         ch |= m_Ncbieaa->m_Table[static_cast<unsigned char>(*itor)];
3677 
3678     // Return true if valid, otherwise false
3679     return (ch != 255);
3680 
3681 }
3682 
3683 
FastValidateNcbistdaa(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3684 bool CSeqportUtil_implementation::FastValidateNcbistdaa
3685 (const CSeq_data&  in_seq,
3686  TSeqPos           uBeginIdx,
3687  TSeqPos           uLength)
3688     const
3689 {
3690     // Get read-only reference to in_seq data
3691     const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
3692 
3693     // Check that uBeginIdx is not beyond end of in_seq
3694     if(uBeginIdx >= in_seq_data.size())
3695         return true;
3696 
3697     // Adjust uBeginIdx, uLength
3698     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3699 
3700     // Declare in iterator on in_seq and determine begin and end
3701     vector<char>::const_iterator itor;
3702     vector<char>::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3703     vector<char>::const_iterator e_itor = b_itor + uLength;
3704 
3705     // Perform Fast Validation
3706     unsigned char ch = '\x00';
3707     for(itor = b_itor; itor != e_itor; ++itor)
3708         ch |= m_Ncbistdaa->m_Table[static_cast<unsigned char>(*itor)];
3709 
3710     // Return true if valid, otherwise false
3711     return (ch != 255);
3712 
3713 }
3714 
3715 
FastValidateIupacaa(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3716 bool CSeqportUtil_implementation::FastValidateIupacaa
3717 (const CSeq_data&  in_seq,
3718  TSeqPos           uBeginIdx,
3719  TSeqPos           uLength)
3720     const
3721 {
3722     // Get read-only reference to in_seq data
3723     const string& in_seq_data = in_seq.GetIupacaa().Get();
3724 
3725     // Check that uBeginIdx is not beyond end of in_seq
3726     if(uBeginIdx >= in_seq_data.size())
3727         return true;
3728 
3729     // Adjust uBeginIdx, uLength
3730     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3731 
3732     // Declare in iterator on in_seq and determine begin and end
3733     string::const_iterator itor;
3734     string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3735     string::const_iterator e_itor = b_itor + uLength;
3736 
3737     // Perform Fast Validation
3738     unsigned char ch = '\x00';
3739     for(itor=b_itor; itor!=e_itor; ++itor)
3740         ch |= m_Iupacaa->m_Table[static_cast<unsigned char>(*itor)];
3741 
3742     // Return true if valid, otherwise false
3743     return (ch != 255);
3744 }
3745 
3746 
ValidateIupacna(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const3747 void CSeqportUtil_implementation::ValidateIupacna
3748 (const CSeq_data&   in_seq,
3749  vector<TSeqPos>*   badIdx,
3750  TSeqPos            uBeginIdx,
3751  TSeqPos            uLength)
3752     const
3753 {
3754     // Get read-only reference to in_seq data
3755     const string& in_seq_data = in_seq.GetIupacna().Get();
3756 
3757     // clear out_indices
3758     badIdx->clear();
3759 
3760     // Check that uBeginIdx is not beyond end of in_seq
3761     if(uBeginIdx >= in_seq_data.size())
3762         return;
3763 
3764     // Adjust uBeginIdx, uLength
3765     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3766 
3767     // Declare in iterator on in_seq and determine begin and end
3768     string::const_iterator itor;
3769     string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3770     string::const_iterator e_itor = b_itor + uLength;
3771 
3772     // Perform  Validation
3773     TSeqPos nIdx = uBeginIdx;
3774     for(itor = b_itor; itor != e_itor; ++itor)
3775         if(m_Iupacna->m_Table[static_cast<unsigned char>(*itor)] == char(255))
3776             badIdx->push_back(nIdx++);
3777         else
3778             nIdx++;
3779 
3780     // Return list of bad indices
3781     return;
3782 }
3783 
3784 
ValidateNcbieaa(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const3785 void CSeqportUtil_implementation::ValidateNcbieaa
3786 (const CSeq_data&   in_seq,
3787  vector<TSeqPos>*   badIdx,
3788  TSeqPos            uBeginIdx,
3789  TSeqPos            uLength)
3790     const
3791 {
3792     // Get read-only reference to in_seq data
3793     const string& in_seq_data = in_seq.GetNcbieaa().Get();
3794 
3795     // clear badIdx
3796     badIdx->clear();
3797 
3798     // Check that uBeginIdx is not beyond end of in_seq
3799     if(uBeginIdx >= in_seq_data.size())
3800         return;
3801 
3802     // Adjust uBeginIdx, uLength
3803     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3804 
3805     // Declare in iterator on in_seq and determine begin and end
3806     string::const_iterator itor;
3807     string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3808     string::const_iterator e_itor = b_itor + uLength;
3809 
3810     // Perform  Validation
3811     TSeqPos nIdx = uBeginIdx;
3812     for(itor = b_itor; itor != e_itor; ++itor)
3813         if(m_Ncbieaa->m_Table[static_cast<unsigned char>(*itor)] == char(255))
3814             badIdx->push_back(nIdx++);
3815         else
3816             nIdx++;
3817 
3818     // Return vector of bad indices
3819     return;
3820 }
3821 
3822 
ValidateNcbistdaa(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const3823 void CSeqportUtil_implementation::ValidateNcbistdaa
3824 (const CSeq_data&   in_seq,
3825  vector<TSeqPos>*   badIdx,
3826  TSeqPos            uBeginIdx,
3827  TSeqPos            uLength)
3828     const
3829 {
3830     // Get read-only reference to in_seq data
3831     const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
3832 
3833     // Create a vector to return
3834     badIdx->clear();
3835 
3836     // Check that uBeginIdx is not beyond end of in_seq
3837     if(uBeginIdx >= in_seq_data.size())
3838         return;
3839 
3840     // Adjust uBeginIdx, uLength
3841     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3842 
3843     // Declare in iterator on in_seq and determine begin and end
3844     vector<char>::const_iterator itor;
3845     vector<char>::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3846     vector<char>::const_iterator e_itor = b_itor + uLength;
3847 
3848     // Perform  Validation
3849     TSeqPos nIdx = uBeginIdx;
3850     for(itor=b_itor; itor!=e_itor; ++itor)
3851         if(m_Ncbistdaa->m_Table[static_cast<unsigned char>(*itor)]==char(255))
3852             badIdx->push_back(nIdx++);
3853         else
3854             nIdx++;
3855 
3856     // Return vector of bad indices
3857     return;
3858 }
3859 
3860 
ValidateIupacaa(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const3861 void CSeqportUtil_implementation::ValidateIupacaa
3862 (const CSeq_data&   in_seq,
3863  vector<TSeqPos>*   badIdx,
3864  TSeqPos            uBeginIdx,
3865  TSeqPos            uLength)
3866     const
3867 {
3868     // Get read-only reference to in_seq data
3869     const string& in_seq_data = in_seq.GetIupacaa().Get();
3870 
3871     // Create a vector to return
3872     badIdx->clear();
3873 
3874     // Check that uBeginIdx is not beyond end of in_seq
3875     if(uBeginIdx >= in_seq_data.size())
3876         return;
3877 
3878     // Adjust uBeginIdx, uLength
3879     Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3880 
3881     // Declare in iterator on in_seq and determine begin and end
3882     string::const_iterator itor;
3883     string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3884     string::const_iterator e_itor = b_itor + uLength;
3885 
3886     // Perform  Validation
3887     TSeqPos nIdx = uBeginIdx;
3888     for(itor=b_itor; itor!=e_itor; ++itor)
3889         if(m_Iupacaa->m_Table[static_cast<unsigned char>(*itor)] == char(255))
3890             badIdx->push_back(nIdx++);
3891         else
3892             nIdx++;
3893 
3894     // Return vector of bad indices
3895     return;
3896 }
3897 
3898 
3899 // Function to make copy of ncbi2na type sequences
GetNcbi2naCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3900 TSeqPos CSeqportUtil_implementation::GetNcbi2naCopy
3901 (const CSeq_data&  in_seq,
3902  CSeq_data*        out_seq,
3903  TSeqPos           uBeginIdx,
3904  TSeqPos           uLength)
3905     const
3906 {
3907     // Get reference to out_seq data
3908     out_seq->Reset();
3909     vector<char>& out_seq_data = out_seq->SetNcbi2na().Set();
3910 
3911     // Get reference to in_seq data
3912     const vector<char>& in_seq_data = in_seq.GetNcbi2na().Get();
3913 
3914     // Return if uBeginIdx is after end of in_seq
3915     if(uBeginIdx >= 4 * in_seq_data.size())
3916         return 0;
3917 
3918     // Set uLength to actual valid length in out_seq
3919     if( (uLength ==0) || ((uBeginIdx + uLength) > (4*in_seq_data.size() )) )
3920         uLength = 4*in_seq_data.size() - uBeginIdx;
3921 
3922     // Allocate memory for out_seq data
3923     if((uLength % 4) == 0)
3924         out_seq_data.resize(uLength/4);
3925     else
3926         out_seq_data.resize(uLength/4 + 1);
3927 
3928     // Get iterator on out_seq_data
3929     vector<char>::iterator i_out = out_seq_data.begin() - 1;
3930 
3931     // Calculate amounts to shift bits
3932     unsigned int lShift, rShift;
3933     lShift = 2*(uBeginIdx % 4);
3934     rShift = 8 - lShift;
3935 
3936     // Get interators on in_seq
3937     vector<char>::const_iterator i_in;
3938     vector<char>::const_iterator i_in_begin =
3939         in_seq_data.begin() + uBeginIdx/4;
3940 
3941     // Determine number of input bytes to process
3942     SIZE_TYPE uNumBytes = uLength/4;
3943     if((uLength % 4) != 0)
3944         ++uNumBytes;
3945 
3946     // Prevent access beyond end of in_seq_data
3947     bool bDoLastByte = false;
3948     if((uBeginIdx/4 + uNumBytes) >= in_seq_data.size())
3949         {
3950             uNumBytes = in_seq_data.size() - uBeginIdx/4 - 1;
3951             bDoLastByte = true;
3952         }
3953     vector<char>::const_iterator i_in_end = i_in_begin + uNumBytes;
3954 
3955     // Loop through input sequence and copy to output sequence
3956     if(lShift > 0)
3957         for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3958             (*(++i_out)) =
3959                 ((*i_in) << lShift) | (((*(i_in+1)) & 255) >> rShift);
3960     else
3961         for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3962             (*(++i_out)) = (*i_in);
3963 
3964     // Handle last input byte if necessary
3965     if(bDoLastByte)
3966         (*(++i_out)) = (*i_in) << lShift;
3967 
3968     return uLength;
3969 }
3970 
3971 
3972 // Function to make copy of ncbi4na type sequences
GetNcbi4naCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3973 TSeqPos CSeqportUtil_implementation::GetNcbi4naCopy
3974 (const CSeq_data&  in_seq,
3975  CSeq_data*        out_seq,
3976  TSeqPos           uBeginIdx,
3977  TSeqPos           uLength)
3978     const
3979 {
3980     // Get reference to out_seq data
3981     out_seq->Reset();
3982     vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
3983 
3984     // Get reference to in_seq data
3985     const vector<char>& in_seq_data = in_seq.GetNcbi4na().Get();
3986 
3987     // Return if uBeginIdx is after end of in_seq
3988     if(uBeginIdx >= 2 * in_seq_data.size())
3989         return 0;
3990 
3991     // Set uLength to actual valid length in out_seq
3992     if( (uLength ==0) || ((uBeginIdx + uLength) > (2*in_seq_data.size() )) )
3993         uLength = 2*in_seq_data.size() - uBeginIdx;
3994 
3995     // Allocate memory for out_seq data
3996     if((uLength % 2) == 0)
3997         out_seq_data.resize(uLength/2);
3998     else
3999         out_seq_data.resize(uLength/2 + 1);
4000 
4001 
4002     // Get iterator on out_seq_data
4003     vector<char>::iterator i_out = out_seq_data.begin() - 1;
4004 
4005     // Calculate amounts to shift bits
4006     unsigned int lShift, rShift;
4007     lShift = 4*(uBeginIdx % 2);
4008     rShift = 8 - lShift;
4009 
4010     // Get interators on in_seq
4011     vector<char>::const_iterator i_in;
4012     vector<char>::const_iterator i_in_begin =
4013         in_seq_data.begin() + uBeginIdx/2;
4014 
4015     // Determine number of input bytes to process
4016     SIZE_TYPE uNumBytes = uLength/2;
4017     if((uLength % 2) != 0)
4018         ++uNumBytes;
4019 
4020     // Prevent access beyond end of in_seq_data
4021     bool bDoLastByte = false;
4022     if((uBeginIdx/2 + uNumBytes) >= in_seq_data.size())
4023         {
4024             uNumBytes = in_seq_data.size() - uBeginIdx/2 - 1;
4025             bDoLastByte = true;
4026         }
4027     vector<char>::const_iterator i_in_end = i_in_begin + uNumBytes;
4028 
4029     // Loop through input sequence and copy to output sequence
4030     if(lShift > 0)
4031         for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4032             (*(++i_out)) =
4033                 ((*i_in) << lShift) | (((*(i_in+1)) & 255) >> rShift);
4034     else
4035         for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4036             (*(++i_out)) = (*i_in);
4037 
4038     // Handle last input byte
4039     if(bDoLastByte)
4040         (*(++i_out)) = (*i_in) << lShift;
4041 
4042     return uLength;
4043 }
4044 
4045 
4046 // Function to make copy of iupacna type sequences
GetIupacnaCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4047 TSeqPos CSeqportUtil_implementation::GetIupacnaCopy
4048 (const CSeq_data& in_seq,
4049  CSeq_data*       out_seq,
4050  TSeqPos          uBeginIdx,
4051  TSeqPos          uLength)
4052     const
4053 {
4054     // Get reference to out_seq data
4055     out_seq->Reset();
4056     string& out_seq_data = out_seq->SetIupacna().Set();
4057 
4058     // Get reference to in_seq data
4059     const string& in_seq_data = in_seq.GetIupacna().Get();
4060 
4061     // Return if uBeginIdx is after end of in_seq
4062     if(uBeginIdx >= in_seq_data.size())
4063         return 0;
4064 
4065     // Set uLength to actual valid length in out_seq
4066     if( (uLength ==0) || ((uBeginIdx + uLength) > (in_seq_data.size() )) )
4067         uLength = in_seq_data.size() - uBeginIdx;
4068 
4069     // Allocate memory for out_seq data
4070     out_seq_data.resize(uLength);
4071 
4072     // Get iterator on out_seq_data
4073     string::iterator i_out = out_seq_data.begin() - 1;
4074 
4075     // Get interators on in_seq
4076     string::const_iterator i_in;
4077     string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4078     string::const_iterator i_in_end = i_in_begin + uLength;
4079 
4080     // Loop through input sequence and copy to output sequence
4081     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4082         (*(++i_out)) = (*i_in);
4083 
4084     return uLength;
4085 }
4086 
4087 
4088 // Function to make copy of ncbieaa type sequences
GetNcbieaaCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4089 TSeqPos CSeqportUtil_implementation::GetNcbieaaCopy
4090 (const CSeq_data&  in_seq,
4091  CSeq_data*        out_seq,
4092  TSeqPos           uBeginIdx,
4093  TSeqPos           uLength)
4094     const
4095 {
4096     // Get reference to out_seq data
4097     out_seq->Reset();
4098     string& out_seq_data = out_seq->SetNcbieaa().Set();
4099 
4100     // Get reference to in_seq data
4101     const string& in_seq_data = in_seq.GetNcbieaa().Get();
4102 
4103     // Return if uBeginIdx is after end of in_seq
4104     if(uBeginIdx >= in_seq_data.size())
4105         return 0;
4106 
4107     // Set uLength to actual valid length in out_seq
4108     if( (uLength ==0) || ((uBeginIdx + uLength) > (in_seq_data.size() )) )
4109         uLength = in_seq_data.size() - uBeginIdx;
4110 
4111     // Allocate memory for out_seq data
4112     out_seq_data.resize(uLength);
4113 
4114     // Get iterator on out_seq_data
4115     string::iterator i_out = out_seq_data.begin() - 1;
4116 
4117     // Get interators on in_seq
4118     string::const_iterator i_in;
4119     string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4120     string::const_iterator i_in_end = i_in_begin + uLength;
4121 
4122     // Loop through input sequence and copy to output sequence
4123     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4124         (*(++i_out)) = (*i_in);
4125 
4126     return uLength;
4127 }
4128 
4129 
4130 // Function to make copy of ncbistdaa type sequences
GetNcbistdaaCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4131 TSeqPos CSeqportUtil_implementation::GetNcbistdaaCopy
4132 (const CSeq_data&  in_seq,
4133  CSeq_data*        out_seq,
4134  TSeqPos           uBeginIdx,
4135  TSeqPos           uLength)
4136     const
4137 {
4138     // Get reference to out_seq data
4139     out_seq->Reset();
4140     vector<char>& out_seq_data = out_seq->SetNcbistdaa().Set();
4141 
4142     // Get reference to in_seq data
4143     const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
4144 
4145     // Return if uBeginIdx is after end of in_seq
4146     if(uBeginIdx >= in_seq_data.size())
4147         return 0;
4148 
4149     // Set uLength to actual valid length in out_seq
4150     if( (uLength ==0) || ((uBeginIdx + uLength) > (in_seq_data.size() )) )
4151         uLength = in_seq_data.size() - uBeginIdx;
4152 
4153     // Allocate memory for out_seq data
4154     out_seq_data.resize(uLength);
4155 
4156     // Get iterator on out_seq_data
4157     vector<char>::iterator i_out = out_seq_data.begin() - 1;
4158 
4159     // Get interators on in_seq
4160     vector<char>::const_iterator i_in;
4161     vector<char>::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4162     vector<char>::const_iterator i_in_end = i_in_begin + uLength;
4163 
4164     // Loop through input sequence and copy to output sequence
4165     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4166         (*(++i_out)) = (*i_in);
4167 
4168     return uLength;
4169 }
4170 
4171 
4172 // Function to make copy of iupacaa type sequences
GetIupacaaCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4173 TSeqPos CSeqportUtil_implementation::GetIupacaaCopy
4174 (const CSeq_data&  in_seq,
4175  CSeq_data*        out_seq,
4176  TSeqPos           uBeginIdx,
4177  TSeqPos           uLength)
4178     const
4179 {
4180     // Get reference to out_seq data
4181     out_seq->Reset();
4182     string& out_seq_data = out_seq->SetIupacaa().Set();
4183 
4184     // Get reference to in_seq data
4185     const string& in_seq_data = in_seq.GetIupacaa().Get();
4186 
4187     // Return if uBeginIdx is after end of in_seq
4188     if(uBeginIdx >= in_seq_data.size())
4189         return 0;
4190 
4191     // Set uLength to actual valid length in out_seq
4192     if( (uLength ==0) || ((uBeginIdx + uLength) > (in_seq_data.size() )) )
4193         uLength = in_seq_data.size() - uBeginIdx;
4194 
4195     // Allocate memory for out_seq data
4196     out_seq_data.resize(uLength);
4197 
4198     // Get iterator on out_seq_data
4199     string::iterator i_out = out_seq_data.begin() - 1;
4200 
4201     // Get interators on in_seq
4202     string::const_iterator i_in;
4203     string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4204     string::const_iterator i_in_end = i_in_begin + uLength;
4205 
4206     // Loop through input sequence and copy to output sequence
4207     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4208         (*(++i_out)) = (*i_in);
4209 
4210     return uLength;
4211 }
4212 
4213 
4214 // Function to adjust uBeginIdx to lie on an in_seq byte boundary
4215 // and uLength to lie on on an out_seq byte boundary. Returns
4216 // overhang
Adjust(TSeqPos * uBeginIdx,TSeqPos * uLength,TSeqPos uInSeqBytes,TSeqPos uInSeqsPerByte,TSeqPos uOutSeqsPerByte) const4217 TSeqPos CSeqportUtil_implementation::Adjust
4218 (TSeqPos*  uBeginIdx,
4219  TSeqPos*  uLength,
4220  TSeqPos   uInSeqBytes,
4221  TSeqPos   uInSeqsPerByte,
4222  TSeqPos   uOutSeqsPerByte)
4223     const
4224 {
4225     // Adjust uBeginIdx and uLength to acceptable values
4226 
4227     // If uLength = 0, assume convert to end of sequence
4228     if(*uLength == 0)
4229         *uLength = uInSeqsPerByte * uInSeqBytes;
4230 
4231     // Ensure that uBeginIdx does not start at or after end of in_seq_data
4232     if(*uBeginIdx >= uInSeqsPerByte * uInSeqBytes)
4233         *uBeginIdx = uInSeqsPerByte * uInSeqBytes - uInSeqsPerByte;
4234 
4235     // Ensure that uBeginIdx is a multiple of uInSeqsPerByte and adjust uLength
4236     *uLength += *uBeginIdx % uInSeqsPerByte;
4237     *uBeginIdx = uInSeqsPerByte * (*uBeginIdx/uInSeqsPerByte);
4238 
4239     // Adjust uLength so as not to go beyond end of in_seq_data
4240     if(*uLength > uInSeqsPerByte * uInSeqBytes - *uBeginIdx)
4241         *uLength = uInSeqsPerByte * uInSeqBytes - *uBeginIdx;
4242 
4243     // Adjust uLength down to multiple of uOutSeqsPerByte
4244     // and calculate overhang (overhang handled separately at end)
4245     TSeqPos uOverhang = *uLength % uOutSeqsPerByte;
4246     *uLength = uOutSeqsPerByte * (*uLength / uOutSeqsPerByte);
4247 
4248     return uOverhang;
4249 
4250 }
4251 
4252 
4253 // Loops through an ncbi4na input sequence and determines
4254 // the ambiguities that would result from conversion to an ncbi2na sequence
4255 // On return, out_seq contains the ncbi4na bases that become ambiguous and
4256 // out_indices contains the indices of the abiguous bases in in_seq
GetAmbigs_ncbi4na_ncbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,vector<TSeqPos> * out_indices,TSeqPos uBeginIdx,TSeqPos uLength) const4257 TSeqPos CSeqportUtil_implementation::GetAmbigs_ncbi4na_ncbi2na
4258 (const CSeq_data&   in_seq,
4259  CSeq_data*         out_seq,
4260  vector<TSeqPos>*   out_indices,
4261  TSeqPos            uBeginIdx,
4262  TSeqPos            uLength)
4263     const
4264 {
4265     // Get read-only reference to in_seq data
4266     const vector<char>& in_seq_data = in_seq.GetNcbi4na().Get();
4267 
4268     // Get read & write reference to out_seq data
4269     out_seq->Reset();
4270     vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
4271 
4272     // Adjust uBeginIdx and uLength, if necessary
4273     if(uBeginIdx >= 2*in_seq_data.size())
4274         return 0;
4275 
4276     if((uLength == 0) || (((uBeginIdx + uLength) > 2*in_seq_data.size())))
4277         uLength = 2*in_seq_data.size() - uBeginIdx;
4278 
4279     // Save uBeginIdx and adjust uBeginIdx = 0 mod 2
4280     TSeqPos uBeginSav = uBeginIdx;
4281     TSeqPos uLenSav = uLength;
4282     uLength += uBeginIdx % 2;
4283     uBeginIdx = 2*(uBeginIdx/2);
4284 
4285     // Allocate memory for out_seq_data and out_indices
4286     // Note, these will be shrunk at the end to correspond
4287     // to actual memory needed.  Note, in test cases, over 50% of the
4288     // time spent in this method is spent in the next two
4289     // statements and 3/4 of that is spent in the second statement.
4290     out_seq_data.resize(uLength/2 + (uLength % 2));
4291     out_indices->resize(uLength);
4292 
4293     // Variable to track number of ambigs
4294     TSeqPos uNumAmbigs = 0;
4295 
4296     // Get iterators to input sequence
4297     vector<char>::const_iterator i_in;
4298     vector<char>::const_iterator i_in_begin =
4299         in_seq_data.begin() + uBeginIdx/2;
4300     vector<char>::const_iterator i_in_end =
4301         i_in_begin + uLength/2 + (uLength % 2);
4302 
4303     // Get iterators to out_seq_data and out_indices
4304     vector<char>::iterator i_out_seq = out_seq_data.begin();
4305     vector<TSeqPos>::iterator i_out_idx = out_indices->begin();
4306 
4307     // Index of current input seq base
4308     TSeqPos uIdx = uBeginIdx;
4309 
4310     // Loop through input sequence looking for ambiguities
4311     for(i_in = i_in_begin; i_in != i_in_end; ++i_in) {
4312         switch (m_DetectAmbigNcbi4naNcbi2na->m_Table
4313                 [static_cast<unsigned char>(*i_in)]) {
4314 
4315         case 1:    // Low order input nible ambiguous
4316 
4317             // Put low order input nible in low order output nible
4318             if(uNumAmbigs & 1)
4319                 {
4320                     (*i_out_seq) |= (*i_in) & '\x0f';
4321                     ++i_out_seq;
4322                 }
4323 
4324             // Put low order input nible in high order output nible
4325             else
4326                 (*i_out_seq) = (*i_in) << 4;
4327 
4328             // Record input index that was ambiguous
4329             (*i_out_idx) = uIdx + 1;
4330             ++i_out_idx;
4331 
4332             // Increment number of ambiguities
4333             uNumAmbigs++;
4334             break;
4335 
4336         case 2:    // High order input nible ambiguous
4337 
4338             // Put high order input nible in low order output nible
4339             if(uNumAmbigs & 1)
4340                 {
4341                     (*i_out_seq) |= ((*i_in) >> 4) & '\x0f';
4342                     ++i_out_seq;
4343                 }
4344 
4345             // Put high order input nible in high order output nible
4346             else
4347                 (*i_out_seq) = (*i_in) & '\xf0';
4348 
4349             // Record input index that was ambiguous
4350             (*i_out_idx) = uIdx;
4351             ++i_out_idx;
4352 
4353             // Increment number of ambiguities
4354             uNumAmbigs++;
4355             break;
4356 
4357         case 3:    // Both input nibles ambiguous
4358 
4359             // Put high order input nible in low order
4360             // output nible, move to the next output byte
4361             // and put the low order input nibble in the
4362             // high order output nible.
4363             if(uNumAmbigs & 1)
4364                 {
4365                     (*i_out_seq) |= ((*i_in) >> 4) & '\x0f';
4366                     (*(++i_out_seq)) = (*i_in) << 4;
4367                 }
4368 
4369             // Put high order input nible in high order
4370             // output nible, put low order input nible
4371             // in low order output nible, and move to
4372             // next output byte
4373             else
4374                 {
4375                     (*i_out_seq) = (*i_in);
4376                     ++i_out_seq;
4377                 }
4378 
4379             // Record indices that were ambiguous
4380             (*i_out_idx) = uIdx;
4381             (*(++i_out_idx)) = uIdx + 1;
4382             ++i_out_idx;
4383 
4384             // Increment the number of ambiguities
4385             uNumAmbigs+=2;
4386             break;
4387         }
4388 
4389         // Increment next input byte.
4390         uIdx += 2;
4391     }
4392 
4393     // Shrink out_seq_data and out_indices to actual sizes needed
4394     out_indices->resize(uNumAmbigs);
4395     out_seq_data.resize(uNumAmbigs/2 + uNumAmbigs % 2);
4396 
4397     // Check to ensure that ambigs outside of requested range are not included
4398     TSeqPos uKeepBeg = 0;
4399     TSeqPos uKeepLen = 0;
4400     if((*out_indices)[0] < uBeginSav)
4401         {
4402             uKeepBeg = 1;
4403             out_indices->erase(out_indices->begin(), out_indices->begin() + 1);
4404         }
4405 
4406     if((*out_indices)[out_indices->size()-1] >= uBeginSav + uLenSav)
4407         {
4408             out_indices->pop_back();
4409             uKeepLen = out_indices->size();
4410         }
4411 
4412     if((uKeepBeg != 0) || (uKeepLen != 0))
4413         uNumAmbigs = KeepNcbi4na(out_seq, uKeepBeg, uKeepLen);
4414 
4415     return uNumAmbigs;
4416 }
4417 
4418 
4419 // Loops through an iupacna input sequence and determines
4420 // the ambiguities that would result from conversion to an ncbi2na sequence.
4421 // On return, out_seq contains the iupacna bases that become ambiguous and
4422 // out_indices contains the indices of the abiguous bases in in_seq. The
4423 // return is the number of ambiguities found.
GetAmbigs_iupacna_ncbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,vector<TSeqPos> * out_indices,TSeqPos uBeginIdx,TSeqPos uLength) const4424 TSeqPos CSeqportUtil_implementation::GetAmbigs_iupacna_ncbi2na
4425 (const CSeq_data&   in_seq,
4426  CSeq_data*         out_seq,
4427  vector<TSeqPos>*   out_indices,
4428  TSeqPos            uBeginIdx,
4429  TSeqPos            uLength)
4430     const
4431 {
4432     // Get read-only reference to in_seq data
4433     const string& in_seq_data = in_seq.GetIupacna().Get();
4434 
4435     // Get read & write reference to out_seq data
4436     out_seq->Reset();
4437     string& out_seq_data = out_seq->SetIupacna().Set();
4438 
4439     // Validate/adjust uBeginIdx and uLength
4440     if(uBeginIdx >= in_seq_data.size())
4441         return 0;
4442 
4443     if((uLength == 0) || ((uBeginIdx + uLength) > in_seq_data.size()))
4444         uLength = in_seq_data.size() - uBeginIdx;
4445 
4446     // Allocate memory for out_seq_data and out_indices
4447     // Note, these will be shrunk at the end to correspond
4448     // to actual memory needed.
4449     out_seq_data.resize(uLength);
4450     out_indices->resize(uLength);
4451 
4452     // Variable to track number of ambigs
4453     TSeqPos uNumAmbigs = 0;
4454 
4455     // Get iterators to input sequence
4456     string::const_iterator i_in;
4457     string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4458     string::const_iterator i_in_end = i_in_begin + uLength;
4459 
4460     // Get iterators to out_seq_data and out_indices
4461     string::iterator i_out_seq = out_seq_data.begin();
4462     vector<TSeqPos>::iterator i_out_idx = out_indices->begin();
4463 
4464     // Index of current input seq base
4465     TSeqPos uIdx = uBeginIdx;
4466 
4467     // Loop through input sequence looking for ambiguities
4468     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4469         {
4470             if(m_DetectAmbigIupacnaNcbi2na->m_Table
4471                [static_cast<unsigned char>(*i_in)] == 1)
4472                 {
4473                     (*i_out_seq) = (*i_in);
4474                     ++i_out_seq;
4475                     (*i_out_idx) = uIdx;
4476                     ++i_out_idx;
4477                     ++uNumAmbigs;
4478                 }
4479 
4480             ++uIdx;
4481         }
4482 
4483     out_seq_data.resize(uNumAmbigs);
4484     out_indices->resize(uNumAmbigs);
4485 
4486     return uNumAmbigs;
4487 }
4488 
4489 
4490 // Method to implement Keep for Ncbi2na. Returns length of
4491 // kept sequence
KeepNcbi2na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4492 TSeqPos CSeqportUtil_implementation::KeepNcbi2na
4493 (CSeq_data*   in_seq,
4494  TSeqPos      uBeginIdx,
4495  TSeqPos      uLength)
4496     const
4497 {
4498     // Get a reference to in_seq
4499     vector<char>& in_seq_data = in_seq->SetNcbi2na().Set();
4500 
4501     // If uBeginIdx past the end of in_seq, return empty in_seq
4502     if(uBeginIdx >= in_seq_data.size()*4)
4503         {
4504             in_seq_data.clear();
4505             return 0;
4506         }
4507 
4508     // If uLength == 0, Keep from uBeginIdx to end of in_seq
4509     if(uLength == 0)
4510         uLength = 4*in_seq_data.size() - uBeginIdx;
4511 
4512 
4513     // If uLength goes beyond the end of the sequence, trim
4514     // it back to the end of the sequence
4515     if(uLength > (4*in_seq_data.size() - uBeginIdx))
4516         uLength = 4*in_seq_data.size() - uBeginIdx;
4517 
4518     // If entire sequence is being requested, just return
4519     if((uBeginIdx == 0) && (uLength >= 4*in_seq_data.size()))
4520         return uLength;
4521 
4522     // Determine index in in_seq_data that holds uBeginIdx residue
4523     TSeqPos uStart = uBeginIdx/4;
4524 
4525     // Determine index within start byte
4526     TSeqPos uStartInByte = 2 * (uBeginIdx % 4);
4527 
4528     // Calculate masks
4529     unsigned char rightMask = 0xff << uStartInByte;
4530     unsigned char leftMask = ~rightMask;
4531 
4532     // Determine index in in_seq_data that holds uBeginIdx + uLength
4533     // residue
4534     TSeqPos uEnd = (uBeginIdx + uLength - 1)/4;
4535 
4536     // Get iterator for writting
4537     vector<char>::iterator i_write;
4538 
4539     // Determine begin and end of read
4540     vector<char>::iterator i_read = in_seq_data.begin() + uStart;
4541     vector<char>::iterator i_read_end = in_seq_data.begin() + uEnd;
4542 
4543     // Loop through in_seq_data and copy data of desire
4544     // sub sequence to begining of in_seq_data
4545     for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4546         (*i_write) = (((*i_read) << uStartInByte) | leftMask) &
4547             (((*(i_read+1)) >> (8-uStartInByte)) | rightMask);
4548         ++i_read;
4549     }
4550 
4551     // Handle last byte
4552     (*i_write) = (*i_read) << uStartInByte;
4553 
4554     // Shrink in_seq to to size needed
4555     TSeqPos uSize = uLength/4;
4556     if((uLength % 4) != 0)
4557         uSize++;
4558     in_seq_data.resize(uSize);
4559 
4560     return uLength;
4561 }
4562 
4563 
4564 // Method to implement Keep for Ncbi4na. Returns length of
4565 // kept sequence.
KeepNcbi4na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4566 TSeqPos CSeqportUtil_implementation::KeepNcbi4na
4567 (CSeq_data*   in_seq,
4568  TSeqPos      uBeginIdx,
4569  TSeqPos      uLength)
4570     const
4571 {
4572     // Get a reference to in_seq
4573     vector<char>& in_seq_data = in_seq->SetNcbi4na().Set();
4574 
4575     // If uBeginIdx past the end of in_seq, return empty in_seq
4576     if(uBeginIdx >= in_seq_data.size()*2)
4577         {
4578             in_seq_data.clear();
4579             return 0;
4580         }
4581 
4582     // If uLength == 0, Keep from uBeginIdx to end of in_seq
4583     if(uLength == 0)
4584         uLength = 2*in_seq_data.size() - uBeginIdx;
4585 
4586 
4587     // If uLength goes beyond the end of the sequence, trim
4588     // it back to the end of the sequence
4589     if(uLength > (2*in_seq_data.size() - uBeginIdx))
4590         uLength = 2*in_seq_data.size() - uBeginIdx;
4591 
4592     // If entire sequence is being requested, just return
4593     if((uBeginIdx == 0) && (uLength >= 2*in_seq_data.size()))
4594         return uLength;
4595 
4596     // Determine index in in_seq_data that holds uBeginIdx residue
4597     TSeqPos uStart = uBeginIdx/2;
4598 
4599     // Determine index within start byte
4600     unsigned int uStartInByte = 4 * (uBeginIdx % 2);
4601 
4602     // Calculate masks
4603     unsigned char rightMask = 0xff << uStartInByte;
4604     unsigned char leftMask = ~rightMask;
4605 
4606     // Determine index in in_seq_data that holds uBeginIdx + uLength
4607     // residue
4608     TSeqPos uEnd = (uBeginIdx + uLength - 1)/2;
4609 
4610     // Get iterator for writting
4611     vector<char>::iterator i_write;
4612 
4613     // Determine begin and end of read
4614     vector<char>::iterator i_read = in_seq_data.begin() + uStart;
4615     vector<char>::iterator i_read_end = in_seq_data.begin() + uEnd;
4616 
4617     // Loop through in_seq_data and copy data of desire
4618     // sub sequence to begining of in_seq_data
4619     for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4620         (*i_write) = (((*i_read) << uStartInByte) | leftMask) &
4621             (((*(i_read+1)) >> (8-uStartInByte)) | rightMask);
4622         ++i_read;
4623     }
4624 
4625     // Handle last byte
4626     (*i_write) = (*i_read) << uStartInByte;
4627 
4628     // Shrink in_seq to to size needed
4629     TSeqPos uSize = uLength/2;
4630     if((uLength % 2) != 0)
4631         uSize++;
4632     in_seq_data.resize(uSize);
4633 
4634     return uLength;
4635 }
4636 
4637 
4638 // Method to implement Keep for Iupacna. Return length
4639 // of kept sequence
KeepIupacna(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4640 TSeqPos CSeqportUtil_implementation::KeepIupacna
4641 (CSeq_data*   in_seq,
4642  TSeqPos      uBeginIdx,
4643  TSeqPos      uLength)
4644     const
4645 {
4646     // Get a reference to in_seq
4647     string& in_seq_data = in_seq->SetIupacna().Set();
4648 
4649 
4650     // If uBeginIdx past end of in_seq, return empty in_seq
4651     if(uBeginIdx >= in_seq_data.size())
4652         {
4653             in_seq_data.erase();
4654             return 0;
4655         }
4656 
4657     // If uLength is 0, Keep from uBeginIdx to end of in_seq
4658     if(uLength == 0)
4659         uLength = in_seq_data.size() - uBeginIdx;
4660 
4661     // Check that uLength does not go beyond end of in_seq
4662     if((uBeginIdx + uLength) > in_seq_data.size())
4663         uLength = in_seq_data.size() - uBeginIdx;
4664 
4665     // If uBeginIdx == 0 and uLength == in_seq_data.size()
4666     // just return as the entire sequence is being requested
4667     if((uBeginIdx == 0) && (uLength >= in_seq_data.size()))
4668         return uLength;
4669 
4670     // Get two iterators on in_seq, one read and one write
4671     string::iterator i_read;
4672     string::iterator i_write;
4673 
4674     // Determine begin and end of read
4675     i_read = in_seq_data.begin() + uBeginIdx;
4676     string::iterator i_read_end = i_read + uLength;
4677 
4678     // Loop through in_seq for uLength bases
4679     // and shift uBeginIdx to beginning
4680     for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write)
4681         {
4682             (*i_write) = (*i_read);
4683             ++i_read;
4684         }
4685 
4686     // Resize in_seq_data to uLength
4687     in_seq_data.resize(uLength);
4688 
4689     return uLength;
4690 }
4691 
4692 
4693 // Method to implement Keep for Ncbieaa
KeepNcbieaa(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4694 TSeqPos CSeqportUtil_implementation::KeepNcbieaa
4695 (CSeq_data*   in_seq,
4696  TSeqPos      uBeginIdx,
4697  TSeqPos      uLength)
4698     const
4699 {
4700     // Get a reference to in_seq
4701     string& in_seq_data = in_seq->SetNcbieaa().Set();
4702 
4703 
4704     // If uBeginIdx past end of in_seq, return empty in_seq
4705     if(uBeginIdx >= in_seq_data.size())
4706         {
4707             in_seq_data.erase();
4708             return 0;
4709         }
4710 
4711     // If uLength is 0, Keep from uBeginIdx to end of in_seq
4712     if(uLength == 0)
4713         uLength = in_seq_data.size() - uBeginIdx;
4714 
4715     // Check that uLength does not go beyond end of in_seq
4716     if((uBeginIdx + uLength) > in_seq_data.size())
4717         uLength = in_seq_data.size() - uBeginIdx;
4718 
4719     // If uBeginIdx == 0 and uLength == in_seq_data.size()
4720     // just return as the entire sequence is being requested
4721     if((uBeginIdx == 0) && (uLength >= in_seq_data.size()))
4722         return uLength;
4723 
4724     // Get two iterators on in_seq, one read and one write
4725     string::iterator i_read;
4726     string::iterator i_write;
4727 
4728     // Determine begin and end of read
4729     i_read = in_seq_data.begin() + uBeginIdx;
4730     string::iterator i_read_end = i_read + uLength;
4731 
4732     // Loop through in_seq for uLength bases
4733     // and shift uBeginIdx to beginning
4734     for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4735         (*i_write) = (*i_read);
4736         ++i_read;
4737     }
4738 
4739     // Resize in_seq_data to uLength
4740     in_seq_data.resize(uLength);
4741 
4742     return uLength;
4743 }
4744 
4745 
4746 // Method to implement Keep for Ncbistdaa
KeepNcbistdaa(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4747 TSeqPos CSeqportUtil_implementation::KeepNcbistdaa
4748 (CSeq_data*   in_seq,
4749  TSeqPos      uBeginIdx,
4750  TSeqPos      uLength)
4751     const
4752 {
4753     // Get a reference to in_seq
4754     vector<char>& in_seq_data = in_seq->SetNcbistdaa().Set();
4755 
4756     // If uBeginIdx past end of in_seq, return empty in_seq
4757     if(uBeginIdx >= in_seq_data.size())
4758         {
4759             in_seq_data.clear();
4760             return 0;
4761         }
4762 
4763     // If uLength is 0, Keep from uBeginIdx to end of in_seq
4764     if(uLength == 0)
4765         uLength = in_seq_data.size() - uBeginIdx;
4766 
4767     // Check that uLength does not go beyond end of in_seq
4768     if((uBeginIdx + uLength) > in_seq_data.size())
4769         uLength = in_seq_data.size() - uBeginIdx;
4770 
4771     // If uBeginIdx == 0 and uLength == in_seq_data.size()
4772     // just return as the entire sequence is being requested
4773     if((uBeginIdx == 0) && (uLength >= in_seq_data.size()))
4774         return uLength;
4775 
4776     // Get two iterators on in_seq, one read and one write
4777     vector<char>::iterator i_read;
4778     vector<char>::iterator i_write;
4779 
4780     // Determine begin and end of read
4781     i_read = in_seq_data.begin() + uBeginIdx;
4782     vector<char>::iterator i_read_end = i_read + uLength;
4783 
4784     // Loop through in_seq for uLength bases
4785     // and shift uBeginIdx to beginning
4786     for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4787         (*i_write) = (*i_read);
4788         ++i_read;
4789     }
4790 
4791     // Resize in_seq_data to uLength
4792     in_seq_data.resize(uLength);
4793 
4794     return uLength;
4795 }
4796 
4797 
4798 // Method to implement Keep for Iupacaa
KeepIupacaa(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4799 TSeqPos CSeqportUtil_implementation::KeepIupacaa
4800 (CSeq_data*   in_seq,
4801  TSeqPos      uBeginIdx,
4802  TSeqPos      uLength)
4803     const
4804 {
4805     // Get a reference to in_seq
4806     string& in_seq_data = in_seq->SetIupacaa().Set();
4807 
4808 
4809     // If uBeginIdx past end of in_seq, return empty in_seq
4810     if (uBeginIdx >= in_seq_data.size()) {
4811         in_seq_data.erase();
4812         return 0;
4813     }
4814 
4815     // If uLength is 0, Keep from uBeginIdx to end of in_seq
4816     if(uLength == 0)
4817         uLength = in_seq_data.size() - uBeginIdx;
4818 
4819     // Check that uLength does not go beyond end of in_seq
4820     if((uBeginIdx + uLength) > in_seq_data.size())
4821         uLength = in_seq_data.size() - uBeginIdx;
4822 
4823     // If uBeginIdx == 0 and uLength == in_seq_data.size()
4824     // just return as the entire sequence is being requested
4825     if((uBeginIdx == 0) && (uLength >= in_seq_data.size()))
4826         return uLength;
4827 
4828     // Get two iterators on in_seq, one read and one write
4829     string::iterator i_read;
4830     string::iterator i_write;
4831 
4832     // Determine begin and end of read
4833     i_read = in_seq_data.begin() + uBeginIdx;
4834     string::iterator i_read_end = i_read + uLength;
4835 
4836     // Loop through in_seq for uLength bases
4837     // and shift uBeginIdx to beginning
4838     for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4839         (*i_write) = (*i_read);
4840         ++i_read;
4841     }
4842 
4843     // Resize in_seq_data to uLength
4844     in_seq_data.resize(uLength);
4845 
4846     return uLength;
4847 }
4848 
4849 
4850 
4851 // Methods to complement na sequences
4852 
4853 // In place methods
ComplementIupacna(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4854 TSeqPos CSeqportUtil_implementation::ComplementIupacna
4855 (CSeq_data*   in_seq,
4856  TSeqPos      uBeginIdx,
4857  TSeqPos      uLength)
4858     const
4859 {
4860     // Keep just the part of in_seq that will be complemented
4861     TSeqPos uKept = KeepIupacna(in_seq, uBeginIdx, uLength);
4862 
4863     // Get in_seq data
4864     string& in_seq_data = in_seq->SetIupacna().Set();
4865 
4866     // Get an iterator to in_seq_data
4867     string::iterator i_data;
4868 
4869     // Get end of iteration--needed for performance
4870     string::iterator i_data_end = in_seq_data.end();
4871 
4872     // Loop through the input sequence and complement it
4873     for(i_data = in_seq_data.begin(); i_data != i_data_end; ++i_data)
4874         (*i_data) =
4875             m_Iupacna_complement->m_Table[static_cast<unsigned char>(*i_data)];
4876 
4877     return uKept;
4878 }
4879 
4880 
ComplementNcbi2na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4881 TSeqPos CSeqportUtil_implementation::ComplementNcbi2na
4882 (CSeq_data*   in_seq,
4883  TSeqPos      uBeginIdx,
4884  TSeqPos      uLength)
4885     const
4886 {
4887     // Keep just the part of in_seq that will be complemented
4888     TSeqPos uKept = KeepNcbi2na(in_seq, uBeginIdx, uLength);
4889 
4890     // Get in_seq data
4891     vector<char>& in_seq_data = in_seq->SetNcbi2na().Set();
4892 
4893     // Get an iterator to in_seq_data
4894     vector<char>::iterator i_data;
4895 
4896     // Get end of iteration
4897     vector<char>::iterator i_data_end = in_seq_data.end();
4898 
4899     // Loop through the input sequence and complement it
4900     for(i_data = in_seq_data.begin(); i_data != i_data_end; ++i_data)
4901         (*i_data) =
4902             m_Ncbi2naComplement->m_Table[static_cast<unsigned char>(*i_data)];
4903 
4904     return uKept;
4905 }
4906 
4907 
ComplementNcbi4na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4908 TSeqPos CSeqportUtil_implementation::ComplementNcbi4na
4909 (CSeq_data*   in_seq,
4910  TSeqPos      uBeginIdx,
4911  TSeqPos      uLength)
4912     const
4913 {
4914     // Keep just the part of in_seq that will be complemented
4915     TSeqPos uKept = KeepNcbi4na(in_seq, uBeginIdx, uLength);
4916 
4917     // Get in_seq data
4918     vector<char>& in_seq_data = in_seq->SetNcbi4na().Set();
4919 
4920     // Get an iterator to in_seq_data
4921     vector<char>::iterator i_data;
4922 
4923     // Get end of iteration--done for performance
4924     vector<char>::iterator i_data_end = in_seq_data.end();
4925 
4926     // Loop through the input sequence and complement it
4927     for(i_data = in_seq_data.begin(); i_data != i_data_end; ++i_data)
4928         (*i_data) =
4929             m_Ncbi4naComplement->m_Table[static_cast<unsigned char>(*i_data)];
4930 
4931     return uKept;
4932 }
4933 
4934 
4935 // Complement in copy methods
ComplementIupacna(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4936 TSeqPos CSeqportUtil_implementation::ComplementIupacna
4937 (const CSeq_data&  in_seq,
4938  CSeq_data*        out_seq,
4939  TSeqPos           uBeginIdx,
4940  TSeqPos           uLength)
4941     const
4942 {
4943     TSeqPos uKept = GetIupacnaCopy(in_seq, out_seq, uBeginIdx, uLength);
4944     TSeqPos uIdx1 = 0, uIdx2 = 0;
4945     ComplementIupacna(out_seq, uIdx1, uIdx2);
4946     return uKept;
4947 }
4948 
4949 
ComplementNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4950 TSeqPos CSeqportUtil_implementation::ComplementNcbi2na
4951 (const CSeq_data&  in_seq,
4952  CSeq_data*        out_seq,
4953  TSeqPos           uBeginIdx,
4954  TSeqPos           uLength)
4955     const
4956 {
4957     TSeqPos uKept = GetNcbi2naCopy(in_seq, out_seq, uBeginIdx, uLength);
4958     TSeqPos uIdx1 = 0, uIdx2 = 0;
4959     ComplementNcbi2na(out_seq, uIdx1, uIdx2);
4960     return uKept;
4961 }
4962 
4963 
ComplementNcbi4na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4964 TSeqPos CSeqportUtil_implementation::ComplementNcbi4na
4965 (const CSeq_data&  in_seq,
4966  CSeq_data*        out_seq,
4967  TSeqPos           uBeginIdx,
4968  TSeqPos           uLength)
4969     const
4970 {
4971     TSeqPos uKept = GetNcbi4naCopy(in_seq, out_seq, uBeginIdx, uLength);
4972     TSeqPos uIdx1 = 0, uIdx2 = 0;
4973     ComplementNcbi4na(out_seq, uIdx1, uIdx2);
4974     return uKept;
4975 }
4976 
4977 
4978 // Methods to reverse na sequences
4979 
4980 // In place methods
ReverseIupacna(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4981 TSeqPos CSeqportUtil_implementation::ReverseIupacna
4982 (CSeq_data*  in_seq,
4983  TSeqPos     uBeginIdx,
4984  TSeqPos     uLength)
4985     const
4986 {
4987     // Keep just the part of in_seq that will be reversed
4988     TSeqPos uKept = KeepIupacna(in_seq, uBeginIdx, uLength);
4989 
4990     // Get in_seq data
4991     string& in_seq_data = in_seq->SetIupacna().Set();
4992 
4993     // Reverse the order of the string
4994     reverse(in_seq_data.begin(), in_seq_data.end());
4995 
4996     return uKept;
4997 }
4998 
4999 
ReverseNcbi2na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5000 TSeqPos CSeqportUtil_implementation::ReverseNcbi2na
5001 (CSeq_data*  in_seq,
5002  TSeqPos     uBeginIdx,
5003  TSeqPos     uLength)
5004     const
5005 {
5006     // Get a reference to in_seq data
5007     vector<char>& in_seq_data = in_seq->SetNcbi2na().Set();
5008 
5009     // Validate and adjust uBeginIdx and uLength
5010     if(uBeginIdx >= 4*in_seq_data.size())
5011         {
5012             in_seq_data.erase(in_seq_data.begin(), in_seq_data.end());
5013             return 0;
5014         }
5015 
5016     // If uLength is zero, set to end of sequence
5017     if(uLength == 0)
5018         uLength = 4*in_seq_data.size() - uBeginIdx;
5019 
5020     // Ensure that uLength not beyond end of sequence
5021     if((uBeginIdx + uLength) > (4 * in_seq_data.size()))
5022         uLength = 4*in_seq_data.size() - uBeginIdx;
5023 
5024     // Determine start and end bytes
5025     TSeqPos uStart = uBeginIdx/4;
5026     TSeqPos uEnd = uStart + (uLength - 1 +(uBeginIdx % 4))/4 + 1;
5027 
5028     // Declare an iterator and get end of sequence
5029     vector<char>::iterator i_in;
5030     vector<char>::iterator i_in_begin = in_seq_data.begin() + uStart;
5031     vector<char>::iterator i_in_end = in_seq_data.begin() + uEnd;
5032 
5033     // Loop through in_seq_data and reverse residues in each byte
5034     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
5035         (*i_in) = m_Ncbi2naRev->m_Table[static_cast<unsigned char>(*i_in)];
5036 
5037     // Reverse the bytes in the sequence
5038     reverse(i_in_begin, i_in_end);
5039 
5040     // Keep just the requested part of the sequence
5041     TSeqPos uJagged = 3 - ((uBeginIdx + uLength - 1) % 4) + 4*uStart;
5042     return KeepNcbi2na(in_seq, uJagged, uLength);
5043 }
5044 
5045 
ReverseNcbi4na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5046 TSeqPos CSeqportUtil_implementation::ReverseNcbi4na
5047 (CSeq_data*   in_seq,
5048  TSeqPos      uBeginIdx,
5049  TSeqPos      uLength)
5050     const
5051 {
5052     // Get a reference to in_seq data
5053     vector<char>& in_seq_data = in_seq->SetNcbi4na().Set();
5054 
5055     // Validate and adjust uBeginIdx and uLength
5056     if(uBeginIdx >= 2*in_seq_data.size())
5057         {
5058             in_seq_data.erase(in_seq_data.begin(), in_seq_data.end());
5059             return 0;
5060         }
5061 
5062     // If uLength is zero, set to end of sequence
5063     if(uLength == 0)
5064         uLength = 2*in_seq_data.size() - uBeginIdx;
5065 
5066     // Ensure that uLength not beyond end of sequence
5067     if((uBeginIdx + uLength) > (2 * in_seq_data.size()))
5068         uLength = 2*in_seq_data.size() - uBeginIdx;
5069 
5070     // Determine start and end bytes
5071     TSeqPos uStart = uBeginIdx/2;
5072     TSeqPos uEnd = uStart + (uLength - 1 +(uBeginIdx % 2))/2 + 1;
5073 
5074     // Declare an iterator and get end of sequence
5075     vector<char>::iterator i_in;
5076     vector<char>::iterator i_in_begin = in_seq_data.begin() + uStart;
5077     vector<char>::iterator i_in_end = in_seq_data.begin() + uEnd;
5078 
5079     // Loop through in_seq_data and reverse residues in each byte
5080     for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
5081         (*i_in) = m_Ncbi4naRev->m_Table[static_cast<unsigned char>(*i_in)];
5082 
5083     // Reverse the bytes in the sequence
5084     reverse(i_in_begin, i_in_end);
5085 
5086     // Keep just the requested part of the sequence
5087     TSeqPos uJagged = 1 - ((uBeginIdx + uLength - 1) % 2) + 2*uStart;
5088     return KeepNcbi4na(in_seq, uJagged, uLength);
5089 }
5090 
5091 
5092 // Reverse in copy methods
ReverseIupacna(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5093 TSeqPos CSeqportUtil_implementation::ReverseIupacna
5094 (const CSeq_data&  in_seq,
5095  CSeq_data*        out_seq,
5096  TSeqPos           uBeginIdx,
5097  TSeqPos           uLength)
5098     const
5099 {
5100     GetIupacnaCopy(in_seq, out_seq, uBeginIdx, uLength);
5101 
5102     TSeqPos uIdx1 = 0, uIdx2 = uLength;
5103     return ReverseIupacna(out_seq, uIdx1, uIdx2);
5104 }
5105 
5106 
ReverseNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5107 TSeqPos CSeqportUtil_implementation::ReverseNcbi2na
5108 (const CSeq_data&  in_seq,
5109  CSeq_data*        out_seq,
5110  TSeqPos           uBeginIdx,
5111  TSeqPos           uLength)
5112     const
5113 {
5114     GetNcbi2naCopy(in_seq, out_seq, uBeginIdx, uLength);
5115 
5116     TSeqPos uIdx1 = 0, uIdx2 = uLength;
5117     return ReverseNcbi2na(out_seq, uIdx1, uIdx2);
5118 }
5119 
5120 
ReverseNcbi4na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5121 TSeqPos CSeqportUtil_implementation::ReverseNcbi4na
5122 (const CSeq_data&  in_seq,
5123  CSeq_data*        out_seq,
5124  TSeqPos           uBeginIdx,
5125  TSeqPos           uLength)
5126     const
5127 {
5128     GetNcbi4naCopy(in_seq, out_seq, uBeginIdx, uLength);
5129 
5130     TSeqPos uIdx1 = 0, uIdx2 = uLength;
5131     return ReverseNcbi4na(out_seq, uIdx1, uIdx2);
5132 }
5133 
5134 
5135 // Methods to reverse-complement an na sequences
5136 
5137 // In place methods
ReverseComplementIupacna(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5138 TSeqPos CSeqportUtil_implementation::ReverseComplementIupacna
5139 (CSeq_data*        in_seq,
5140  TSeqPos           uBeginIdx,
5141  TSeqPos           uLength)
5142     const
5143 {
5144     ReverseIupacna(in_seq, uBeginIdx, uLength);
5145 
5146     TSeqPos uIdx = 0;
5147     return ComplementIupacna(in_seq, uIdx, uLength);
5148 }
5149 
5150 
ReverseComplementNcbi2na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5151 TSeqPos CSeqportUtil_implementation::ReverseComplementNcbi2na
5152 (CSeq_data*        in_seq,
5153  TSeqPos           uBeginIdx,
5154  TSeqPos           uLength)
5155     const
5156 {
5157     ReverseNcbi2na(in_seq, uBeginIdx, uLength);
5158 
5159     TSeqPos uIdx = 0;
5160     return ComplementNcbi2na(in_seq, uIdx, uLength);
5161 }
5162 
5163 
ReverseComplementNcbi4na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5164 TSeqPos CSeqportUtil_implementation::ReverseComplementNcbi4na
5165 (CSeq_data*        in_seq,
5166  TSeqPos           uBeginIdx,
5167  TSeqPos           uLength)
5168     const
5169 {
5170     ReverseNcbi4na(in_seq, uBeginIdx, uLength);
5171 
5172     TSeqPos uIdx = 0;
5173     return ComplementNcbi4na(in_seq, uIdx, uLength);
5174 }
5175 
5176 
5177 // Reverse in copy methods
ReverseComplementIupacna(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5178 TSeqPos CSeqportUtil_implementation::ReverseComplementIupacna
5179 (const CSeq_data&  in_seq,
5180  CSeq_data*        out_seq,
5181  TSeqPos           uBeginIdx,
5182  TSeqPos           uLength)
5183     const
5184 {
5185     ReverseIupacna(in_seq, out_seq, uBeginIdx, uLength);
5186 
5187     TSeqPos uIdx = 0;
5188     return ComplementIupacna(out_seq, uIdx, uLength);
5189 }
5190 
5191 
ReverseComplementNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5192 TSeqPos CSeqportUtil_implementation::ReverseComplementNcbi2na
5193 (const CSeq_data&  in_seq,
5194  CSeq_data*        out_seq,
5195  TSeqPos           uBeginIdx,
5196  TSeqPos           uLength)
5197     const
5198 {
5199     ReverseNcbi2na(in_seq, out_seq, uBeginIdx, uLength);
5200 
5201     TSeqPos uIdx = 0;
5202     return ComplementNcbi2na(out_seq, uIdx, uLength);
5203 }
5204 
5205 
ReverseComplementNcbi4na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5206 TSeqPos CSeqportUtil_implementation::ReverseComplementNcbi4na
5207 (const CSeq_data&  in_seq,
5208  CSeq_data*        out_seq,
5209  TSeqPos           uBeginIdx,
5210  TSeqPos           uLength)
5211     const
5212 {
5213     ReverseNcbi4na(in_seq, out_seq, uBeginIdx, uLength);
5214 
5215     TSeqPos uIdx = 0;
5216     return ComplementNcbi4na(out_seq, uIdx, uLength);
5217 }
5218 
5219 
5220 // Append methods
AppendIupacna(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5221 TSeqPos CSeqportUtil_implementation::AppendIupacna
5222 (CSeq_data*        out_seq,
5223  const CSeq_data&  in_seq1,
5224  TSeqPos           uBeginIdx1,
5225  TSeqPos           uLength1,
5226  const CSeq_data&  in_seq2,
5227  TSeqPos           uBeginIdx2,
5228  TSeqPos           uLength2)
5229     const
5230 {
5231     // Get references to in_seqs
5232     const string& in_seq1_data = in_seq1.GetIupacna().Get();
5233     const string& in_seq2_data = in_seq2.GetIupacna().Get();
5234 
5235     // Get a reference to out_seq
5236     out_seq->Reset();
5237     string& out_seq_data = out_seq->SetIupacna().Set();
5238 
5239     // Validate and Adjust uBeginIdx_ and uLength_
5240     if((uBeginIdx1 >= in_seq1_data.size()) &&
5241        (uBeginIdx2 >= in_seq2_data.size()))
5242         return 0;
5243 
5244     if(((uBeginIdx1 + uLength1) > in_seq1_data.size()) || uLength1 == 0)
5245         uLength1 = in_seq1_data.size() - uBeginIdx1;
5246 
5247     if(((uBeginIdx2 + uLength2) > in_seq2_data.size()) || uLength2 == 0)
5248         uLength2 = in_seq2_data.size() - uBeginIdx2;
5249 
5250     // Append the strings
5251     out_seq_data.append(in_seq1_data.substr(uBeginIdx1,uLength1));
5252     out_seq_data.append(in_seq2_data.substr(uBeginIdx2,uLength2));
5253 
5254     return uLength1 + uLength2;
5255 }
5256 
5257 
AppendNcbi2na(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5258 TSeqPos CSeqportUtil_implementation::AppendNcbi2na
5259 (CSeq_data*        out_seq,
5260  const CSeq_data&  in_seq1,
5261  TSeqPos           uBeginIdx1,
5262  TSeqPos           uLength1,
5263  const CSeq_data&  in_seq2,
5264  TSeqPos           uBeginIdx2,
5265  TSeqPos           uLength2)
5266     const
5267 {
5268     // Get references to in_seqs
5269     const vector<char>& in_seq1_data = in_seq1.GetNcbi2na().Get();
5270     const vector<char>& in_seq2_data = in_seq2.GetNcbi2na().Get();
5271 
5272     // Get a reference to out_seq
5273     out_seq->Reset();
5274     vector<char>& out_seq_data = out_seq->SetNcbi2na().Set();
5275 
5276     // Handle case where both uBeginidx go beyond in_seq
5277     if((uBeginIdx1 >= 4*in_seq1_data.size()) &&
5278        (uBeginIdx2 >= 4*in_seq2_data.size()))
5279         return 0;
5280 
5281     // Handle case where uBeginIdx1 goes beyond end of in_seq1
5282     if(uBeginIdx1 >= 4*in_seq1_data.size())
5283         return GetNcbi2naCopy(in_seq2, out_seq, uBeginIdx2, uLength2);
5284 
5285     // Handle case where uBeginIdx2 goes beyond end of in_seq2
5286     if(uBeginIdx2 >= 4*in_seq2_data.size())
5287         return GetNcbi2naCopy(in_seq1, out_seq, uBeginIdx1, uLength1);
5288 
5289     // Validate and Adjust uBeginIdx_ and uLength_
5290     if(((uBeginIdx1 + uLength1) > 4*in_seq1_data.size()) || uLength1 == 0)
5291         uLength1 = 4*in_seq1_data.size() - uBeginIdx1;
5292 
5293     if(((uBeginIdx2 + uLength2) > 4*in_seq2_data.size()) || uLength2 == 0)
5294         uLength2 = 4*in_seq2_data.size() - uBeginIdx2;
5295 
5296 
5297     // Resize out_seq_data to hold appended sequence
5298     TSeqPos uTotalLength = uLength1 + uLength2;
5299     if((uTotalLength % 4) == 0)
5300         out_seq_data.resize(uTotalLength/4);
5301     else
5302         out_seq_data.resize(uTotalLength/4 + 1);
5303 
5304     // Calculate bit shifts required for in_seq1
5305     unsigned int lShift1 = 2*(uBeginIdx1 % 4);
5306     unsigned int rShift1 = 8 - lShift1;
5307 
5308     // Calculate bit shifts required for in_seq2
5309     unsigned int lShift2, rShift2, uCase;
5310     unsigned int uVacantIdx = 2*(uLength1 % 4);
5311     unsigned int uStartIdx = 2*(uBeginIdx2 % 4);
5312     if((uVacantIdx < uStartIdx) && (uVacantIdx > 0))
5313         {
5314             uCase = 0;
5315             lShift2 = uStartIdx - uVacantIdx;
5316             rShift2 = 8 - lShift2;
5317         }
5318     else if((uVacantIdx < uStartIdx) && (uVacantIdx == 0))
5319         {
5320             uCase = 1;
5321             lShift2 = uStartIdx;
5322             rShift2 = 8 - lShift2;
5323         }
5324     else if((uVacantIdx == uStartIdx) && (uVacantIdx > 0))
5325         {
5326             uCase = 2;
5327             lShift2 = 0;
5328             rShift2 = 8;
5329         }
5330     else if((uVacantIdx == uStartIdx) && (uVacantIdx == 0))
5331         {
5332             uCase = 3;
5333             lShift2 = 0;
5334             rShift2 = 8;
5335         }
5336     else
5337         {
5338             uCase = 4;
5339             rShift2 = uVacantIdx - uStartIdx;
5340             lShift2 = 8 - rShift2;
5341         }
5342 
5343 
5344     // Determine begin and end points for iterators.
5345     TSeqPos uStart1 = uBeginIdx1/4;
5346     TSeqPos uEnd1;
5347     if(((uBeginIdx1 + uLength1) % 4) == 0)
5348         uEnd1 = (uBeginIdx1 + uLength1)/4;
5349     else
5350         uEnd1 = (uBeginIdx1 + uLength1)/4 + 1;
5351 
5352     TSeqPos uStart2 = uBeginIdx2/4;
5353     TSeqPos uEnd2;
5354     if(((uBeginIdx2 + uLength2) % 4) == 0)
5355         uEnd2 = (uBeginIdx2 + uLength2)/4;
5356     else
5357         uEnd2 = (uBeginIdx2 + uLength2)/4 + 1;
5358 
5359     // Get begin and end positions on in_seqs
5360     vector<char>::const_iterator i_in1_begin = in_seq1_data.begin() + uStart1;
5361     vector<char>::const_iterator i_in1_end = in_seq1_data.begin() + uEnd1 - 1;
5362     vector<char>::const_iterator i_in2_begin = in_seq2_data.begin() + uStart2;
5363     vector<char>::const_iterator i_in2_end = in_seq2_data.begin() + uEnd2;
5364 
5365     // Declare iterators
5366     vector<char>::iterator i_out = out_seq_data.begin() - 1;
5367     vector<char>::const_iterator i_in1;
5368     vector<char>::const_iterator i_in2;
5369 
5370     // Insert in_seq1 into out_seq
5371     for(i_in1 = i_in1_begin; i_in1 != i_in1_end; ++i_in1)
5372         (*(++i_out)) = ((*i_in1) << lShift1) | ((*(i_in1+1) & 255) >> rShift1);
5373 
5374     // Handle last byte for in_seq1 if necessary
5375     TSeqPos uEndOutByte;
5376     if((uLength1 % 4) == 0)
5377         uEndOutByte = uLength1/4 - 1;
5378     else
5379         uEndOutByte = uLength1/4;
5380     if(i_out != (out_seq_data.begin() + uEndOutByte))
5381         (*(++i_out)) = (*i_in1) << lShift1;
5382 
5383     // Connect in_seq1 and in_seq2
5384     unsigned char uMask1 = 255 << (8 - 2*(uLength1 % 4));
5385     unsigned char uMask2 = 255 >> (2*(uBeginIdx2 % 4));
5386     TSeqPos uSeq2Inc = 1;
5387 
5388     switch (uCase) {
5389     case 0: // 0 < uVacantIdx < uStartIdx
5390         if((i_in2_begin + 1) == i_in2_end)
5391             {
5392                 (*i_out) &= uMask1;
5393                 (*i_out) |= ((*i_in2_begin) & uMask2) << lShift2;
5394                 return uTotalLength;
5395             }
5396         else
5397             {
5398                 (*i_out) &= uMask1;
5399                 (*i_out) |=
5400                     (((*i_in2_begin) & uMask2) << lShift2) |
5401                     (((*(i_in2_begin+1)) & 255) >> rShift2);
5402             }
5403         break;
5404     case 1: // 0 == uVacantIdx < uStartIdx
5405         if((i_in2_begin + 1) == i_in2_end)
5406             {
5407                 (*(++i_out)) = (*i_in2_begin) << lShift2;
5408                 return uTotalLength;
5409             }
5410         else
5411             {
5412                 (*(++i_out)) =
5413                     ((*i_in2_begin) << lShift2) |
5414                     (((*(i_in2_begin+1)) & 255) >> rShift2);
5415             }
5416         break;
5417     case 2: // uVacantIdx == uStartIdx > 0
5418         (*i_out) &= uMask1;
5419         (*i_out) |= (*i_in2_begin) & uMask2;
5420         if((i_in2_begin + 1) == i_in2_end)
5421             return uTotalLength;
5422         break;
5423     case 3: // uVacantIdx == uStartIdx == 0
5424         (*(++i_out)) = (*i_in2_begin);
5425         if((i_in2_begin + 1) == i_in2_end)
5426             return uTotalLength;
5427         break;
5428     case 4: // uVacantIdx > uStartIdx
5429         if((i_in2_begin + 1) == i_in2_end)
5430             {
5431                 (*i_out) &= uMask1;
5432                 (*i_out) |= ((*i_in2_begin) & uMask2) >> rShift2;
5433                 if(++i_out != out_seq_data.end())
5434                     (*i_out) = (*i_in2_begin) << lShift2;
5435                 return uTotalLength;
5436             }
5437         else
5438             {
5439                 (*i_out) &= uMask1;
5440                 (*i_out) |=
5441                     (((*i_in2_begin) & uMask2) >> rShift2) |
5442                     ((*(i_in2_begin+1) & ~uMask2) << lShift2);
5443                 uSeq2Inc = 0;
5444             }
5445 
5446     }
5447 
5448     // Insert in_seq2 into out_seq
5449     for(i_in2 = i_in2_begin+uSeq2Inc; (i_in2 != i_in2_end) &&
5450             ((i_in2+1) != i_in2_end); ++i_in2) {
5451         (*(++i_out)) = ((*i_in2) << lShift2) | ((*(i_in2+1) & 255) >> rShift2);
5452     }
5453 
5454     // Handle last byte for in_seq2, if there is one
5455     if((++i_out != out_seq_data.end()) && (i_in2 != i_in2_end))
5456         (*i_out) = (*i_in2) << lShift2;
5457 
5458     return uLength1 + uLength2;
5459 }
5460 
5461 
AppendNcbi4na(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5462 TSeqPos CSeqportUtil_implementation::AppendNcbi4na
5463 (CSeq_data*        out_seq,
5464  const CSeq_data&  in_seq1,
5465  TSeqPos           uBeginIdx1,
5466  TSeqPos           uLength1,
5467  const CSeq_data&  in_seq2,
5468  TSeqPos           uBeginIdx2,
5469  TSeqPos           uLength2)
5470     const
5471 {
5472     // Get references to in_seqs
5473     const vector<char>& in_seq1_data = in_seq1.GetNcbi4na().Get();
5474     const vector<char>& in_seq2_data = in_seq2.GetNcbi4na().Get();
5475 
5476     // Get a reference to out_seq
5477     out_seq->Reset();
5478     vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
5479 
5480     // Handle both uBeginidx go beyond end of in_seq
5481     if((uBeginIdx1 >= 4*in_seq1_data.size()) &&
5482        (uBeginIdx2 >= 4*in_seq2_data.size()))
5483         return 0;
5484 
5485     // Handle case where uBeginIdx1 goes beyond end of in_seq1
5486     if(uBeginIdx1 >= 4*in_seq1_data.size())
5487         return GetNcbi4naCopy(in_seq2, out_seq, uBeginIdx2, uLength2);
5488 
5489     // Handle case where uBeginIdx2 goes beyond end of in_seq2
5490     if(uBeginIdx2 >= 4*in_seq2_data.size())
5491         return GetNcbi4naCopy(in_seq1, out_seq, uBeginIdx1, uLength1);
5492 
5493     // Validate and Adjust uBeginIdx_ and uLength_
5494     if(((uBeginIdx1 + uLength1) > 2*in_seq1_data.size()) || uLength1 == 0)
5495         uLength1 = 2*in_seq1_data.size() - uBeginIdx1;
5496 
5497     if(((uBeginIdx2 + uLength2) > 2*in_seq2_data.size()) || uLength2 == 0)
5498         uLength2 = 2*in_seq2_data.size() - uBeginIdx2;
5499 
5500     // Resize out_seq_data to hold appended sequence
5501     TSeqPos uTotalLength = uLength1 + uLength2;
5502     if((uTotalLength % 2) == 0)
5503         out_seq_data.resize(uTotalLength/2);
5504     else
5505         out_seq_data.resize(uTotalLength/2 + 1);
5506 
5507     // Calculate bit shifts required for in_seq1
5508     unsigned int lShift1 = 4*(uBeginIdx1 % 2);
5509     unsigned int rShift1 = 8 - lShift1;
5510 
5511     // Calculate bit shifts required for in_seq2
5512     unsigned int lShift2, rShift2, uCase;
5513     unsigned int uVacantIdx = 4*(uLength1 % 2);
5514     unsigned int uStartIdx = 4*(uBeginIdx2 % 2);
5515     if((uVacantIdx < uStartIdx))
5516         {
5517             uCase = 1;
5518             lShift2 = uStartIdx;
5519             rShift2 = 8 - lShift2;
5520         }
5521     else if((uVacantIdx == uStartIdx) && (uVacantIdx > 0))
5522         {
5523             uCase = 2;
5524             lShift2 = 0;
5525             rShift2 = 8;
5526         }
5527     else if((uVacantIdx == uStartIdx) && (uVacantIdx == 0))
5528         {
5529             uCase = 3;
5530             lShift2 = 0;
5531             rShift2 = 8;
5532         }
5533     else
5534         {
5535             uCase = 4;
5536             rShift2 = uVacantIdx - uStartIdx;
5537             lShift2 = 8 - rShift2;
5538         }
5539 
5540 
5541     // Determine begin and end points for iterators.
5542     TSeqPos uStart1 = uBeginIdx1/2;
5543     TSeqPos uEnd1;
5544     if(((uBeginIdx1 + uLength1) % 2) == 0)
5545         uEnd1 = (uBeginIdx1 + uLength1)/2;
5546     else
5547         uEnd1 = (uBeginIdx1 + uLength1)/2 + 1;
5548 
5549     TSeqPos uStart2 = uBeginIdx2/2;
5550     TSeqPos uEnd2;
5551     if(((uBeginIdx2 + uLength2) % 2) == 0)
5552         uEnd2 = (uBeginIdx2 + uLength2)/2;
5553     else
5554         uEnd2 = (uBeginIdx2 + uLength2)/2 + 1;
5555 
5556     // Get begin and end positions on in_seqs
5557     vector<char>::const_iterator i_in1_begin = in_seq1_data.begin() + uStart1;
5558     vector<char>::const_iterator i_in1_end = in_seq1_data.begin() + uEnd1 - 1;
5559     vector<char>::const_iterator i_in2_begin = in_seq2_data.begin() + uStart2;
5560     vector<char>::const_iterator i_in2_end = in_seq2_data.begin() + uEnd2;
5561 
5562     // Declare iterators
5563     vector<char>::iterator i_out = out_seq_data.begin() - 1;
5564     vector<char>::const_iterator i_in1;
5565     vector<char>::const_iterator i_in2;
5566 
5567     // Insert in_seq1 into out_seq
5568     for(i_in1 = i_in1_begin; i_in1 != i_in1_end; ++i_in1)
5569         (*(++i_out)) = ((*i_in1) << lShift1) | ((*(i_in1+1) & 255) >> rShift1);
5570 
5571     // Handle last byte for in_seq1 if necessary
5572     TSeqPos uEndOutByte;
5573     if((uLength1 % 2) == 0)
5574         uEndOutByte = uLength1/2 - 1;
5575     else
5576         uEndOutByte = uLength1/2;
5577     if(i_out != (out_seq_data.begin() + uEndOutByte))
5578         (*(++i_out)) = (*i_in1) << lShift1;
5579 
5580     // Connect in_seq1 and in_seq2
5581     unsigned char uMask1 = 255 << (8 - 4*(uLength1 % 2));
5582     unsigned char uMask2 = 255 >> (4*(uBeginIdx2 % 2));
5583     TSeqPos uSeq2Inc = 1;
5584 
5585     switch (uCase) {
5586     case 1: // 0 == uVacantIdx < uStartIdx
5587         if((i_in2_begin+1) == i_in2_end)
5588             {
5589                 (*(++i_out)) = (*i_in2_begin) << lShift2;
5590                 return uTotalLength;
5591             }
5592         else
5593             {
5594                 (*(++i_out)) =
5595                     ((*i_in2_begin) << lShift2) |
5596                     (((*(i_in2_begin+1)) & 255) >> rShift2);
5597             }
5598         break;
5599     case 2: // uVacantIdx == uStartIdx > 0
5600         (*i_out) &= uMask1;
5601         (*i_out) |= (*i_in2_begin) & uMask2;
5602         if((i_in2_begin+1) == i_in2_end)
5603             return uTotalLength;
5604         break;
5605     case 3: // uVacantIdx == uStartIdx == 0
5606         (*(++i_out)) = (*i_in2_begin);
5607         if((i_in2_begin+1) == i_in2_end)
5608             return uTotalLength;
5609         break;
5610     case 4: // uVacantIdx > uStartIdx
5611         if((i_in2_begin+1) == i_in2_end)
5612             {
5613                 (*i_out) &= uMask1;
5614                 (*i_out) |= ((*i_in2_begin) & uMask2) >> rShift2;
5615                 if(++i_out != out_seq_data.end())
5616                     (*i_out) = (*i_in2_begin) << lShift2;
5617                 return uTotalLength;
5618             }
5619         else
5620             {
5621                 (*i_out) &= uMask1;
5622                 (*i_out) |=
5623                     (((*i_in2_begin) & uMask2) >> rShift2) |
5624                     ((*(i_in2_begin+1) & ~uMask2) << lShift2);
5625                 uSeq2Inc = 0;
5626             }
5627 
5628     }
5629 
5630     // Insert in_seq2 into out_seq
5631     for(i_in2 = i_in2_begin+uSeq2Inc; (i_in2 != i_in2_end) &&
5632             ((i_in2+1) != i_in2_end); ++i_in2) {
5633         (*(++i_out)) =
5634             ((*i_in2) << lShift2) | ((*(i_in2+1) & 255) >> rShift2);
5635     }
5636 
5637     // Handle last byte for in_seq2, if there is one
5638     if((++i_out != out_seq_data.end()) && (i_in2 != i_in2_end))
5639         (*i_out) = (*i_in2) << lShift2;
5640 
5641     return uTotalLength;
5642 }
5643 
5644 
AppendNcbieaa(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5645 TSeqPos CSeqportUtil_implementation::AppendNcbieaa
5646 (CSeq_data*        out_seq,
5647  const CSeq_data&  in_seq1,
5648  TSeqPos           uBeginIdx1,
5649  TSeqPos           uLength1,
5650  const CSeq_data&  in_seq2,
5651  TSeqPos           uBeginIdx2,
5652  TSeqPos           uLength2)
5653     const
5654 {
5655     // Get references to in_seqs
5656     const string& in_seq1_data = in_seq1.GetNcbieaa().Get();
5657     const string& in_seq2_data = in_seq2.GetNcbieaa().Get();
5658 
5659     // Get a reference to out_seq
5660     out_seq->Reset();
5661     string& out_seq_data = out_seq->SetNcbieaa().Set();
5662 
5663     // Validate and Adjust uBeginIdx_ and uLength_
5664     if((uBeginIdx1 >= in_seq1_data.size()) &&
5665        (uBeginIdx2 >= in_seq2_data.size()))
5666         {
5667             return 0;
5668         }
5669 
5670     if(((uBeginIdx1 + uLength1) > in_seq1_data.size()) || uLength1 == 0)
5671         uLength1 = in_seq1_data.size() - uBeginIdx1;
5672 
5673     if(((uBeginIdx2 + uLength2) > in_seq2_data.size()) || uLength2 == 0)
5674         uLength2 = in_seq2_data.size() - uBeginIdx2;
5675 
5676     // Append the strings
5677     out_seq_data.append(in_seq1_data.substr(uBeginIdx1,uLength1));
5678     out_seq_data.append(in_seq2_data.substr(uBeginIdx2,uLength2));
5679 
5680     return uLength1 + uLength2;
5681 }
5682 
5683 
AppendNcbistdaa(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5684 TSeqPos CSeqportUtil_implementation::AppendNcbistdaa
5685 (CSeq_data*          out_seq,
5686  const CSeq_data&    in_seq1,
5687  TSeqPos             uBeginIdx1,
5688  TSeqPos             uLength1,
5689  const CSeq_data&    in_seq2,
5690  TSeqPos             uBeginIdx2,
5691  TSeqPos             uLength2)
5692     const
5693 {
5694     // Get references to in_seqs
5695     const vector<char>& in_seq1_data = in_seq1.GetNcbistdaa().Get();
5696     const vector<char>& in_seq2_data = in_seq2.GetNcbistdaa().Get();
5697 
5698     // Get a reference to out_seq
5699     out_seq->Reset();
5700     vector<char>& out_seq_data = out_seq->SetNcbistdaa().Set();
5701 
5702     // Validate and Adjust uBeginIdx_ and uLength_
5703     if((uBeginIdx1 >= in_seq1_data.size()) &&
5704        (uBeginIdx2 >= in_seq2_data.size()))
5705         return 0;
5706 
5707     if(((uBeginIdx1 + uLength1) > in_seq1_data.size()) || uLength1 == 0)
5708         uLength1 = in_seq1_data.size() - uBeginIdx1;
5709 
5710     if(((uBeginIdx2 + uLength2) > in_seq2_data.size()) || uLength2 == 0)
5711         uLength2 = in_seq2_data.size() - uBeginIdx2;
5712 
5713     // Get begin and end positions on in_seqs
5714     vector<char>::const_iterator i_in1_begin =
5715         in_seq1_data.begin() + uBeginIdx1;
5716     vector<char>::const_iterator i_in1_end = i_in1_begin + uLength1;
5717     vector<char>::const_iterator i_in2_begin =
5718         in_seq2_data.begin() + uBeginIdx2;
5719     vector<char>::const_iterator i_in2_end = i_in2_begin + uLength2;
5720 
5721     // Insert the in_seqs into out_seq
5722     out_seq_data.insert(out_seq_data.end(), i_in1_begin, i_in1_end);
5723     out_seq_data.insert(out_seq_data.end(), i_in2_begin, i_in2_end);
5724 
5725     return uLength1 + uLength2;
5726 }
5727 
5728 
AppendIupacaa(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5729 TSeqPos CSeqportUtil_implementation::AppendIupacaa
5730 (CSeq_data*          out_seq,
5731  const CSeq_data&    in_seq1,
5732  TSeqPos             uBeginIdx1,
5733  TSeqPos             uLength1,
5734  const CSeq_data&    in_seq2,
5735  TSeqPos             uBeginIdx2,
5736  TSeqPos             uLength2)
5737     const
5738 {
5739     // Get references to in_seqs
5740     const string& in_seq1_data = in_seq1.GetIupacaa().Get();
5741     const string& in_seq2_data = in_seq2.GetIupacaa().Get();
5742 
5743     // Get a reference to out_seq
5744     out_seq->Reset();
5745     string& out_seq_data = out_seq->SetIupacaa().Set();
5746 
5747     // Validate and Adjust uBeginIdx_ and uLength_
5748     if((uBeginIdx1 >= in_seq1_data.size()) &&
5749        (uBeginIdx2 >= in_seq2_data.size()))
5750         {
5751             return 0;
5752         }
5753 
5754     if(((uBeginIdx1 + uLength1) > in_seq1_data.size()) || uLength1 == 0)
5755         uLength1 = in_seq1_data.size() - uBeginIdx1;
5756 
5757     if(((uBeginIdx2 + uLength2) > in_seq2_data.size()) || uLength2 == 0)
5758         uLength2 = in_seq2_data.size() - uBeginIdx2;
5759 
5760     // Append the strings
5761     out_seq_data.append(in_seq1_data.substr(uBeginIdx1,uLength1));
5762     out_seq_data.append(in_seq2_data.substr(uBeginIdx2,uLength2));
5763 
5764     return uLength1 + uLength2;
5765 }
5766 
5767 // Returns the 3 letter Iupacaa3 code for an ncbistdaa index
GetIupacaa3(TIndex ncbistdaa)5768 const string& CSeqportUtil_implementation::GetIupacaa3
5769 (TIndex ncbistdaa)
5770 {
5771     return GetCodeOrName(eSeq_code_type_iupacaa3, ncbistdaa, true);
5772 }
5773 
5774 // Returns true if code type is available
IsCodeAvailable(CSeq_data::E_Choice code_type)5775 bool CSeqportUtil_implementation::IsCodeAvailable
5776 (CSeq_data::E_Choice code_type)
5777 {
5778     if (code_type == CSeq_data::e_not_set) {
5779         return false;
5780     } else {
5781         return IsCodeAvailable(EChoiceToESeq(code_type));
5782     }
5783 }
5784 
5785 // Return true if code type is available
IsCodeAvailable(ESeq_code_type code_type)5786 bool CSeqportUtil_implementation::IsCodeAvailable (ESeq_code_type code_type)
5787 {
5788     // Iterate through Seq-code-set looking for code type
5789     ITERATE (CSeq_code_set::TCodes, i_ct, m_SeqCodeSet->GetCodes()) {
5790         if((*i_ct)->GetCode() == code_type) {
5791             return true;
5792         }
5793     }
5794     return false;
5795 }
5796 
5797 // Return a pair containing the first index (start-at) and last index
5798 // for code_type.
GetCodeIndexFromTo(CSeq_data::E_Choice code_type)5799 CSeqportUtil::TPair  CSeqportUtil_implementation::GetCodeIndexFromTo
5800 (CSeq_data::E_Choice code_type)
5801 {
5802     return GetCodeIndexFromTo(EChoiceToESeq(code_type));
5803 }
5804 
5805 // Return a pair containing the first index (start-at) and last index
5806 // for code_type.
GetCodeIndexFromTo(ESeq_code_type code_type)5807 CSeqportUtil::TPair CSeqportUtil_implementation::GetCodeIndexFromTo
5808 (ESeq_code_type code_type)
5809 {
5810     // Iterate through Seq-code-set looking for code type
5811     TPair p;
5812     ITERATE (CSeq_code_set::TCodes, i_ct, m_SeqCodeSet->GetCodes()) {
5813         if((*i_ct)->GetCode() == code_type) {
5814             if ( (*i_ct)->IsSetStart_at() ) {
5815                 p.first = static_cast<TIndex>((*i_ct)->GetStart_at());
5816             } else {
5817                 p.first = 0;
5818             }
5819             p.second = p.first + static_cast<TIndex>((*i_ct)->GetNum() - 1);
5820             return p;
5821         }
5822     }
5823     throw CSeqportUtil::CBadType("GetCodeIndexFromTo");
5824 }
5825 
5826 // Converts CSeq_data::E_Choice type to ESeq_code_type
5827 // and calls overloaded GetCodeOrName()
GetCodeOrName(CSeq_data::E_Choice code_type,TIndex idx,bool get_code)5828 const string& CSeqportUtil_implementation::GetCodeOrName
5829 (CSeq_data::E_Choice code_type,
5830  TIndex              idx,
5831  bool                get_code)
5832 {
5833     return GetCodeOrName(EChoiceToESeq(code_type), idx, get_code);
5834 }
5835 
5836 // Returns the code (symbol) of type code_type for index idx.
GetCodeOrName(ESeq_code_type code_type,TIndex idx,bool get_code)5837 const string& CSeqportUtil_implementation::GetCodeOrName
5838 (ESeq_code_type code_type,
5839  TIndex         idx,
5840  bool           get_code)
5841 {
5842     if ( !m_IndexString[get_code][code_type-1].size() ) {
5843         throw CSeqportUtil::CBadType("GetCodeOrName");
5844     }
5845     idx -= m_StartAt[code_type-1];
5846     if (idx >= m_IndexString[get_code][code_type-1].size()) {
5847         throw CSeqportUtil::CBadIndex(idx, "GetCodeOrName");
5848     }
5849     return m_IndexString[get_code][code_type-1][idx];
5850 
5851 }
5852 
5853 // Converts CSeq_data::E_Choice type to ESeq_code_type and call
5854 // overloaded GetIndex();
GetIndex(CSeq_data::E_Choice code_type,const string & code)5855 CSeqportUtil::TIndex CSeqportUtil_implementation::GetIndex
5856 (CSeq_data::E_Choice code_type,
5857  const string&       code)
5858 {
5859     return GetIndex(EChoiceToESeq(code_type), code);
5860 }
5861 
5862 // Get the index for code of type code_type. If not found, return -1
GetIndex(ESeq_code_type code_type,const string & code)5863 CSeqportUtil::TIndex CSeqportUtil_implementation::GetIndex
5864 (ESeq_code_type code_type,
5865  const string&  code)
5866 {
5867     // Iterator to a map mapping a string code to a code index
5868     map<string, TIndex>::const_iterator pos;
5869 
5870     if ( !m_StringIndex[code_type-1].size() ) {
5871         throw CSeqportUtil::CBadType("GetIndex");
5872     }
5873     pos = m_StringIndex[code_type-1].find(code);
5874     if (pos != m_StringIndex[code_type-1].end()) {
5875         return pos->second;
5876     } else {
5877         throw CSeqportUtil::CBadSymbol(code, "GetIndex");
5878     }
5879 
5880 }
5881 
5882 // Gets complement of index for code type. Returns -1 if code
5883 // type does not exist
GetIndexComplement(CSeq_data::E_Choice code_type,TIndex idx)5884 CSeqportUtil::TIndex CSeqportUtil_implementation::GetIndexComplement
5885 (CSeq_data::E_Choice code_type,
5886  TIndex              idx)
5887 {
5888     return GetIndexComplement(EChoiceToESeq(code_type), idx);
5889 }
5890 
5891 // Returns the complement of the index for code_type. If code_type
5892 // does not exist, or complements for code_type do not exist,
5893 // returns -1
GetIndexComplement(ESeq_code_type code_type,TIndex idx)5894 CSeqportUtil::TIndex CSeqportUtil_implementation::GetIndexComplement
5895 (ESeq_code_type code_type,
5896  TIndex         idx)
5897 {
5898 
5899     // Check that code is available
5900     if (!m_IndexComplement[code_type-1].size()) {
5901         throw CSeqportUtil::CBadType("GetIndexComplement");
5902     }
5903 
5904     // Check that idx is in range of code indices
5905     idx -= m_StartAt[code_type-1];
5906     if ( idx >= m_IndexComplement[code_type-1].size() ) {
5907         throw CSeqportUtil::CBadIndex(idx, "GetIndexComplement");
5908     }
5909 
5910     // Return the index of the complement
5911     return m_IndexComplement[code_type-1][idx];
5912  }
5913 
GetMapToIndex(CSeq_data::E_Choice from_type,CSeq_data::E_Choice to_type,TIndex from_idx)5914 CSeqportUtil::TIndex CSeqportUtil_implementation::GetMapToIndex
5915 (CSeq_data::E_Choice from_type,
5916  CSeq_data::E_Choice to_type,
5917  TIndex              from_idx)
5918 {
5919     return GetMapToIndex(EChoiceToESeq(from_type),
5920                          EChoiceToESeq(to_type),
5921                          from_idx);
5922 }
5923 
GetMapToIndex(ESeq_code_type from_type,ESeq_code_type to_type,TIndex from_idx)5924 CSeqportUtil::TIndex CSeqportUtil_implementation::GetMapToIndex
5925 (ESeq_code_type from_type,
5926  ESeq_code_type to_type,
5927  TIndex            from_idx)
5928 {
5929     CMap_table* Map = 0;
5930 
5931     if (from_type == eSeq_code_type_iupacna) {
5932         if (to_type == eSeq_code_type_ncbi2na) {
5933             Map = m_IupacnaNcbi2na.GetPointer();
5934         } else if (to_type == eSeq_code_type_ncbi4na) {
5935             Map = m_IupacnaNcbi4na.GetPointer();
5936         }
5937     } else if (from_type == eSeq_code_type_ncbi4na) {
5938         if (to_type == eSeq_code_type_iupacna) {
5939             Map = m_Ncbi4naIupacna.GetPointer();
5940         } else if (to_type == eSeq_code_type_ncbi2na) {
5941             Map = m_Ncbi4naNcbi2na.GetPointer();
5942         }
5943     } else if (from_type == eSeq_code_type_ncbi2na) {
5944         if (to_type == eSeq_code_type_iupacna) {
5945             Map = m_Ncbi2naIupacna.GetPointer();
5946         } else if (to_type == eSeq_code_type_ncbi4na) {
5947             Map = m_Ncbi2naNcbi4na.GetPointer();
5948         }
5949     } else if (from_type == eSeq_code_type_iupacaa) {
5950         if (to_type == eSeq_code_type_ncbieaa) {
5951             Map = m_IupacaaNcbieaa.GetPointer();
5952         } else if (to_type == eSeq_code_type_ncbistdaa) {
5953             Map = m_IupacaaNcbistdaa.GetPointer();
5954         }
5955     } else if (from_type == eSeq_code_type_ncbieaa) {
5956         if (to_type == eSeq_code_type_iupacaa) {
5957             Map = m_NcbieaaIupacaa.GetPointer();
5958         } else if (to_type == eSeq_code_type_ncbistdaa) {
5959             Map = m_NcbieaaNcbistdaa.GetPointer();
5960         }
5961     } else if (from_type == eSeq_code_type_ncbistdaa) {
5962         if (to_type == eSeq_code_type_iupacaa) {
5963             Map = m_NcbistdaaIupacaa.GetPointer();
5964         } else if (to_type == eSeq_code_type_ncbieaa) {
5965             Map = m_NcbistdaaNcbieaa.GetPointer();
5966         }
5967     }
5968 
5969     // Check that requested map is available
5970     if (!Map) {
5971         throw CSeqportUtil::CBadType("GetMapToIndex");
5972     }
5973 
5974     // Check that from_idx is within range of from_type
5975     if (from_idx - (*Map).m_StartAt >= (TIndex)(*Map).m_Size) {
5976         throw CSeqportUtil::CBadIndex(from_idx - (*Map).m_StartAt,
5977             "GetMapToIndex");
5978     }
5979 
5980     // Return map value
5981     return (*Map).m_Table[from_idx];
5982 
5983 
5984 }
5985 
5986 
x_GetSeqFromSeqData(const CSeq_data & data,const string ** str,const vector<char> ** vec) const5987 void CSeqportUtil_implementation::x_GetSeqFromSeqData
5988 (const CSeq_data& data,
5989  const string** str,
5990  const vector<char>** vec)
5991     const
5992 {
5993     *str = 0;
5994     *vec = 0;
5995 
5996     switch ( data.Which() ) {
5997     case CSeq_data::e_Iupacna:
5998         *str = &(data.GetIupacna().Get());
5999         break;
6000 
6001     case CSeq_data::e_Ncbi2na:
6002         *vec = &(data.GetNcbi2na().Get());
6003         break;
6004 
6005     case CSeq_data::e_Ncbi4na:
6006         *vec = &(data.GetNcbi4na().Get());
6007         break;
6008 
6009     case CSeq_data::e_Ncbi8na:
6010         *vec = &(data.GetNcbi8na().Get());
6011         break;
6012 
6013     case CSeq_data::e_Iupacaa:
6014         *str = &(data.GetIupacaa().Get());
6015         break;
6016 
6017     case CSeq_data::e_Ncbi8aa:
6018         *vec = &(data.GetNcbi8aa().Get());
6019         break;
6020 
6021     case CSeq_data::e_Ncbieaa:
6022         *str = &(data.GetNcbieaa().Get());
6023         break;
6024 
6025     case CSeq_data::e_Ncbistdaa:
6026         *vec = &(data.GetNcbistdaa().Get());
6027         break;
6028 
6029     case CSeq_data::e_not_set:
6030     case CSeq_data::e_Ncbipna:
6031     case CSeq_data::e_Ncbipaa:
6032     case CSeq_data::e_Gap:
6033         break;
6034     } // end of switch statement
6035 }
6036 
6037 
6038 // same as above, but takes a non-const CSeq_data object.
x_GetSeqFromSeqData(CSeq_data & data,string ** str,vector<char> ** vec) const6039 void CSeqportUtil_implementation::x_GetSeqFromSeqData
6040 (CSeq_data& data,
6041  string** str,
6042  vector<char>** vec)
6043     const
6044 {
6045     *str = 0;
6046     *vec = 0;
6047 
6048     switch ( data.Which() ) {
6049     case CSeq_data::e_Iupacna:
6050         *str = &(data.SetIupacna().Set());
6051         break;
6052 
6053     case CSeq_data::e_Ncbi2na:
6054         *vec = &(data.SetNcbi2na().Set());
6055         break;
6056 
6057     case CSeq_data::e_Ncbi4na:
6058         *vec = &(data.SetNcbi4na().Set());
6059         break;
6060 
6061     case CSeq_data::e_Ncbi8na:
6062         *vec = &(data.SetNcbi8na().Set());
6063         break;
6064 
6065     case CSeq_data::e_Iupacaa:
6066         *str = &(data.SetIupacaa().Set());
6067         break;
6068 
6069     case CSeq_data::e_Ncbi8aa:
6070         *vec = &(data.SetNcbi8aa().Set());
6071         break;
6072 
6073     case CSeq_data::e_Ncbieaa:
6074         *str = &(data.SetNcbieaa().Set());
6075         break;
6076 
6077     case CSeq_data::e_Ncbistdaa:
6078         *vec = &(data.SetNcbistdaa().Set());
6079         break;
6080 
6081     case CSeq_data::e_not_set:
6082     case CSeq_data::e_Ncbipna:
6083     case CSeq_data::e_Ncbipaa:
6084     case CSeq_data::e_Gap:
6085         break;
6086     } // end of switch statement
6087 }
6088 
6089 
6090 /////////////////////////////////////////////////////////////////////////////
6091 //  CSeqportUtil_implementation::sm_StrAsnData  --  some very long and ugly string
6092 //
6093 
6094 // local copy of seqcode.prt sequence alphabet and conversion table ASN.1
6095 const char* CSeqportUtil_implementation::sm_StrAsnData[] =
6096 {
6097     "-- This is the set of NCBI sequence code tables\n",
6098     "-- J.Ostell  10/18/91\n",
6099     "--\n",
6100     "\n",
6101     "Seq-code-set ::= {\n",
6102     " codes {                              -- codes\n",
6103     " {                                -- IUPACna\n",
6104     " code iupacna ,\n",
6105     " num 25 ,                     -- continuous 65-89\n",
6106     " one-letter TRUE ,            -- all one letter codes\n",
6107     " start-at 65 ,                -- starts with A, ASCII 65\n",
6108     " table {\n",
6109     " { symbol \"A\", name \"Adenine\" },\n",
6110     " { symbol \"B\" , name \"G or T or C\" },\n",
6111     " { symbol \"C\", name \"Cytosine\" },\n",
6112     " { symbol \"D\", name \"G or A or T\" },\n",
6113     " { symbol \"\", name \"\" },\n",
6114     " { symbol \"\", name \"\" },\n",
6115     " { symbol \"G\", name \"Guanine\" },\n",
6116     " { symbol \"H\", name \"A or C or T\" } ,\n",
6117     " { symbol \"\", name \"\" },\n",
6118     " { symbol \"\", name \"\" },\n",
6119     " { symbol \"K\", name \"G or T\" },\n",
6120     " { symbol \"\", name \"\"},\n",
6121     " { symbol \"M\", name \"A or C\" },\n",
6122     " { symbol \"N\", name \"A or G or C or T\" } ,\n",
6123     " { symbol \"\", name \"\" },\n",
6124     " { symbol \"\", name \"\" },\n",
6125     " { symbol \"\", name \"\"},\n",
6126     " { symbol \"R\", name \"G or A\"},\n",
6127     " { symbol \"S\", name \"G or C\"},\n",
6128     " { symbol \"T\", name \"Thymine\"},\n",
6129     " { symbol \"\", name \"\"},\n",
6130     " { symbol \"V\", name \"G or C or A\"},\n",
6131     " { symbol \"W\", name \"A or T\" },\n",
6132     " { symbol \"\", name \"\"},\n",
6133     " { symbol \"Y\", name \"T or C\"}\n",
6134     " } ,                           -- end of table\n",
6135     " comps {                      -- complements\n",
6136     " 84,\n",
6137     " 86,\n",
6138     " 71,\n",
6139     " 72,\n",
6140     " 69,\n",
6141     " 70,\n",
6142     " 67,\n",
6143     " 68,\n",
6144     " 73,\n",
6145     " 74,\n",
6146     " 77,\n",
6147     " 76,\n",
6148     " 75,\n",
6149     " 78,\n",
6150     " 79,\n",
6151     " 80,\n",
6152     " 81,\n",
6153     " 89,\n",
6154     " 83,\n",
6155     " 65,\n",
6156     " 85,\n",
6157     " 66,\n",
6158     " 87,\n",
6159     " 88,\n",
6160     " 82\n",
6161     " }\n",
6162     " } ,\n",
6163     " {                                -- IUPACaa\n",
6164     " code iupacaa ,\n",
6165     " num 26 ,                     -- continuous 65-90\n",
6166     " one-letter TRUE ,            -- all one letter codes\n",
6167     " start-at 65 ,                -- starts with A, ASCII 65\n",
6168     " table {\n",
6169     " { symbol \"A\", name \"Alanine\" },\n",
6170     " { symbol \"B\" , name \"Asp or Asn\" },\n",
6171     " { symbol \"C\", name \"Cysteine\" },\n",
6172     " { symbol \"D\", name \"Aspartic Acid\" },\n",
6173     " { symbol \"E\", name \"Glutamic Acid\" },\n",
6174     " { symbol \"F\", name \"Phenylalanine\" },\n",
6175     " { symbol \"G\", name \"Glycine\" },\n",
6176     " { symbol \"H\", name \"Histidine\" } ,\n",
6177     " { symbol \"I\", name \"Isoleucine\" },\n",
6178     " { symbol \"J\", name \"Leu or Ile\" },\n",
6179     " { symbol \"K\", name \"Lysine\" },\n",
6180     " { symbol \"L\", name \"Leucine\" },\n",
6181     " { symbol \"M\", name \"Methionine\" },\n",
6182     " { symbol \"N\", name \"Asparagine\" } ,\n",
6183     " { symbol \"O\", name \"Pyrrolysine\" },\n",
6184     " { symbol \"P\", name \"Proline\" },\n",
6185     " { symbol \"Q\", name \"Glutamine\"},\n",
6186     " { symbol \"R\", name \"Arginine\"},\n",
6187     " { symbol \"S\", name \"Serine\"},\n",
6188     " { symbol \"T\", name \"Threonine\"},\n",
6189     " { symbol \"U\", name \"Selenocysteine\"}, -- was empty\n",
6190     " { symbol \"V\", name \"Valine\"},\n",
6191     " { symbol \"W\", name \"Tryptophan\" },\n",
6192     " { symbol \"X\", name \"Undetermined or atypical\"},\n",
6193     " { symbol \"Y\", name \"Tyrosine\"},\n",
6194     " { symbol \"Z\", name \"Glu or Gln\" }\n",
6195     " }                            -- end of table            \n",
6196     " } ,\n",
6197     " {                                -- IUPACeaa\n",
6198     " code ncbieaa ,\n",
6199     " num 49 ,                     -- continuous 42-90\n",
6200     " one-letter TRUE ,            -- all one letter codes\n",
6201     " start-at 42 ,                -- starts with *, ASCII 42\n",
6202     " table {\n",
6203     " { symbol \"*\", name \"Termination\" } ,\n",
6204     " { symbol \"\", name \"\" } ,\n",
6205     " { symbol \"\", name \"\" } ,\n",
6206     " { symbol \"-\", name \"Gap\" } ,\n",
6207     " { symbol \"\", name \"\" } ,\n",
6208     " { symbol \"\", name \"\" } ,\n",
6209     " { symbol \"\", name \"\" } ,\n",
6210     " { symbol \"\", name \"\" } ,\n",
6211     " { symbol \"\", name \"\" } ,\n",
6212     " { symbol \"\", name \"\" } ,\n",
6213     " { symbol \"\", name \"\" } ,\n",
6214     " { symbol \"\", name \"\" } ,\n",
6215     " { symbol \"\", name \"\" } ,\n",
6216     " { symbol \"\", name \"\" } ,\n",
6217     " { symbol \"\", name \"\" } ,\n",
6218     " { symbol \"\", name \"\" } ,\n",
6219     " { symbol \"\", name \"\" } ,\n",
6220     " { symbol \"\", name \"\" } ,\n",
6221     " { symbol \"\", name \"\" } ,\n",
6222     " { symbol \"\", name \"\" } ,\n",
6223     " { symbol \"\", name \"\" } ,\n",
6224     " { symbol \"\", name \"\" } ,\n",
6225     " { symbol \"\", name \"\" } ,\n",
6226     " { symbol \"A\", name \"Alanine\" },\n",
6227     " { symbol \"B\" , name \"Asp or Asn\" },\n",
6228     " { symbol \"C\", name \"Cysteine\" },\n",
6229     " { symbol \"D\", name \"Aspartic Acid\" },\n",
6230     " { symbol \"E\", name \"Glutamic Acid\" },\n",
6231     " { symbol \"F\", name \"Phenylalanine\" },\n",
6232     " { symbol \"G\", name \"Glycine\" },\n",
6233     " { symbol \"H\", name \"Histidine\" } ,\n",
6234     " { symbol \"I\", name \"Isoleucine\" },\n",
6235     " { symbol \"J\", name \"Leu or Ile\" },\n",
6236     " { symbol \"K\", name \"Lysine\" },\n",
6237     " { symbol \"L\", name \"Leucine\" },\n",
6238     " { symbol \"M\", name \"Methionine\" },\n",
6239     " { symbol \"N\", name \"Asparagine\" } ,\n",
6240     " { symbol \"O\", name \"Pyrrolysine\" },\n",
6241     " { symbol \"P\", name \"Proline\" },\n",
6242     " { symbol \"Q\", name \"Glutamine\"},\n",
6243     " { symbol \"R\", name \"Arginine\"},\n",
6244     " { symbol \"S\", name \"Serine\"},\n",
6245     " { symbol \"T\", name \"Threonine\"},\n",
6246     " { symbol \"U\", name \"Selenocysteine\"},\n",
6247     " { symbol \"V\", name \"Valine\"},\n",
6248     " { symbol \"W\", name \"Tryptophan\" },\n",
6249     " { symbol \"X\", name \"Undetermined or atypical\"},\n",
6250     " { symbol \"Y\", name \"Tyrosine\"},\n",
6251     " { symbol \"Z\", name \"Glu or Gln\" }\n",
6252     " }                            -- end of table            \n",
6253     " } ,\n",
6254     " {                                -- IUPACaa3\n",
6255     " code iupacaa3 ,\n",
6256     " num 28 ,                     -- continuous 0-27\n",
6257     " one-letter FALSE ,            -- all 3 letter codes\n",
6258     " table {\n",
6259     " { symbol \"---\", name \"Gap\" } ,\n",
6260     " { symbol \"Ala\", name \"Alanine\" },\n",
6261     " { symbol \"Asx\" , name \"Asp or Asn\" },\n",
6262     " { symbol \"Cys\", name \"Cysteine\" },\n",
6263     " { symbol \"Asp\", name \"Aspartic Acid\" },\n",
6264     " { symbol \"Glu\", name \"Glutamic Acid\" },\n",
6265     " { symbol \"Phe\", name \"Phenylalanine\" },\n",
6266     " { symbol \"Gly\", name \"Glycine\" },\n",
6267     " { symbol \"His\", name \"Histidine\" } ,\n",
6268     " { symbol \"Ile\", name \"Isoleucine\" },\n",
6269     " { symbol \"Lys\", name \"Lysine\" },\n",
6270     " { symbol \"Leu\", name \"Leucine\" },\n",
6271     " { symbol \"Met\", name \"Methionine\" },\n",
6272     " { symbol \"Asn\", name \"Asparagine\" } ,\n",
6273     " { symbol \"Pro\", name \"Proline\" },\n",
6274     " { symbol \"Gln\", name \"Glutamine\"},\n",
6275     " { symbol \"Arg\", name \"Arginine\"},\n",
6276     " { symbol \"Ser\", name \"Serine\"},\n",
6277     " { symbol \"Thr\", name \"Threonine\"},\n",
6278     " { symbol \"Val\", name \"Valine\"},\n",
6279     " { symbol \"Trp\", name \"Tryptophan\" },\n",
6280     " { symbol \"Xxx\", name \"Undetermined or atypical\"},\n",
6281     " { symbol \"Tyr\", name \"Tyrosine\"},\n",
6282     " { symbol \"Glx\", name \"Glu or Gln\" },\n",
6283     " { symbol \"Sec\", name \"Selenocysteine\"},\n",
6284     " { symbol \"Ter\", name \"Termination\" },\n",
6285     " { symbol \"Pyl\", name \"Pyrrolysine\"},\n",
6286     " { symbol \"Xle\", name \"Leu or Ile\"}\n",
6287     " }                            -- end of table            \n",
6288     " } ,\n",
6289     " {                                -- NCBIstdaa\n",
6290     " code ncbistdaa ,\n",
6291     " num 28 ,                     -- continuous 0-27\n",
6292     " one-letter TRUE ,            -- all one letter codes\n",
6293     " table {\n",
6294     " { symbol \"-\", name \"Gap\" } ,                -- 0\n",
6295     " { symbol \"A\", name \"Alanine\" },             -- 1\n",
6296     " { symbol \"B\" , name \"Asp or Asn\" },         -- 2\n",
6297     " { symbol \"C\", name \"Cysteine\" },            -- 3\n",
6298     " { symbol \"D\", name \"Aspartic Acid\" },       -- 4\n",
6299     " { symbol \"E\", name \"Glutamic Acid\" },       -- 5\n",
6300     " { symbol \"F\", name \"Phenylalanine\" },       -- 6\n",
6301     " { symbol \"G\", name \"Glycine\" },             -- 7\n",
6302     " { symbol \"H\", name \"Histidine\" } ,          -- 8\n",
6303     " { symbol \"I\", name \"Isoleucine\" },          -- 9\n",
6304     " { symbol \"K\", name \"Lysine\" },              -- 10\n",
6305     " { symbol \"L\", name \"Leucine\" },             -- 11\n",
6306     " { symbol \"M\", name \"Methionine\" },          -- 12\n",
6307     " { symbol \"N\", name \"Asparagine\" } ,         -- 13\n",
6308     " { symbol \"P\", name \"Proline\" },             -- 14\n",
6309     " { symbol \"Q\", name \"Glutamine\"},            -- 15\n",
6310     " { symbol \"R\", name \"Arginine\"},             -- 16\n",
6311     " { symbol \"S\", name \"Serine\"},               -- 17\n",
6312     " { symbol \"T\", name \"Threoine\"},             -- 18\n",
6313     " { symbol \"V\", name \"Valine\"},               -- 19\n",
6314     " { symbol \"W\", name \"Tryptophan\" },          -- 20\n",
6315     " { symbol \"X\", name \"Undetermined or atypical\"},  -- 21\n",
6316     " { symbol \"Y\", name \"Tyrosine\"},             -- 22\n",
6317     " { symbol \"Z\", name \"Glu or Gln\" },          -- 23\n",
6318     " { symbol \"U\", name \"Selenocysteine\"},       -- 24 \n",
6319     " { symbol \"*\", name \"Termination\" },         -- 25\n",
6320     " { symbol \"O\", name \"Pyrrolysine\" },         -- 26\n",
6321     " { symbol \"J\", name \"Leu or Ile\" }           -- 27\n",
6322     " }                            -- end of table            \n",
6323     " } ,\n",
6324     " {                                -- NCBI2na\n",
6325     " code ncbi2na ,\n",
6326     " num 4 ,                     -- continuous 0-3\n",
6327     " one-letter TRUE ,            -- all one letter codes\n",
6328     " table {\n",
6329     " { symbol \"A\", name \"Adenine\" },\n",
6330     " { symbol \"C\", name \"Cytosine\" },\n",
6331     " { symbol \"G\", name \"Guanine\" },\n",
6332     " { symbol \"T\", name \"Thymine/Uracil\"}\n",
6333     " } ,                          -- end of table            \n",
6334     " comps {                      -- complements\n",
6335     " 3,\n",
6336     " 2,\n",
6337     " 1,\n",
6338     " 0\n",
6339     " }\n",
6340     " } ,\n",
6341     " {                                -- NCBI4na\n",
6342     " code ncbi4na ,\n",
6343     " num 16 ,                     -- continuous 0-15\n",
6344     " one-letter TRUE ,            -- all one letter codes\n",
6345     " table {\n",
6346     " { symbol \"-\", name \"Gap\" } ,\n",
6347     " { symbol \"A\", name \"Adenine\" },\n",
6348     " { symbol \"C\", name \"Cytosine\" },\n",
6349     " { symbol \"M\", name \"A or C\" },\n",
6350     " { symbol \"G\", name \"Guanine\" },\n",
6351     " { symbol \"R\", name \"G or A\"},\n",
6352     " { symbol \"S\", name \"G or C\"},\n",
6353     " { symbol \"V\", name \"G or C or A\"},\n",
6354     " { symbol \"T\", name \"Thymine/Uracil\"},\n",
6355     " { symbol \"W\", name \"A or T\" },\n",
6356     " { symbol \"Y\", name \"T or C\"} ,\n",
6357     " { symbol \"H\", name \"A or C or T\" } ,\n",
6358     " { symbol \"K\", name \"G or T\" },\n",
6359     " { symbol \"D\", name \"G or A or T\" },\n",
6360     " { symbol \"B\" , name \"G or T or C\" },\n",
6361     " { symbol \"N\", name \"A or G or C or T\" }\n",
6362     " } ,                           -- end of table            \n",
6363     " comps {                       -- complements\n",
6364     " 0 ,\n",
6365     " 8 ,\n",
6366     " 4 ,\n",
6367     " 12,\n",
6368     " 2 ,\n",
6369     " 10,\n",
6370     " 6 ,\n",
6371     " 14,\n",
6372     " 1 ,\n",
6373     " 9 ,\n",
6374     " 5 ,\n",
6375     " 13,\n",
6376     " 3 ,\n",
6377     " 11,\n",
6378     " 7 ,\n",
6379     " 15\n",
6380     " }\n",
6381     " }\n",
6382     " } ,                                  -- end of codes\n",
6383     " maps {\n",
6384     " {\n",
6385     " from iupacna ,\n",
6386     " to ncbi2na ,\n",
6387     " num 25 ,\n",
6388     " start-at 65 ,\n",
6389     " table {\n",
6390     " 0,     -- A -> A\n",
6391     " 1,     -- B -> C\n",
6392     " 1,     -- C -> C\n",
6393     " 2,     -- D -> G\n",
6394     " 255,\n",
6395     " 255,\n",
6396     " 2,     -- G -> G\n",
6397     " 0,     -- H -> A\n",
6398     " 255,\n",
6399     " 255,\n",
6400     " 2,     -- K -> G\n",
6401     " 255,\n",
6402     " 1,     -- M -> C\n",
6403     " 0,     -- N -> A\n",
6404     " 255,\n",
6405     " 255,\n",
6406     " 255,\n",
6407     " 2,     -- R -> G\n",
6408     " 1,     -- S -> C\n",
6409     " 3,     -- T -> T\n",
6410     " 255,\n",
6411     " 0,     -- V -> A\n",
6412     " 3,     -- W -> T\n",
6413     " 255,\n",
6414     " 3 }    -- Y -> T\n",
6415     " } ,\n",
6416     " {\n",
6417     " from iupacna ,\n",
6418     " to ncbi4na ,\n",
6419     " num 26 ,\n",
6420     " start-at 64 ,\n",
6421     " table {\n",
6422     "		0,     -- @ used by FastaToSeqEntry to convert hyphen to gap\n",
6423     " 1,     -- A\n",
6424     " 14,    -- B\n",
6425     " 2,     -- C\n",
6426     " 13,    -- D\n",
6427     " 255,\n",
6428     " 255,\n",
6429     " 4,     -- G\n",
6430     " 11,    -- H\n",
6431     " 255,\n",
6432     " 255,\n",
6433     " 12,    -- K\n",
6434     " 255,\n",
6435     " 3,     -- M\n",
6436     " 15,    -- N\n",
6437     " 255,\n",
6438     " 255,\n",
6439     " 255,\n",
6440     " 5,     -- R\n",
6441     " 6,     -- S\n",
6442     " 8,     -- T\n",
6443     " 255,\n",
6444     " 7,     -- V\n",
6445     " 9,     -- W\n",
6446     " 255,\n",
6447     " 10 }   -- Y\n",
6448     " } ,\n",
6449     " {\n",
6450     " from ncbi2na ,\n",
6451     " to iupacna ,\n",
6452     " num 4 ,\n",
6453     " table {\n",
6454     " 65,     -- A\n",
6455     " 67,     -- C\n",
6456     " 71,     -- G\n",
6457     " 84 }    -- T\n",
6458     " } ,\n",
6459     " {\n",
6460     " from ncbi2na ,\n",
6461     " to ncbi4na ,\n",
6462     " num 4 ,\n",
6463     " table {\n",
6464     " 1,     -- A\n",
6465     " 2,     -- C\n",
6466     " 4,     -- G\n",
6467     " 8 }    -- T\n",
6468     " } ,\n",
6469     " {\n",
6470     " from ncbi4na ,\n",
6471     " to iupacna ,\n",
6472     " num 16 ,\n",
6473     " table {\n",
6474     " 78,    -- gap -> N\n",
6475     " 65,    -- A\n",
6476     " 67,    -- C\n",
6477     " 77,    -- M\n",
6478     " 71,    -- G\n",
6479     " 82,    -- R\n",
6480     " 83,    -- S\n",
6481     " 86,    -- V\n",
6482     " 84,    -- T\n",
6483     " 87,    -- W\n",
6484     " 89,    -- Y\n",
6485     " 72,    -- H\n",
6486     " 75,    -- K\n",
6487     " 68,    -- D\n",
6488     " 66,    -- B\n",
6489     " 78 }   -- N\n",
6490     " } ,\n",
6491     " {\n",
6492     " from ncbi4na ,\n",
6493     " to ncbi2na ,\n",
6494     " num 16 ,\n",
6495     " table {\n",
6496     " 3,    -- gap -> T\n",
6497     " 0,    -- A -> A\n",
6498     " 1,    -- C -> C\n",
6499     " 1,    -- M -> C\n",
6500     " 2,    -- G -> G\n",
6501     " 2,    -- R -> G\n",
6502     " 1,    -- S -> C\n",
6503     " 0,    -- V -> A\n",
6504     " 3,    -- T -> T\n",
6505     " 3,    -- W -> T\n",
6506     " 3,    -- Y -> T\n",
6507     " 0,    -- H -> A\n",
6508     " 2,    -- K -> G\n",
6509     " 2,    -- D -> G\n",
6510     " 1,    -- B -> C\n",
6511     " 0 }   -- N -> A\n",
6512     " } ,\n",
6513     " {\n",
6514     " from iupacaa ,\n",
6515     " to ncbieaa ,\n",
6516     " num 26 ,\n",
6517     " start-at 65 ,\n",
6518     " table {\n",
6519     " 65 ,    -- they map directly\n",
6520     " 66 ,\n",
6521     " 67 ,\n",
6522     " 68,\n",
6523     " 69,\n",
6524     " 70,\n",
6525     " 71,\n",
6526     " 72,\n",
6527     " 73,\n",
6528     " 74,  -- J - was 255\n",
6529     " 75,\n",
6530     " 76,\n",
6531     " 77,\n",
6532     " 78,\n",
6533     " 79,  -- O - was 255\n",
6534     " 80,\n",
6535     " 81,\n",
6536     " 82,\n",
6537     " 83,\n",
6538     " 84,\n",
6539     " 85,  -- U - was 255\n",
6540     " 86,\n",
6541     " 87,\n",
6542     " 88,\n",
6543     " 89,\n",
6544     " 90 }\n",
6545     " } ,\n",
6546     " {\n",
6547     " from ncbieaa ,\n",
6548     " to iupacaa ,\n",
6549     " num 49 ,\n",
6550     " start-at 42 ,\n",
6551     " table {\n",
6552     " 88 ,   -- termination -> X\n",
6553     " 255,\n",
6554     " 255,\n",
6555     " 88,    -- Gap -> X\n",
6556     " 255,\n",
6557     " 255,\n",
6558     " 255,\n",
6559     " 255,\n",
6560     " 255,\n",
6561     " 255,\n",
6562     " 255,\n",
6563     " 255,\n",
6564     " 255,\n",
6565     " 255,\n",
6566     " 255,\n",
6567     " 255,\n",
6568     " 255,\n",
6569     " 255,\n",
6570     " 255,\n",
6571     " 255,\n",
6572     " 255,\n",
6573     " 255,\n",
6574     " 255,\n",
6575     " 65 ,    -- from here they map directly\n",
6576     " 66 ,\n",
6577     " 67 ,\n",
6578     " 68,\n",
6579     " 69,\n",
6580     " 70,\n",
6581     " 71,\n",
6582     " 72,\n",
6583     " 73,\n",
6584     " 74,  -- J - was 255\n",
6585     " 75,\n",
6586     " 76,\n",
6587     " 77,\n",
6588     " 78,\n",
6589     " 79,  -- O - was 255\n",
6590     " 80,\n",
6591     " 81,\n",
6592     " 82,\n",
6593     " 83,\n",
6594     " 84,\n",
6595     " 85,  -- U was -> X 88\n",
6596     " 86,\n",
6597     " 87,\n",
6598     " 88,\n",
6599     " 89,\n",
6600     " 90 }\n",
6601     " } ,\n",
6602     " {\n",
6603     " from iupacaa ,\n",
6604     " to ncbistdaa ,\n",
6605     " num 26 ,\n",
6606     " start-at 65 ,\n",
6607     " table {\n",
6608     " 1 ,    -- they map directly\n",
6609     " 2 ,\n",
6610     " 3 ,\n",
6611     " 4,\n",
6612     " 5,\n",
6613     " 6,\n",
6614     " 7,\n",
6615     " 8,\n",
6616     " 9,\n",
6617     " 27,  -- J - was 255\n",
6618     " 10,\n",
6619     " 11,\n",
6620     " 12,\n",
6621     " 13,\n",
6622     " 26,  -- O - was 255\n",
6623     " 14,\n",
6624     " 15,\n",
6625     " 16,\n",
6626     " 17,\n",
6627     " 18,\n",
6628     " 24,  -- U - was 255\n",
6629     " 19,\n",
6630     " 20,\n",
6631     " 21,\n",
6632     "				22,\n",
6633     " 23 }\n",
6634     " } ,\n",
6635     " {\n",
6636     " from ncbieaa ,\n",
6637     " to ncbistdaa ,\n",
6638     " num 49 ,\n",
6639     " start-at 42 ,\n",
6640     " table {\n",
6641     " 25,   -- termination\n",
6642     " 255,\n",
6643     " 255,\n",
6644     " 0,    -- Gap\n",
6645     " 255,\n",
6646     " 255,\n",
6647     " 255,\n",
6648     " 255,\n",
6649     " 255,\n",
6650     " 255,\n",
6651     " 255,\n",
6652     " 255,\n",
6653     " 255,\n",
6654     " 255,\n",
6655     " 255,\n",
6656     " 255,\n",
6657     " 255,\n",
6658     " 255,\n",
6659     " 255,\n",
6660     " 255,\n",
6661     " 255,\n",
6662     " 255,\n",
6663     " 255,\n",
6664     " 1 ,    -- they map directly\n",
6665     " 2 ,\n",
6666     " 3 ,\n",
6667     " 4,\n",
6668     " 5,\n",
6669     " 6,\n",
6670     " 7,\n",
6671     " 8,\n",
6672     " 9,\n",
6673     " 27,  -- J - was 255\n",
6674     " 10,\n",
6675     " 11,\n",
6676     " 12,\n",
6677     " 13,\n",
6678     " 26,  -- O - was 255\n",
6679     " 14,\n",
6680     " 15,\n",
6681     " 16,\n",
6682     " 17,\n",
6683     " 18,\n",
6684     " 24,  -- U\n",
6685     " 19,\n",
6686     " 20,\n",
6687     " 21,\n",
6688     " 22,\n",
6689     " 23 }\n",
6690     " }  ,\n",
6691     " {\n",
6692     " from ncbistdaa ,\n",
6693     " to ncbieaa ,\n",
6694     " num 28 ,\n",
6695     " table {\n",
6696     " 45 ,  --   \"-\"\n",
6697     " 65 ,    -- they map directly with holes for O and J\n",
6698     " 66 ,\n",
6699     " 67 ,\n",
6700     " 68,\n",
6701     " 69,\n",
6702     " 70,\n",
6703     " 71,\n",
6704     " 72,\n",
6705     " 73,\n",
6706     " 75,\n",
6707     " 76,\n",
6708     " 77,\n",
6709     " 78,\n",
6710     " 80,\n",
6711     " 81,\n",
6712     " 82,\n",
6713     " 83,\n",
6714     " 84,\n",
6715     " 86,\n",
6716     " 87,\n",
6717     " 88,\n",
6718     " 89,\n",
6719     " 90,\n",
6720     " 85,	 -- U\n",
6721     " 42,  -- *\n",
6722     " 79,	 -- O - new\n",
6723     " 74}  -- J - new\n",
6724     " } ,\n",
6725     " {\n",
6726     " from ncbistdaa ,\n",
6727     " to iupacaa ,\n",
6728     " num 28 ,\n",
6729     " table {\n",
6730     " 255 ,  --   \"-\"\n",
6731     " 65 ,    -- they map directly with holes for O and J\n",
6732     " 66 ,\n",
6733     " 67 ,\n",
6734     " 68,\n",
6735     " 69,\n",
6736     " 70,\n",
6737     " 71,\n",
6738     " 72,\n",
6739     " 73,\n",
6740     " 75,\n",
6741     " 76,\n",
6742     " 77,\n",
6743     " 78,\n",
6744     " 80,\n",
6745     " 81,\n",
6746     " 82,\n",
6747     " 83,\n",
6748     " 84,\n",
6749     " 86,\n",
6750     " 87,\n",
6751     " 88,\n",
6752     " 89,\n",
6753     " 90,\n",
6754     " 85,  -- U - was 88\n",
6755     " 255, -- *\n",
6756     " 79,	 -- O - new\n",
6757     " 74}  -- J - new\n",
6758     " } \n",
6759     " }\n",
6760     "-- end of seq-code-set -- }", // make sure '}' is last symbol of ASN text
6761     0  // to indicate that there is no more data
6762 };
6763 
6764 
6765 END_objects_SCOPE
6766 END_NCBI_SCOPE
6767