1 /*$Id: seqport_util.cpp 634628 2021-07-15 17:30:59Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Clifford Clausen
27 * (also reviewed/fixed/groomed by Denis Vakatov and Aaron Ucko)
28 *
29 * File Description:
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <objects/seq/seqport_util.hpp>
34
35 #include <serial/serial.hpp>
36 #include <serial/objostr.hpp>
37 #include <serial/objistr.hpp>
38
39 #include <objects/seq/NCBI2na.hpp>
40 #include <objects/seq/NCBI4na.hpp>
41 #include <objects/seq/NCBI8na.hpp>
42 #include <objects/seq/NCBI8aa.hpp>
43 #include <objects/seq/IUPACna.hpp>
44 #include <objects/seq/IUPACaa.hpp>
45 #include <objects/seq/NCBIeaa.hpp>
46 #include <objects/seq/NCBIstdaa.hpp>
47 #include <objects/seq/NCBIpaa.hpp>
48
49 #include <objects/seqcode/Seq_code_set.hpp>
50 #include <objects/seqcode/Seq_code_table.hpp>
51 #include <objects/seqcode/Seq_code_type.hpp>
52 #include <objects/seqcode/Seq_map_table.hpp>
53
54 #include <util/sequtil/sequtil.hpp>
55 #include <util/sequtil/sequtil_convert.hpp>
56 #include <util/sequtil/sequtil_manip.hpp>
57 #include <util/random_gen.hpp>
58 #include <corelib/ncbi_safe_static.hpp>
59
60 #include <algorithm>
61 #include <string.h>
62
63
64 BEGIN_NCBI_SCOPE
65 BEGIN_objects_SCOPE
66
67 static const bool kSymbol = true;
68 static const bool kName = false;
69 static const unsigned int kNumCodes = 11;
70
EChoiceToESeq(CSeq_data::E_Choice from_type)71 static inline ESeq_code_type EChoiceToESeq (CSeq_data::E_Choice from_type)
72 {
73 switch (from_type) {
74 case CSeq_data::e_Iupacaa:
75 return eSeq_code_type_iupacaa;
76 case CSeq_data::e_Ncbi2na:
77 return eSeq_code_type_ncbi2na;
78 case CSeq_data::e_Ncbi4na:
79 return eSeq_code_type_ncbi4na;
80 case CSeq_data::e_Iupacna:
81 return eSeq_code_type_iupacna;
82 case CSeq_data::e_Ncbieaa:
83 return eSeq_code_type_ncbieaa;
84 case CSeq_data::e_Ncbistdaa:
85 return eSeq_code_type_ncbistdaa;
86 case CSeq_data::e_Ncbi8na:
87 return eSeq_code_type_ncbi8na;
88 case CSeq_data::e_Ncbipna:
89 return eSeq_code_type_ncbipna;
90 case CSeq_data::e_Ncbi8aa:
91 return eSeq_code_type_ncbi8aa;
92 case CSeq_data::e_Ncbipaa:
93 return eSeq_code_type_ncbipaa;
94 default:
95 throw CSeqportUtil::CBadType("EChoiceToESeq");
96 }
97 }
98
99 // CSeqportUtil_implementation is a singleton.
100
101 class CSeqportUtil_implementation {
102 public:
103 CSeqportUtil_implementation();
104 ~CSeqportUtil_implementation();
105
106 typedef CSeqportUtil::TIndex TIndex;
107 typedef CSeqportUtil::TPair TPair;
108
109 TSeqPos Convert
110 (const CSeq_data& in_seq,
111 CSeq_data* out_seq,
112 CSeq_data::E_Choice to_code,
113 TSeqPos uBeginIdx,
114 TSeqPos uLength,
115 bool bAmbig,
116 CRandom::TValue seed,
117 TSeqPos total_length = 0,
118 TSeqPos* out_seq_length = 0,
119 vector<Uint4>* blast_ambig = 0)
120 const;
121
122 TSeqPos Pack
123 (CSeq_data* in_seq,
124 TSeqPos uLength)
125 const;
126
127 bool FastValidate
128 (const CSeq_data& in_seq,
129 TSeqPos uBeginIdx,
130 TSeqPos uLength)
131 const;
132
133 void Validate
134 (const CSeq_data& in_seq,
135 vector<TSeqPos>* badIdx,
136 TSeqPos uBeginIdx,
137 TSeqPos uLength)
138 const;
139
140 TSeqPos GetAmbigs
141 (const CSeq_data& in_seq,
142 CSeq_data* out_seq,
143 vector<TSeqPos>* out_indices,
144 CSeq_data::E_Choice to_code,
145 TSeqPos uBeginIdx,
146 TSeqPos uLength)
147 const;
148
149 TSeqPos GetCopy
150 (const CSeq_data& in_seq,
151 CSeq_data* out_seq,
152 TSeqPos uBeginIdx,
153 TSeqPos uLength)
154 const;
155
156 TSeqPos Keep
157 (CSeq_data* in_seq,
158 TSeqPos uBeginIdx,
159 TSeqPos uLength)
160 const;
161
162 TSeqPos Append
163 (CSeq_data* out_seq,
164 const CSeq_data& in_seq1,
165 TSeqPos uBeginIdx1,
166 TSeqPos uLength1,
167 const CSeq_data& in_seq2,
168 TSeqPos uBeginIdx2,
169 TSeqPos uLength2)
170 const;
171
172 TSeqPos Complement
173 (CSeq_data* in_seq,
174 TSeqPos uBeginIdx,
175 TSeqPos uLength)
176 const;
177
178 TSeqPos Complement
179 (const CSeq_data& in_seq,
180 CSeq_data* out_seq,
181 TSeqPos uBeginIdx,
182 TSeqPos uLength)
183 const;
184
185 TSeqPos Reverse
186 (CSeq_data* in_seq,
187 TSeqPos uBeginIdx,
188 TSeqPos uLength)
189 const;
190
191 TSeqPos Reverse
192 (const CSeq_data& in_seq,
193 CSeq_data* out_seq,
194 TSeqPos uBeginIdx,
195 TSeqPos uLength)
196 const;
197
198 TSeqPos ReverseComplement
199 (CSeq_data* in_seq,
200 TSeqPos uBeginIdx,
201 TSeqPos uLength)
202 const;
203
204 TSeqPos ReverseComplement
205 (const CSeq_data& in_seq,
206 CSeq_data* out_seq,
207 TSeqPos uBeginIdx,
208 TSeqPos uLength)
209 const;
210
211 const string& GetIupacaa3(TIndex ncbistdaa);
212
213 bool IsCodeAvailable(CSeq_data::E_Choice code_type);
214
215 bool IsCodeAvailable(ESeq_code_type code_type);
216
217 TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type);
218
219 TPair GetCodeIndexFromTo(ESeq_code_type code_type);
220
221 const string& GetCodeOrName(CSeq_data::E_Choice code_type,
222 TIndex idx,
223 bool get_code);
224
225 const string& GetCodeOrName(ESeq_code_type code_type,
226 TIndex idx,
227 bool get_code);
228
229 TIndex GetIndex(CSeq_data::E_Choice code_type,
230 const string& code);
231
232 TIndex GetIndex(ESeq_code_type code_type,
233 const string& code);
234
235 TIndex GetIndexComplement(CSeq_data::E_Choice code_type,
236 TIndex idx);
237
238 TIndex GetIndexComplement(ESeq_code_type code_type,
239 TIndex idx);
240
241 TIndex GetMapToIndex(CSeq_data::E_Choice from_type,
242 CSeq_data::E_Choice to_type,
243 TIndex from_idx);
244
245 TIndex GetMapToIndex(ESeq_code_type from_type,
246 ESeq_code_type to_type,
247 TIndex from_idx);
248 // Template wrapper class used to create data type specific
249 // classes to delete code tables on exit from main
250 template <class T>
251 class CWrapper_table : public CObject
252 {
253 public:
CWrapper_table(int size,int start)254 CWrapper_table(int size, int start)
255 {
256 m_Table = new T[256];
257 m_StartAt = start;
258 m_Size = size;
259 }
~CWrapper_table()260 ~CWrapper_table() {
261 drop_table();
262 }
drop_table()263 void drop_table()
264 {
265 delete[] m_Table;
266 m_Table = 0;
267 }
268
269 T* m_Table;
270 int m_StartAt;
271 int m_Size;
272 };
273
274 // Template wrapper class used for two-dimensional arrays.
275 template <class T>
276 class CWrapper_2D : public CObject
277 {
278 public:
CWrapper_2D(int size1,int start1,int size2,int start2)279 CWrapper_2D(int size1, int start1, int size2, int start2)
280 {
281 m_Size_D1 = size1;
282 m_Size_D2 = size2;
283 m_StartAt_D1 = start1;
284 m_StartAt_D2 = start2;
285 m_Table = new T*[size1];
286 for(int i=0; i<size1; i++)
287 {
288 m_Table[i] = new T[size2] - start2;
289 }
290 m_Table -= start1;
291 }
~CWrapper_2D()292 ~CWrapper_2D()
293 {
294 m_Table += m_StartAt_D1;
295 for(int i=0; i<m_Size_D1; i++)
296 {
297 delete[](m_Table[i] + m_StartAt_D2);
298 }
299 delete[] m_Table;
300 }
301
302 T** m_Table;
303 int m_Size_D1;
304 int m_Size_D2;
305 int m_StartAt_D1;
306 int m_StartAt_D2;
307 };
308
309 // Typedefs making use of wrapper classes above.
310 typedef CWrapper_table<char> CCode_table;
311 typedef CWrapper_table<string> CCode_table_str;
312 typedef CWrapper_table<int> CMap_table;
313 typedef CWrapper_table<unsigned int> CFast_table4;
314 typedef CWrapper_table<unsigned short> CFast_table2;
315 typedef CWrapper_table<unsigned char> CAmbig_detect;
316 typedef CWrapper_table<char> CCode_comp;
317 typedef CWrapper_table<char> CCode_rev;
318
319 typedef CWrapper_2D<unsigned char> CFast_4_1;
320 typedef CWrapper_2D<unsigned char> CFast_2_1;
321
322 private:
323 // String to initialize CSeq_code_set
324 // This string is initialized in seqport_util.h
325 static const char* sm_StrAsnData[];
326
327 // CSeq_code_set member holding code and map table data
328 CRef<CSeq_code_set> m_SeqCodeSet;
329
330 // Helper function used internally to initialize m_SeqCodeSet
331 CRef<CSeq_code_set> Init();
332
333 // Member variables holding code tables
334 CRef<CCode_table> m_Iupacna;
335 CRef<CCode_table> m_Ncbieaa;
336 CRef<CCode_table> m_Ncbistdaa;
337 CRef<CCode_table> m_Iupacaa;
338
339 // Helper function to initialize code tables
340 CRef<CCode_table> InitCodes(ESeq_code_type code_type);
341
342 // Member variables holding na complement information
343 CRef<CCode_comp> m_Iupacna_complement;
344 CRef<CCode_comp> m_Ncbi2naComplement;
345 CRef<CCode_comp> m_Ncbi4naComplement;
346
347 // Helper functions to initialize complement tables
348 CRef<CCode_comp> InitIupacnaComplement();
349 CRef<CCode_comp> InitNcbi2naComplement();
350 CRef<CCode_comp> InitNcbi4naComplement();
351
352 // Member variables holding na reverse information
353 // Used to reverse residues packed within a byte.
354 CRef<CCode_rev> m_Ncbi2naRev;
355 CRef<CCode_rev> m_Ncbi4naRev;
356
357 // Helper functions to initialize reverse tables
358 CRef<CCode_rev> InitNcbi2naRev();
359 CRef<CCode_rev> InitNcbi4naRev();
360
361 // Member variables holding map tables
362
363 CRef<CMap_table> m_Ncbi2naIupacna;
364 CRef<CMap_table> m_Ncbi2naNcbi4na;
365 CRef<CMap_table> m_Ncbi4naIupacna;
366 CRef<CMap_table> m_IupacnaNcbi2na;
367 CRef<CMap_table> m_IupacnaNcbi4na;
368 CRef<CMap_table> m_Ncbi4naNcbi2na;
369 CRef<CMap_table> m_IupacaaNcbieaa;
370 CRef<CMap_table> m_NcbieaaIupacaa;
371 CRef<CMap_table> m_IupacaaNcbistdaa;
372 CRef<CMap_table> m_NcbieaaNcbistdaa;
373 CRef<CMap_table> m_NcbistdaaNcbieaa;
374 CRef<CMap_table> m_NcbistdaaIupacaa;
375
376
377 TSeqPos x_ConvertAmbig
378 (const CSeq_data& in_seq,
379 CSeq_data* out_seq,
380 CSeq_data::E_Choice to_code,
381 TSeqPos uBeginIdx,
382 TSeqPos uLength,
383 CRandom::TValue seed,
384 TSeqPos total_length = 0,
385 TSeqPos* out_seq_length = 0,
386 vector<Uint4>* blast_ambig = 0)
387 const;
388
389 // Helper function to initialize map tables
390 CRef<CMap_table> InitMaps(ESeq_code_type from_type,
391 ESeq_code_type to_type);
392
393 // Member variables holding fast conversion tables
394
395 // Takes a byte as an index and returns a unsigned int with
396 // 4 characters, each character being one of ATGC
397 //CRef<CFast_table4> m_FastNcbi2naIupacna;
398
399 // Takes a byte (each byte with 4 Ncbi2na codes) as an index and
400 // returns a Unit2 with 2 bytes, each byte formated as 2 Ncbi4na codes
401 //CRef<CFast_table2> m_FastNcbi2naNcbi4na;
402
403 // Takes a byte (each byte with 2 Ncbi4na codes) as an index and
404 // returns a 2 byte string, each byte with an Iupacna code.
405 //CRef<CFast_table2> m_FastNcbi4naIupacna;
406
407 // Table used for fast compression from Iupacna to Ncbi2na (4 bytes to 1
408 // byte). This table is a 2 dimensional table. The first dimension
409 // corresponds to the iupacna position modulo 4 (0-3). The second dimension
410 // is the value of the iupacna byte (0-255). The 4 resulting values from 4
411 // iupancna bytes are bitwise or'd to produce 1 byte.
412 CRef<CFast_4_1> m_FastIupacnaNcbi2na;
413
414 // Table used for fast compression from Iupacna to Ncbi4na
415 // (2 bytes to 1 byte). Similar to m_FastIupacnaNcbi2na
416 CRef<CFast_2_1> m_FastIupacnaNcbi4na;
417
418 // Table used for fast compression from Ncbi4na to Ncbi2na
419 // (2 bytes to 1 byte). Similar to m_FastIupacnaNcbi4na
420 CRef<CFast_2_1> m_FastNcbi4naNcbi2na;
421
422 // Tables used to convert an index for a code type to a symbol or name
423 // for the same code type
424 vector<vector<string> > m_IndexString[2];
425 vector<vector<TIndex> > m_IndexComplement;
426 vector<map<string, TIndex> > m_StringIndex;
427 vector<TIndex> m_StartAt;
428
429 // Helper function to initialize fast conversion tables
430 //CRef<CFast_table4> InitFastNcbi2naIupacna();
431 CRef<CFast_table2> InitFastNcbi2naNcbi4na();
432 CRef<CFast_table2> InitFastNcbi4naIupacna();
433 CRef<CFast_4_1> InitFastIupacnaNcbi2na();
434 CRef<CFast_2_1> InitFastIupacnaNcbi4na();
435 CRef<CFast_2_1> InitFastNcbi4naNcbi2na();
436
437 // Helper functions to initialize Index to/from code/name conversion tables
438 // and complement tables
439 void InitIndexCodeName();
440
441 // Data members and functions used for random disambiguation
442
443 // structure used for ncbi4na --> ncbi2na
444 struct SMasksArray : public CObject
445 {
446 // Structure to hold all masks applicable to an input byte
447 struct SMasks {
448 int nMasks;
449 unsigned char cMask[16];
450 };
451 SMasks m_Table[256];
452 };
453
454 CRef<SMasksArray> m_Masks;
455
456 // Helper function to initialize m_Masks
457 CRef<SMasksArray> InitMasks();
458
459 // Data members used for detecting ambiguities
460
461 // Data members used by GetAmbig methods to get a list of
462 // ambiguities resulting from alphabet conversions
463 CRef<CAmbig_detect> m_DetectAmbigNcbi4naNcbi2na;
464 CRef<CAmbig_detect> m_DetectAmbigIupacnaNcbi2na;
465
466 // Helper functiond to initialize m_Detect_Ambig_ data members
467 CRef<CAmbig_detect> InitAmbigNcbi4naNcbi2na();
468 CRef<CAmbig_detect> InitAmbigIupacnaNcbi2na();
469
470 // Alphabet conversion functions. Functions return
471 // the number of converted codes.
472
473 /*
474 // Fuction to convert ncbi2na (1 byte) to iupacna (4 bytes)
475 TSeqPos MapNcbi2naToIupacna(const CSeq_data& in_seq,
476 CSeq_data* out_seq,
477 TSeqPos uBeginIdx,
478 TSeqPos uLength)
479 const;
480
481 // Function to convert ncbi2na (1 byte) to ncbi4na (2 bytes)
482 TSeqPos MapNcbi2naToNcbi4na(const CSeq_data& in_seq,
483 CSeq_data* out_seq,
484 TSeqPos uBeginIdx,
485 TSeqPos uLength)
486 const;
487
488 // Function to convert ncbi4na (1 byte) to iupacna (2 bytes)
489 TSeqPos MapNcbi4naToIupacna(const CSeq_data& in_seq,
490 CSeq_data* out_seq,
491 TSeqPos uBeginIdx,
492 TSeqPos uLength)
493 const;
494 */
495 // Function to convert iupacna (4 bytes) to ncbi2na (1 byte)
496 TSeqPos MapIupacnaToNcbi2na(const CSeq_data& in_seq,
497 CSeq_data* out_seq,
498 TSeqPos uBeginIdx,
499 TSeqPos uLength,
500 bool bAmbig,
501 CRandom::TValue seed,
502 TSeqPos total_length,
503 TSeqPos* out_seq_length,
504 vector<Uint4>* blast_ambig)
505 const;
506 /*
507
508 // Function to convert iupacna (2 bytes) to ncbi4na (1 byte)
509 TSeqPos MapIupacnaToNcbi4na(const CSeq_data& in_seq,
510 CSeq_data* out_seq,
511 TSeqPos uBeginIdx,
512 TSeqPos uLength)
513 const;
514 */
515 // Function to convert ncbi4na (2 bytes) to ncbi2na (1 byte)
516 TSeqPos MapNcbi4naToNcbi2na(const CSeq_data& in_seq,
517 CSeq_data* out_seq,
518 TSeqPos uBeginIdx,
519 TSeqPos uLength,
520 bool bAmbig,
521 CRandom::TValue seed,
522 TSeqPos total_length,
523 TSeqPos* out_seq_length,
524 vector<Uint4>* blast_ambig)
525 const;
526 /*
527
528 // Function to convert iupacaa (byte) to ncbieaa (byte)
529 TSeqPos MapIupacaaToNcbieaa(const CSeq_data& in_seq,
530 CSeq_data* out_seq,
531 TSeqPos uBeginIdx,
532 TSeqPos uLength) const;
533
534 // Function to convert ncbieaa (byte) to iupacaa (byte)
535 TSeqPos MapNcbieaaToIupacaa(const CSeq_data& in_seq,
536 CSeq_data* out_seq,
537 TSeqPos uBeginIdx,
538 TSeqPos uLength)
539 const;
540
541 // Function to convert iupacaa (byte) to ncbistdaa (byte)
542 TSeqPos MapIupacaaToNcbistdaa(const CSeq_data& in_seq,
543 CSeq_data* out_seq,
544 TSeqPos uBeginIdx,
545 TSeqPos uLength)
546 const;
547
548 // Function to convert ncbieaa (byte) to ncbistdaa (byte)
549 TSeqPos MapNcbieaaToNcbistdaa(const CSeq_data& in_seq,
550 CSeq_data* out_seq,
551 TSeqPos uBeginIdx,
552 TSeqPos uLength)
553 const;
554
555 // Function to convert ncbistdaa (byte) to ncbieaa (byte)
556 TSeqPos MapNcbistdaaToNcbieaa(const CSeq_data& in_seq,
557 CSeq_data* out_seq,
558 TSeqPos uBeginIdx,
559 TSeqPos uLength)
560 const;
561
562 // Function to convert ncbistdaa (byte) to iupacaa (byte)
563 TSeqPos MapNcbistdaaToIupacaa(const CSeq_data& in_seq,
564 CSeq_data* out_seq,
565 TSeqPos uBeginIdx,
566 TSeqPos uLength)
567 const;
568 */
569
570 // Fast Validation functions
571 bool FastValidateIupacna(const CSeq_data& in_seq,
572 TSeqPos uBeginIdx,
573 TSeqPos uLength)
574 const;
575
576 bool FastValidateNcbieaa(const CSeq_data& in_seq,
577 TSeqPos uBeginIdx,
578 TSeqPos uLength)
579 const;
580
581
582 bool FastValidateNcbistdaa(const CSeq_data& in_seq,
583 TSeqPos uBeginIdx,
584 TSeqPos uLength)
585 const;
586
587
588 bool FastValidateIupacaa(const CSeq_data& in_seq,
589 TSeqPos uBeginIdx,
590 TSeqPos uLength)
591 const;
592
593 // Full Validation functions
594 void ValidateIupacna(const CSeq_data& in_seq,
595 vector<TSeqPos>* badIdx,
596 TSeqPos uBeginIdx,
597 TSeqPos uLength)
598 const;
599
600 void ValidateNcbieaa(const CSeq_data& in_seq,
601 vector<TSeqPos>* badIdx,
602 TSeqPos uBeginIdx,
603 TSeqPos uLength)
604 const;
605
606 void ValidateNcbistdaa(const CSeq_data& in_seq,
607 vector<TSeqPos>* badIdx,
608 TSeqPos uBeginIdx,
609 TSeqPos uLength)
610 const;
611
612 void ValidateIupacaa(const CSeq_data& in_seq,
613 vector<TSeqPos>* badIdx,
614 TSeqPos uBeginIdx,
615 TSeqPos uLength)
616 const;
617
618 // Functions to make copies of the different types of sequences
619 TSeqPos GetNcbi2naCopy(const CSeq_data& in_seq,
620 CSeq_data* out_seq,
621 TSeqPos uBeginIdx,
622 TSeqPos uLength)
623 const;
624
625 TSeqPos GetNcbi4naCopy(const CSeq_data& in_seq,
626 CSeq_data* out_seq,
627 TSeqPos uBeginIdx,
628 TSeqPos uLength)
629 const;
630
631 TSeqPos GetIupacnaCopy(const CSeq_data& in_seq,
632 CSeq_data* out_seq,
633 TSeqPos uBeginIdx,
634 TSeqPos uLength)
635 const;
636
637 TSeqPos GetNcbieaaCopy(const CSeq_data& in_seq,
638 CSeq_data* out_seq,
639 TSeqPos uBeginIdx,
640 TSeqPos uLength)
641 const;
642
643 TSeqPos GetNcbistdaaCopy(const CSeq_data& in_seq,
644 CSeq_data* out_seq,
645 TSeqPos uBeginIdx,
646 TSeqPos uLength)
647 const;
648
649 TSeqPos GetIupacaaCopy(const CSeq_data& in_seq,
650 CSeq_data* out_seq,
651 TSeqPos uBeginIdx,
652 TSeqPos uLength)
653 const;
654
655 // Function to adjust uBeginIdx to lie on an in_seq byte boundary
656 // and uLength to lie on on an out_seq byte boundary. Returns
657 // overhang, the number of out seqs beyond byte boundary determined
658 // by uBeginIdx + uLength
659 TSeqPos Adjust(TSeqPos* uBeginIdx,
660 TSeqPos* uLength,
661 TSeqPos uInSeqBytes,
662 TSeqPos uInSeqsPerByte,
663 TSeqPos uOutSeqsPerByte)
664 const;
665
666 // GetAmbig methods
667
668 // Loops through an ncbi4na input sequence and determines
669 // the ambiguities that would result from conversion to an ncbi2na sequence
670 // On return, out_seq contains the ncbi4na bases that become ambiguous and
671 // out_indices contains the indices of the abiguous bases in in_seq
672 TSeqPos GetAmbigs_ncbi4na_ncbi2na(const CSeq_data& in_seq,
673 CSeq_data* out_seq,
674 vector<TSeqPos>* out_indices,
675 TSeqPos uBeginIdx,
676 TSeqPos uLength)
677 const;
678
679 // Loops through an iupacna input sequence and determines
680 // the ambiguities that would result from conversion to an ncbi2na sequence
681 // On return, out_seq contains the iupacna bases that become ambiguous and
682 // out_indices contains the indices of the abiguous bases in in_seq. The
683 // return is the number of ambiguities found.
684 TSeqPos GetAmbigs_iupacna_ncbi2na(const CSeq_data& in_seq,
685 CSeq_data* out_seq,
686 vector<TSeqPos>* out_indices,
687 TSeqPos uBeginIdx,
688 TSeqPos uLength)
689 const;
690
691 // Methods to perform Keep on specific seq types. Methods
692 // return length of kept sequence.
693 TSeqPos KeepNcbi2na(CSeq_data* in_seq,
694 TSeqPos uBeginIdx,
695 TSeqPos uLength)
696 const;
697
698 TSeqPos KeepNcbi4na(CSeq_data* in_seq,
699 TSeqPos uBeginIdx,
700 TSeqPos uLength)
701 const;
702
703 TSeqPos KeepIupacna(CSeq_data* in_seq,
704 TSeqPos uBeginIdx,
705 TSeqPos uLength)
706 const;
707
708 TSeqPos KeepNcbieaa(CSeq_data* in_seq,
709 TSeqPos uBeginIdx,
710 TSeqPos uLength)
711 const;
712
713 TSeqPos KeepNcbistdaa(CSeq_data* in_seq,
714 TSeqPos uBeginIdx,
715 TSeqPos uLength)
716 const;
717
718 TSeqPos KeepIupacaa(CSeq_data* in_seq,
719 TSeqPos uBeginIdx,
720 TSeqPos uLength)
721 const;
722
723 // Methods to complement na sequences
724
725 // In place methods. Return number of complemented residues.
726 TSeqPos ComplementIupacna(CSeq_data* in_seq,
727 TSeqPos uBeginIdx,
728 TSeqPos uLength)
729 const;
730
731 TSeqPos ComplementNcbi2na(CSeq_data* in_seq,
732 TSeqPos uBeginIdx,
733 TSeqPos uLength)
734 const;
735
736 TSeqPos ComplementNcbi4na(CSeq_data* in_seq,
737 TSeqPos uBeginIdx,
738 TSeqPos uLength)
739 const;
740
741
742 // Complement in copy methods
743 TSeqPos ComplementIupacna(const CSeq_data& in_seq,
744 CSeq_data* out_seq,
745 TSeqPos uBeginIdx,
746 TSeqPos uLength)
747 const;
748
749 TSeqPos ComplementNcbi2na(const CSeq_data& in_seq,
750 CSeq_data* out_seq,
751 TSeqPos uBeginIdx,
752 TSeqPos uLength)
753 const;
754
755 TSeqPos ComplementNcbi4na(const CSeq_data& in_seq,
756 CSeq_data* out_seq,
757 TSeqPos uBeginIdx,
758 TSeqPos uLength)
759 const;
760
761
762 // Methods to reverse na sequences
763
764 // In place methods
765 TSeqPos ReverseIupacna(CSeq_data* in_seq,
766 TSeqPos uBeginIdx,
767 TSeqPos uLength)
768 const;
769
770 TSeqPos ReverseNcbi2na(CSeq_data* in_seq,
771 TSeqPos uBeginIdx,
772 TSeqPos uLength)
773 const;
774
775 TSeqPos ReverseNcbi4na(CSeq_data* in_seq,
776 TSeqPos uBeginIdx,
777 TSeqPos uLength)
778 const;
779
780 // Reverse in copy methods
781 TSeqPos ReverseIupacna(const CSeq_data& in_seq,
782 CSeq_data* out_seq,
783 TSeqPos uBeginIdx,
784 TSeqPos uLength)
785 const;
786
787 TSeqPos ReverseNcbi2na(const CSeq_data& in_seq,
788 CSeq_data* out_seq,
789 TSeqPos uBeginIdx,
790 TSeqPos uLength)
791 const;
792
793 TSeqPos ReverseNcbi4na(const CSeq_data& in_seq,
794 CSeq_data* out_seq,
795 TSeqPos uBeginIdx,
796 TSeqPos uLength)
797 const;
798
799 // Methods to reverse-complement an na sequences
800
801 // In place methods
802 TSeqPos ReverseComplementIupacna(CSeq_data* in_seq,
803 TSeqPos uBeginIdx,
804 TSeqPos uLength)
805 const;
806
807 TSeqPos ReverseComplementNcbi2na(CSeq_data* in_seq,
808 TSeqPos uBeginIdx,
809 TSeqPos uLength)
810 const;
811
812 TSeqPos ReverseComplementNcbi4na(CSeq_data* in_seq,
813 TSeqPos uBeginIdx,
814 TSeqPos uLength)
815 const;
816
817 // Reverse in copy methods
818 TSeqPos ReverseComplementIupacna(const CSeq_data& in_seq,
819 CSeq_data* out_seq,
820 TSeqPos uBeginIdx,
821 TSeqPos uLength)
822 const;
823
824 TSeqPos ReverseComplementNcbi2na(const CSeq_data& in_seq,
825 CSeq_data* out_seq,
826 TSeqPos uBeginIdx,
827 TSeqPos uLength)
828 const;
829
830 TSeqPos ReverseComplementNcbi4na(const CSeq_data& in_seq,
831 CSeq_data* out_seq,
832 TSeqPos uBeginIdx,
833 TSeqPos uLength)
834 const;
835
836 // Append methods
837 TSeqPos AppendIupacna(CSeq_data* out_seq,
838 const CSeq_data& in_seq1,
839 TSeqPos uBeginIdx1,
840 TSeqPos uLength1,
841 const CSeq_data& in_seq2,
842 TSeqPos uBeginIdx2,
843 TSeqPos uLength2)
844 const;
845
846 TSeqPos AppendNcbi2na(CSeq_data* out_seq,
847 const CSeq_data& in_seq1,
848 TSeqPos uBeginIdx1,
849 TSeqPos uLength1,
850 const CSeq_data& in_seq2,
851 TSeqPos uBeginIdx2,
852 TSeqPos uLength2)
853 const;
854
855 TSeqPos AppendNcbi4na(CSeq_data* out_seq,
856 const CSeq_data& in_seq1,
857 TSeqPos uBeginIdx1,
858 TSeqPos uLength1,
859 const CSeq_data& in_seq2,
860 TSeqPos uBeginIdx2,
861 TSeqPos uLength2)
862 const;
863
864 TSeqPos AppendNcbieaa(CSeq_data* out_seq,
865 const CSeq_data& in_seq1,
866 TSeqPos uBeginIdx1,
867 TSeqPos uLength1,
868 const CSeq_data& in_seq2,
869 TSeqPos uBeginIdx2,
870 TSeqPos uLength2)
871 const;
872
873 TSeqPos AppendNcbistdaa(CSeq_data* out_seq,
874 const CSeq_data& in_seq1,
875 TSeqPos uBeginIdx1,
876 TSeqPos uLength1,
877 const CSeq_data& in_seq2,
878 TSeqPos uBeginIdx2,
879 TSeqPos uLength2)
880 const;
881
882 TSeqPos AppendIupacaa(CSeq_data* out_seq,
883 const CSeq_data& in_seq1,
884 TSeqPos uBeginIdx1,
885 TSeqPos uLength1,
886 const CSeq_data& in_seq2,
887 TSeqPos uBeginIdx2,
888 TSeqPos uLength2)
889 const;
890
891 void x_GetSeqFromSeqData(const CSeq_data& data,
892 const string** str,
893 const vector<char>** vec)
894 const;
895 void x_GetSeqFromSeqData(CSeq_data& data,
896 string** str,
897 vector<char>** vec)
898 const;
899 };
900
901
902 static CSafeStatic<CSeqportUtil_implementation> sx_Implementation;
903
x_GetImplementation(void)904 CSeqportUtil_implementation& CSeqportUtil::x_GetImplementation(void)
905 {
906 return *sx_Implementation;
907 }
908
909
910
911
912 /////////////////////////////////////////////////////////////////////////////
913 // PUBLIC (static wrappers to CSeqportUtil_implementation public methods)::
914 //
915
916
Convert(const CSeq_data & in_seq,CSeq_data * out_seq,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength,bool bAmbig,CRandom::TValue seed)917 TSeqPos CSeqportUtil::Convert
918 (const CSeq_data& in_seq,
919 CSeq_data* out_seq,
920 CSeq_data::E_Choice to_code,
921 TSeqPos uBeginIdx,
922 TSeqPos uLength,
923 bool bAmbig,
924 CRandom::TValue seed)
925 {
926 return x_GetImplementation().Convert
927 (in_seq, out_seq, to_code, uBeginIdx, uLength, bAmbig, seed,
928 0, 0, 0);
929 }
930
ConvertWithBlastAmbig(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig)931 TSeqPos CSeqportUtil::ConvertWithBlastAmbig
932 (const CSeq_data& in_seq,
933 CSeq_data* out_seq,
934 TSeqPos uBeginIdx,
935 TSeqPos uLength,
936 TSeqPos total_length,
937 TSeqPos* out_seq_length,
938 vector<Uint4>* blast_ambig)
939 {
940 return x_GetImplementation().Convert
941 (in_seq, out_seq, CSeq_data::e_Ncbi2na, uBeginIdx, uLength, true,
942 17734276, total_length, out_seq_length, blast_ambig);
943 }
944
Pack(CSeq_data * in_seq,TSeqPos uLength)945 TSeqPos CSeqportUtil::Pack
946 (CSeq_data* in_seq,
947 TSeqPos uLength)
948 {
949 return x_GetImplementation().Pack
950 (in_seq, uLength);
951 }
952
953
FastValidate(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength)954 bool CSeqportUtil::FastValidate
955 (const CSeq_data& in_seq,
956 TSeqPos uBeginIdx,
957 TSeqPos uLength)
958 {
959 return x_GetImplementation().FastValidate
960 (in_seq, uBeginIdx, uLength);
961 }
962
963
Validate(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength)964 void CSeqportUtil::Validate
965 (const CSeq_data& in_seq,
966 vector<TSeqPos>* badIdx,
967 TSeqPos uBeginIdx,
968 TSeqPos uLength)
969 {
970 x_GetImplementation().Validate
971 (in_seq, badIdx, uBeginIdx, uLength);
972 }
973
974
GetAmbigs(const CSeq_data & in_seq,CSeq_data * out_seq,vector<TSeqPos> * out_indices,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength)975 TSeqPos CSeqportUtil::GetAmbigs
976 (const CSeq_data& in_seq,
977 CSeq_data* out_seq,
978 vector<TSeqPos>* out_indices,
979 CSeq_data::E_Choice to_code,
980 TSeqPos uBeginIdx,
981 TSeqPos uLength)
982 {
983 return x_GetImplementation().GetAmbigs
984 (in_seq, out_seq, out_indices, to_code, uBeginIdx, uLength);
985 }
986
987
GetCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength)988 TSeqPos CSeqportUtil::GetCopy
989 (const CSeq_data& in_seq,
990 CSeq_data* out_seq,
991 TSeqPos uBeginIdx,
992 TSeqPos uLength)
993 {
994 return x_GetImplementation().GetCopy
995 (in_seq, out_seq, uBeginIdx, uLength);
996 }
997
998
999
Keep(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength)1000 TSeqPos CSeqportUtil::Keep
1001 (CSeq_data* in_seq,
1002 TSeqPos uBeginIdx,
1003 TSeqPos uLength)
1004 {
1005 return x_GetImplementation().Keep
1006 (in_seq, uBeginIdx, uLength);
1007 }
1008
1009
Append(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2)1010 TSeqPos CSeqportUtil::Append
1011 (CSeq_data* out_seq,
1012 const CSeq_data& in_seq1,
1013 TSeqPos uBeginIdx1,
1014 TSeqPos uLength1,
1015 const CSeq_data& in_seq2,
1016 TSeqPos uBeginIdx2,
1017 TSeqPos uLength2)
1018 {
1019 return x_GetImplementation().Append
1020 (out_seq,
1021 in_seq1, uBeginIdx1, uLength1, in_seq2, uBeginIdx2, uLength2);
1022 }
1023
1024
Complement(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength)1025 TSeqPos CSeqportUtil::Complement
1026 (CSeq_data* in_seq,
1027 TSeqPos uBeginIdx,
1028 TSeqPos uLength)
1029 {
1030 return x_GetImplementation().Complement
1031 (in_seq, uBeginIdx, uLength);
1032 }
1033
1034
Complement(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength)1035 TSeqPos CSeqportUtil::Complement
1036 (const CSeq_data& in_seq,
1037 CSeq_data* out_seq,
1038 TSeqPos uBeginIdx,
1039 TSeqPos uLength)
1040 {
1041 return x_GetImplementation().Complement
1042 (in_seq, out_seq, uBeginIdx, uLength);
1043 }
1044
1045
Reverse(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength)1046 TSeqPos CSeqportUtil::Reverse
1047 (CSeq_data* in_seq,
1048 TSeqPos uBeginIdx,
1049 TSeqPos uLength)
1050 {
1051 return x_GetImplementation().Reverse
1052 (in_seq, uBeginIdx, uLength);
1053 }
1054
1055
Reverse(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength)1056 TSeqPos CSeqportUtil::Reverse
1057 (const CSeq_data& in_seq,
1058 CSeq_data* out_seq,
1059 TSeqPos uBeginIdx,
1060 TSeqPos uLength)
1061 {
1062 return x_GetImplementation().Reverse
1063 (in_seq, out_seq, uBeginIdx, uLength);
1064 }
1065
1066
ReverseComplement(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength)1067 TSeqPos CSeqportUtil::ReverseComplement
1068 (CSeq_data* in_seq,
1069 TSeqPos uBeginIdx,
1070 TSeqPos uLength)
1071 {
1072 return x_GetImplementation().ReverseComplement
1073 (in_seq, uBeginIdx, uLength);
1074 }
1075
1076
ReverseComplement(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength)1077 TSeqPos CSeqportUtil::ReverseComplement
1078 (const CSeq_data& in_seq,
1079 CSeq_data* out_seq,
1080 TSeqPos uBeginIdx,
1081 TSeqPos uLength)
1082 {
1083 return x_GetImplementation().ReverseComplement
1084 (in_seq, out_seq, uBeginIdx, uLength);
1085 }
1086
1087
GetIupacaa3(TIndex ncbistdaa)1088 const string& CSeqportUtil::GetIupacaa3(TIndex ncbistdaa)
1089 {
1090 return x_GetImplementation().GetIupacaa3(ncbistdaa);
1091 }
1092
IsCodeAvailable(CSeq_data::E_Choice code_type)1093 bool CSeqportUtil::IsCodeAvailable(CSeq_data::E_Choice code_type)
1094 {
1095 return x_GetImplementation().IsCodeAvailable(code_type);
1096 }
1097
IsCodeAvailable(ESeq_code_type code_type)1098 bool CSeqportUtil::IsCodeAvailable(ESeq_code_type code_type)
1099 {
1100 return x_GetImplementation().IsCodeAvailable(code_type);
1101 }
1102
GetCodeIndexFromTo(CSeq_data::E_Choice code_type)1103 CSeqportUtil::TPair CSeqportUtil::GetCodeIndexFromTo
1104 (CSeq_data::E_Choice code_type)
1105 {
1106 return x_GetImplementation().GetCodeIndexFromTo(code_type);
1107 }
1108
GetCodeIndexFromTo(ESeq_code_type code_type)1109 CSeqportUtil::TPair CSeqportUtil::GetCodeIndexFromTo
1110 (ESeq_code_type code_type)
1111 {
1112 return x_GetImplementation().GetCodeIndexFromTo(code_type);
1113 }
1114
GetCode(CSeq_data::E_Choice code_type,TIndex idx)1115 const string& CSeqportUtil::GetCode
1116 (CSeq_data::E_Choice code_type,
1117 TIndex idx)
1118 {
1119 return x_GetImplementation().GetCodeOrName(code_type, idx, true);
1120 }
1121
GetCode(ESeq_code_type code_type,TIndex idx)1122 const string& CSeqportUtil::GetCode
1123 (ESeq_code_type code_type,
1124 TIndex idx)
1125 {
1126 return x_GetImplementation().GetCodeOrName(code_type, idx, true);
1127 }
1128
GetName(CSeq_data::E_Choice code_type,TIndex idx)1129 const string& CSeqportUtil::GetName
1130 (CSeq_data::E_Choice code_type,
1131 TIndex idx)
1132 {
1133 return x_GetImplementation().GetCodeOrName(code_type, idx, false);
1134 }
1135
GetName(ESeq_code_type code_type,TIndex idx)1136 const string& CSeqportUtil::GetName
1137 (ESeq_code_type code_type,
1138 TIndex idx)
1139 {
1140 return x_GetImplementation().GetCodeOrName(code_type, idx, false);
1141 }
1142
GetIndex(CSeq_data::E_Choice code_type,const string & code)1143 CSeqportUtil::TIndex CSeqportUtil::GetIndex
1144 (CSeq_data::E_Choice code_type,
1145 const string& code)
1146 {
1147 return x_GetImplementation().GetIndex(code_type, code);
1148 }
1149
GetIndex(ESeq_code_type code_type,const string & code)1150 CSeqportUtil::TIndex CSeqportUtil::GetIndex
1151 (ESeq_code_type code_type,
1152 const string& code)
1153 {
1154 return x_GetImplementation().GetIndex(code_type, code);
1155 }
1156
GetIndexComplement(CSeq_data::E_Choice code_type,TIndex idx)1157 CSeqportUtil::TIndex CSeqportUtil::GetIndexComplement
1158 (CSeq_data::E_Choice code_type,
1159 TIndex idx)
1160 {
1161 return x_GetImplementation().GetIndexComplement(code_type, idx);
1162 }
1163
GetIndexComplement(ESeq_code_type code_type,TIndex idx)1164 CSeqportUtil::TIndex CSeqportUtil::GetIndexComplement
1165 (ESeq_code_type code_type,
1166 TIndex idx)
1167 {
1168 return x_GetImplementation().GetIndexComplement(code_type, idx);
1169 }
1170
GetMapToIndex(CSeq_data::E_Choice from_type,CSeq_data::E_Choice to_type,TIndex from_idx)1171 CSeqportUtil::TIndex CSeqportUtil::GetMapToIndex
1172 (CSeq_data::E_Choice from_type,
1173 CSeq_data::E_Choice to_type,
1174 TIndex from_idx)
1175 {
1176 return x_GetImplementation().GetMapToIndex(from_type, to_type, from_idx);
1177 }
1178
GetMapToIndex(ESeq_code_type from_type,ESeq_code_type to_type,TIndex from_idx)1179 CSeqportUtil::TIndex CSeqportUtil::GetMapToIndex
1180 (ESeq_code_type from_type,
1181 ESeq_code_type to_type,
1182 TIndex from_idx)
1183 {
1184 return x_GetImplementation().GetMapToIndex(from_type, to_type, from_idx);
1185 }
1186
CSeqportUtil_implementation()1187 CSeqportUtil_implementation::CSeqportUtil_implementation()
1188 {
1189
1190 // Initialize m_SeqCodeSet
1191 m_SeqCodeSet = Init();
1192
1193 // Initialize code tables
1194 m_Iupacna = InitCodes(eSeq_code_type_iupacna);
1195
1196 m_Ncbieaa = InitCodes(eSeq_code_type_ncbieaa);
1197
1198 m_Ncbistdaa = InitCodes(eSeq_code_type_ncbistdaa);
1199
1200 m_Iupacaa = InitCodes(eSeq_code_type_iupacaa);
1201
1202
1203 // Initialize na complement tables
1204 m_Iupacna_complement = InitIupacnaComplement();
1205
1206 m_Ncbi2naComplement = InitNcbi2naComplement();
1207
1208 m_Ncbi4naComplement = InitNcbi4naComplement();
1209
1210
1211
1212 // Initialize na reverse tables
1213 m_Ncbi2naRev = InitNcbi2naRev();
1214
1215 m_Ncbi4naRev = InitNcbi4naRev();
1216
1217
1218 // Initialize map tables
1219
1220 m_Ncbi2naIupacna = InitMaps(eSeq_code_type_ncbi2na,
1221 eSeq_code_type_iupacna);
1222
1223 m_Ncbi2naNcbi4na = InitMaps(eSeq_code_type_ncbi2na,
1224 eSeq_code_type_ncbi4na);
1225
1226 m_Ncbi4naIupacna = InitMaps(eSeq_code_type_ncbi4na,
1227 eSeq_code_type_iupacna);
1228
1229 m_IupacnaNcbi2na = InitMaps(eSeq_code_type_iupacna,
1230 eSeq_code_type_ncbi2na);
1231
1232 m_IupacnaNcbi4na = InitMaps(eSeq_code_type_iupacna,
1233 eSeq_code_type_ncbi4na);
1234
1235 m_Ncbi4naNcbi2na = InitMaps(eSeq_code_type_ncbi4na,
1236 eSeq_code_type_ncbi2na);
1237
1238 m_IupacaaNcbieaa = InitMaps(eSeq_code_type_iupacaa,
1239 eSeq_code_type_ncbieaa);
1240
1241 m_NcbieaaIupacaa = InitMaps(eSeq_code_type_ncbieaa,
1242 eSeq_code_type_iupacaa);
1243
1244 m_IupacaaNcbistdaa = InitMaps(eSeq_code_type_iupacaa,
1245 eSeq_code_type_ncbistdaa);
1246
1247 m_NcbieaaNcbistdaa = InitMaps(eSeq_code_type_ncbieaa,
1248 eSeq_code_type_ncbistdaa);
1249
1250 m_NcbistdaaNcbieaa = InitMaps(eSeq_code_type_ncbistdaa,
1251 eSeq_code_type_ncbieaa);
1252
1253 m_NcbistdaaIupacaa = InitMaps(eSeq_code_type_ncbistdaa,
1254 eSeq_code_type_iupacaa);
1255
1256 // Initialize fast conversion tables
1257 //m_FastNcbi2naIupacna = InitFastNcbi2naIupacna();
1258 //m_FastNcbi2naNcbi4na = InitFastNcbi2naNcbi4na();
1259 //m_FastNcbi4naIupacna = InitFastNcbi4naIupacna();
1260 m_FastIupacnaNcbi2na = InitFastIupacnaNcbi2na();
1261 m_FastIupacnaNcbi4na = InitFastIupacnaNcbi4na();
1262 m_FastNcbi4naNcbi2na = InitFastNcbi4naNcbi2na();
1263
1264 // Initialize tables for conversion of index to codes or names
1265 InitIndexCodeName();
1266
1267 // Initialize m_Masks used for random ambiguity resolution
1268 m_Masks = CSeqportUtil_implementation::InitMasks();
1269
1270 // Initialize m_DetectAmbigNcbi4naNcbi2na used for ambiguity
1271 // detection and reporting
1272 m_DetectAmbigNcbi4naNcbi2na = InitAmbigNcbi4naNcbi2na();
1273
1274 // Initialize m_DetectAmbigIupacnaNcbi2na used for ambiguity detection
1275 // and reporting
1276 m_DetectAmbigIupacnaNcbi2na = InitAmbigIupacnaNcbi2na();
1277
1278 }
1279
1280 // Destructor. All memory allocated on the
1281 // free store is wrapped in smart pointers.
1282 // Therefore, the destructor does not need
1283 // to deallocate memory.
~CSeqportUtil_implementation()1284 CSeqportUtil_implementation::~CSeqportUtil_implementation()
1285 {
1286 return;
1287 }
1288
1289
1290 /////////////////////////////////////////////////////////////////////////////
1291 // PRIVATE::
1292 //
1293
1294
1295 // Helper function to initialize m_SeqCodeSet from sm_StrAsnData
Init()1296 CRef<CSeq_code_set> CSeqportUtil_implementation::Init()
1297 {
1298 // Compose a long-long string
1299 string str;
1300 for (size_t i = 0; sm_StrAsnData[i]; i++) {
1301 str += sm_StrAsnData[i];
1302 }
1303
1304 // Create an in memory stream on sm_StrAsnData
1305 CNcbiIstrstream is(str);
1306
1307 auto_ptr<CObjectIStream>
1308 asn_codes_in(CObjectIStream::Open(eSerial_AsnText, is));
1309
1310 // Create a CSeq_code_set
1311 CRef<CSeq_code_set> ptr_seq_code_set(new CSeq_code_set());
1312
1313 // Initialize the newly created CSeq_code_set
1314 *asn_codes_in >> *ptr_seq_code_set;
1315
1316 // Return a newly created CSeq_code_set
1317 return ptr_seq_code_set;
1318 }
1319
1320
1321 // Function to initialize code tables
1322 CRef<CSeqportUtil_implementation::CCode_table>
InitCodes(ESeq_code_type code_type)1323 CSeqportUtil_implementation::InitCodes(ESeq_code_type code_type)
1324 {
1325 // Get list of code tables
1326 const list<CRef<CSeq_code_table> >& code_list = m_SeqCodeSet->GetCodes();
1327
1328 // Get table for code_type
1329 list<CRef<CSeq_code_table> >::const_iterator i_ct;
1330 for(i_ct = code_list.begin(); i_ct != code_list.end(); ++i_ct)
1331 if((*i_ct)->GetCode() == code_type)
1332 break;
1333
1334
1335 if(i_ct == code_list.end())
1336 throw runtime_error("Requested code table not found");
1337
1338 // Get table data
1339 const list<CRef<CSeq_code_table::C_E> >& table_data = (*i_ct)->GetTable();
1340 SIZE_TYPE size = table_data.size();
1341 int start_at = (*i_ct)->GetStart_at();
1342 CRef<CCode_table> codeTable(new CCode_table(size, start_at));
1343
1344 // Initialize codeTable to 255
1345 for(int i=0; i<256; i++)
1346 codeTable->m_Table[i] = '\xff';
1347
1348 // Copy table data to codeTable
1349 int nIdx = start_at;
1350 list<CRef<CSeq_code_table::C_E> >::const_iterator i_td;
1351 for(i_td = table_data.begin(); i_td != table_data.end(); ++i_td) {
1352 codeTable->m_Table[nIdx] = *((*i_td)->GetSymbol().c_str());
1353 if(codeTable->m_Table[nIdx] == '\x00')
1354 codeTable->m_Table[nIdx++] = '\xff';
1355 else
1356 nIdx++;
1357 }
1358
1359 // Return codeTable
1360 return codeTable;
1361 }
1362
1363
1364 // Function to initialize iupacna complement table
1365 CRef<CSeqportUtil_implementation::CCode_comp>
InitIupacnaComplement()1366 CSeqportUtil_implementation::InitIupacnaComplement()
1367 {
1368
1369 // Get list of code tables
1370 const list<CRef<CSeq_code_table> >& code_list = m_SeqCodeSet->GetCodes();
1371
1372 // Get table for code_type iupacna
1373 list<CRef<CSeq_code_table> >::const_iterator i_ct;
1374 for(i_ct = code_list.begin(); i_ct != code_list.end(); ++i_ct)
1375 if((*i_ct)->GetCode() == eSeq_code_type_iupacna)
1376 break;
1377
1378
1379 if(i_ct == code_list.end())
1380 throw runtime_error("Code table for Iupacna not found");
1381
1382 // Check that complements are set
1383 if(!(*i_ct)->IsSetComps())
1384 throw runtime_error("Complement data is not set for iupacna table");
1385
1386 // Get complement data, start at and size of complement data
1387 const list<int>& comp_data = (*i_ct)->GetComps();
1388 int start_at = (*i_ct)->GetStart_at();
1389
1390 // Allocate memory for complement data
1391 CRef<CCode_comp> compTable(new CCode_comp(256, start_at));
1392
1393 // Initialize compTable to 255 for illegal codes
1394 for(unsigned int i = 0; i<256; i++)
1395 compTable->m_Table[i] = (char) 255;
1396
1397 // Loop trhough the complement data and set compTable
1398 list<int>::const_iterator i_comp;
1399 unsigned int nIdx = start_at;
1400 for(i_comp = comp_data.begin(); i_comp != comp_data.end(); ++i_comp)
1401 compTable->m_Table[nIdx++] = (*i_comp);
1402
1403 // Return the complement data
1404 return compTable;
1405
1406 }
1407
1408
1409 // Function to initialize ncbi2na complement table
1410 CRef<CSeqportUtil_implementation::CCode_comp>
InitNcbi2naComplement()1411 CSeqportUtil_implementation::InitNcbi2naComplement()
1412 {
1413
1414 // Get list of code tables
1415 const list<CRef<CSeq_code_table> >& code_list = m_SeqCodeSet->GetCodes();
1416
1417 // Get table for code_type ncbi2na
1418 list<CRef<CSeq_code_table> >::const_iterator i_ct;
1419 for(i_ct = code_list.begin(); i_ct != code_list.end(); ++i_ct)
1420 if((*i_ct)->GetCode() == eSeq_code_type_ncbi2na)
1421 break;
1422
1423 if(i_ct == code_list.end())
1424 throw runtime_error("Code table for Iupacna not found");
1425
1426 // Check that complements are set
1427 if(!(*i_ct)->IsSetComps())
1428 throw runtime_error("Complement data is not set for ncbi2na table");
1429
1430 // Get complement data, start at and size of complement data
1431 const list<int>& comp_data = (*i_ct)->GetComps();
1432 int start_at = (*i_ct)->GetStart_at();
1433
1434 // Allocate memory for complement data
1435 CRef<CCode_comp> compTable(new CCode_comp(256, start_at));
1436
1437 // Put complement data in an array
1438 char compArray[4];
1439 int nIdx = start_at;
1440 list<int>::const_iterator i_comp;
1441 for(i_comp = comp_data.begin(); i_comp != comp_data.end(); ++i_comp)
1442 compArray[nIdx++] = (*i_comp);
1443
1444 // Set compTable
1445 for(unsigned int i = 0; i < 4; i++)
1446 for(unsigned int j = 0; j < 4; j++)
1447 for(unsigned int k = 0; k < 4; k++)
1448 for(unsigned int l = 0; l < 4; l++)
1449 {
1450 nIdx = i<<6 | j<<4 | k<<2 | l;
1451 char c1 = compArray[i] << 6;
1452 char c2 = compArray[j] << 4;
1453 char c3 = compArray[k] << 2;
1454 char c4 = compArray[l];
1455 compTable->m_Table[nIdx] = c1 | c2 | c3 | c4;
1456 }
1457
1458 // Return complement data
1459 return compTable;
1460
1461 }
1462
1463
1464 // Function to initialize ncbi4na complement table
1465 CRef<CSeqportUtil_implementation::CCode_comp>
InitNcbi4naComplement()1466 CSeqportUtil_implementation::InitNcbi4naComplement()
1467 {
1468
1469 // Get list of code tables
1470 const list<CRef<CSeq_code_table> >& code_list = m_SeqCodeSet->GetCodes();
1471
1472 // Get table for code_type ncbi2na
1473 list<CRef<CSeq_code_table> >::const_iterator i_ct;
1474 for(i_ct = code_list.begin(); i_ct != code_list.end(); ++i_ct)
1475 if((*i_ct)->GetCode() == eSeq_code_type_ncbi4na)
1476 break;
1477
1478 if(i_ct == code_list.end())
1479 throw runtime_error("Code table for Iupacna not found");
1480
1481 // Check that complements are set
1482 if(!(*i_ct)->IsSetComps())
1483 throw runtime_error("Complement data is not set for iupacna table");
1484
1485 // Get complement data, start at and size of complement data
1486 const list<int>& comp_data = (*i_ct)->GetComps();
1487 int start_at = (*i_ct)->GetStart_at();
1488
1489 // Allocate memory for complement data
1490 CRef<CCode_comp> compTable(new CCode_comp(256, start_at));
1491
1492
1493 // Put complement data in an array
1494 char compArray[16];
1495 int nIdx = start_at;
1496 list<int>::const_iterator i_comp;
1497 for(i_comp = comp_data.begin(); i_comp != comp_data.end(); ++i_comp)
1498 compArray[nIdx++] = (*i_comp);
1499
1500 // Set compTable
1501 for(unsigned int i = 0; i<16; i++)
1502 for(unsigned int j = 0; j < 16; j++)
1503 {
1504 nIdx = i<<4 | j;
1505 char c1 = compArray[i] << 4;
1506 char c2 = compArray[j];
1507 compTable->m_Table[nIdx] = c1 | c2;
1508 }
1509
1510 // Return complement data
1511 return compTable;
1512
1513 }
1514
1515
1516 // Function to initialize m_Ncbi2naRev
InitNcbi2naRev()1517 CRef<CSeqportUtil_implementation::CCode_rev> CSeqportUtil_implementation::InitNcbi2naRev()
1518 {
1519
1520 // Allocate memory for reverse table
1521 CRef<CCode_rev> revTable(new CCode_rev(256, 0));
1522
1523 // Initialize table used to reverse a byte.
1524 for(unsigned int i = 0; i < 4; i++)
1525 for(unsigned int j = 0; j < 4; j++)
1526 for(unsigned int k = 0; k < 4; k++)
1527 for(unsigned int l = 0; l < 4; l++)
1528 revTable->m_Table[64*i + 16*j + 4*k + l] =
1529 64*l + 16*k + 4*j +i;
1530
1531 // Return the reverse table
1532 return revTable;
1533 }
1534
1535
1536 // Function to initialize m_Ncbi4naRev
InitNcbi4naRev()1537 CRef<CSeqportUtil_implementation::CCode_rev> CSeqportUtil_implementation::InitNcbi4naRev()
1538 {
1539
1540 // Allocate memory for reverse table
1541 CRef<CCode_rev> revTable(new CCode_rev(256, 0));
1542
1543 // Initialize table used to reverse a byte.
1544 for(unsigned int i = 0; i < 16; i++)
1545 for(unsigned int j = 0; j < 16; j++)
1546 revTable->m_Table[16*i + j] = 16*j + i;
1547
1548 // Return the reverse table
1549 return revTable;
1550 }
1551
1552
1553
1554 // Function to initialize map tables
1555 CRef<CSeqportUtil_implementation::CMap_table>
InitMaps(ESeq_code_type from_type,ESeq_code_type to_type)1556 CSeqportUtil_implementation::InitMaps
1557 (ESeq_code_type from_type,
1558 ESeq_code_type to_type)
1559 {
1560
1561 // Get list of map tables
1562 const list< CRef< CSeq_map_table > >& map_list = m_SeqCodeSet->GetMaps();
1563
1564 // Get requested map table
1565 list<CRef<CSeq_map_table> >::const_iterator i_mt;
1566 for(i_mt = map_list.begin(); i_mt != map_list.end(); ++i_mt)
1567 if((*i_mt)->GetFrom() == from_type && (*i_mt)->GetTo() == to_type)
1568 break;
1569
1570 if(i_mt == map_list.end())
1571 throw runtime_error("Requested map table not found");
1572
1573 // Get the map table
1574 const list<int>& table_data = (*i_mt)->GetTable();
1575
1576 // Create a map table reference
1577 SIZE_TYPE size = table_data.size();
1578 int start_at = (*i_mt)->GetStart_at();
1579 CRef<CMap_table> mapTable(new CMap_table(size,start_at));
1580
1581 // Copy the table data to mapTable
1582 int nIdx = start_at;
1583 list<int>::const_iterator i_td;
1584 for(i_td = table_data.begin(); i_td != table_data.end(); ++i_td)
1585 {
1586 mapTable->m_Table[nIdx++] = *i_td;
1587 }
1588
1589 return mapTable;
1590 }
1591
1592
1593 // Functions to initialize fast conversion tables
1594 // Function to initialize FastNcib2naIupacna
1595 /*
1596 CRef<CSeqportUtil_implementation::CFast_table4> CSeqportUtil_implementation::InitFastNcbi2naIupacna()
1597 {
1598
1599 CRef<CFast_table4> fastTable(new CFast_table4(256,0));
1600 unsigned char i,j,k,l;
1601 for(i = 0; i < 4; i++)
1602 for(j = 0; j < 4; j++)
1603 for(k = 0; k < 4; k++)
1604 for(l = 0; l < 4; l++)
1605 {
1606 unsigned char aByte = (i<<6) | (j<<4) | (k<<2) | l;
1607 char chi = m_Ncbi2naIupacna->m_Table[i];
1608 char chj = m_Ncbi2naIupacna->m_Table[j];
1609 char chk = m_Ncbi2naIupacna->m_Table[k];
1610 char chl = m_Ncbi2naIupacna->m_Table[l];
1611
1612 // Note high order bit pair corresponds to low order
1613 // byte etc., on Unix machines.
1614 char *pt =
1615 reinterpret_cast<char*>(&fastTable->m_Table[aByte]);
1616 *(pt++) = chi;
1617 *(pt++) = chj;
1618 *(pt++) = chk;
1619 *(pt) = chl;
1620 }
1621 return fastTable;
1622 }
1623 */
1624
1625 // Function to initialize FastNcib2naNcbi4na
InitFastNcbi2naNcbi4na()1626 CRef<CSeqportUtil_implementation::CFast_table2> CSeqportUtil_implementation::InitFastNcbi2naNcbi4na()
1627 {
1628
1629 CRef<CFast_table2> fastTable(new CFast_table2(256,0));
1630 unsigned char i, j, k, l;
1631
1632 for(i = 0; i < 4; i++)
1633 for(j = 0; j < 4; j++)
1634 for(k = 0; k < 4; k++)
1635 for(l = 0; l < 4; l++) {
1636 unsigned char aByte = (i<<6) | (j<<4) | (k<<2) | l;
1637 unsigned char chi = m_Ncbi2naNcbi4na->m_Table[i];
1638 unsigned char chj = m_Ncbi2naNcbi4na->m_Table[j];
1639 unsigned char chk = m_Ncbi2naNcbi4na->m_Table[k];
1640 unsigned char chl = m_Ncbi2naNcbi4na->m_Table[l];
1641 char *pt =
1642
1643 reinterpret_cast<char*>(&fastTable->m_Table[aByte]);
1644 *(pt++) = (chi << 4) | chj;
1645 *pt = (chk << 4) | chl;
1646 }
1647 return fastTable;
1648 }
1649
1650
1651 // Function to initialize FastNcib4naIupacna
InitFastNcbi4naIupacna()1652 CRef<CSeqportUtil_implementation::CFast_table2> CSeqportUtil_implementation::InitFastNcbi4naIupacna()
1653 {
1654
1655 CRef<CFast_table2> fastTable(new CFast_table2(256,0));
1656 unsigned char i,j;
1657 for(i = 0; i < 16; i++)
1658 for(j = 0; j < 16; j++) {
1659 unsigned char aByte = (i<<4) | j;
1660 unsigned char chi = m_Ncbi4naIupacna->m_Table[i];
1661 unsigned char chj = m_Ncbi4naIupacna->m_Table[j];
1662
1663 // Note high order nible corresponds to low order byte
1664 // etc., on Unix machines.
1665 char *pt = reinterpret_cast<char*>(&fastTable->m_Table[aByte]);
1666 *(pt++) = chi;
1667 *pt = chj;
1668 }
1669 return fastTable;
1670 }
1671
1672
1673 // Function to initialize m_FastIupacnancbi2na
InitFastIupacnaNcbi2na()1674 CRef<CSeqportUtil_implementation::CFast_4_1> CSeqportUtil_implementation::InitFastIupacnaNcbi2na()
1675 {
1676
1677 int start_at = m_IupacnaNcbi2na->m_StartAt;
1678 int size = m_IupacnaNcbi2na->m_Size;
1679 CRef<CFast_4_1> fastTable(new CFast_4_1(4,0,256,0));
1680 for(int ch = 0; ch < 256; ch++) {
1681 if((ch >= start_at) && (ch < (start_at + size)))
1682 {
1683 unsigned char uch = m_IupacnaNcbi2na->m_Table[ch];
1684 uch &= '\x03';
1685 for(unsigned int pos = 0; pos < 4; pos++)
1686 fastTable->m_Table[pos][ch] = uch << (6-2*pos);
1687 }
1688 else
1689 for(unsigned int pos = 0; pos < 4; pos++)
1690 fastTable->m_Table[pos][ch] = '\x00';
1691 }
1692 return fastTable;
1693 }
1694
1695
1696 // Function to initialize m_FastIupacnancbi4na
InitFastIupacnaNcbi4na()1697 CRef<CSeqportUtil_implementation::CFast_2_1> CSeqportUtil_implementation::InitFastIupacnaNcbi4na()
1698 {
1699
1700 int start_at = m_IupacnaNcbi4na->m_StartAt;
1701 int size = m_IupacnaNcbi4na->m_Size;
1702 CRef<CFast_2_1> fastTable(new CFast_2_1(2,0,256,0));
1703 for(int ch = 0; ch < 256; ch++) {
1704 if((ch >= start_at) && (ch < (start_at + size)))
1705 {
1706 unsigned char uch = m_IupacnaNcbi4na->m_Table[ch];
1707 for(unsigned int pos = 0; pos < 2; pos++)
1708 fastTable->m_Table[pos][ch] = uch << (4-4*pos);
1709 }
1710 else
1711 {
1712 fastTable->m_Table[0][ch] = 0xF0;
1713 fastTable->m_Table[1][ch] = 0x0F;
1714 }
1715 }
1716 return fastTable;
1717 }
1718
1719
1720 // Function to initialize m_FastNcbi4naNcbi2na
InitFastNcbi4naNcbi2na()1721 CRef<CSeqportUtil_implementation::CFast_2_1> CSeqportUtil_implementation::InitFastNcbi4naNcbi2na()
1722 {
1723
1724 int start_at = m_Ncbi4naNcbi2na->m_StartAt;
1725 int size = m_Ncbi4naNcbi2na->m_Size;
1726 CRef<CFast_2_1> fastTable(new CFast_2_1(2,0,256,0));
1727 for(int n1 = 0; n1 < 16; n1++)
1728 for(int n2 = 0; n2 < 16; n2++) {
1729 int nIdx = 16*n1 + n2;
1730 unsigned char u1, u2;
1731 if((n1 >= start_at) && (n1 < start_at + size))
1732 u1 = m_Ncbi4naNcbi2na->m_Table[n1] & 3;
1733 else
1734 u1 = '\x00';
1735 if((n2 >= start_at) && (n2 < start_at + size))
1736 u2 = m_Ncbi4naNcbi2na->m_Table[n2] & 3;
1737 else
1738 u2 = '\x00';
1739 fastTable->m_Table[0][nIdx] = (u1<<6) | (u2<<4);
1740 fastTable->m_Table[1][nIdx] = (u1<<2) | u2;
1741 }
1742
1743 return fastTable;
1744 }
1745
1746
1747 // Function to initialize m_IndexString and m_StringIndex
InitIndexCodeName()1748 void CSeqportUtil_implementation::InitIndexCodeName()
1749 {
1750 m_IndexString[kName].resize(kNumCodes);
1751 m_IndexString[kSymbol].resize(kNumCodes);
1752 m_IndexComplement.resize(kNumCodes);
1753 m_StringIndex.resize(kNumCodes);
1754 m_StartAt.resize(kNumCodes);
1755
1756 bool found[kNumCodes];
1757 for (unsigned int ii = 0; ii < kNumCodes; ii++) {
1758 found[ii] = false;
1759 }
1760 ITERATE (CSeq_code_set::TCodes, it, m_SeqCodeSet->GetCodes()) {
1761 const ESeq_code_type& code = (*it)->GetCode();
1762 if (!found[code-1]) {
1763 found[code-1] = true;
1764 m_StartAt[code-1] = (*it)->IsSetStart_at() ?
1765 (*it)->GetStart_at() : 0;
1766 TIndex i = m_StartAt[code-1];
1767 ITERATE(CSeq_code_table::TTable, is, (*it)->GetTable()) {
1768 m_IndexString[kSymbol][code-1].push_back((*is)->GetSymbol());
1769 m_IndexString[kName][code-1].push_back((*is)->GetName());
1770 m_StringIndex[code-1].insert
1771 (make_pair((*is)->GetSymbol(), i++));
1772 }
1773 if ( (*it)->IsSetComps() ) {
1774 ITERATE (list<int>, ic, (*it)->GetComps()) {
1775 m_IndexComplement[code-1].push_back(*ic);
1776 }
1777 }
1778 }
1779 }
1780
1781
1782 }
1783
1784
1785 // Function to initialize m_Masks
InitMasks()1786 CRef<CSeqportUtil_implementation::SMasksArray> CSeqportUtil_implementation::InitMasks()
1787 {
1788
1789 unsigned int i, j, uCnt;
1790 unsigned char cVal, cRslt;
1791 CRef<SMasksArray> aMask(new SMasksArray);
1792
1793 // Initialize possible masks for converting ambiguous
1794 // ncbi4na bytes to unambiguous bytes
1795 static const unsigned char mask[16] = {
1796 0x11, 0x12, 0x14, 0x18,
1797 0x21, 0x22, 0x24, 0x28,
1798 0x41, 0x42, 0x44, 0x48,
1799 0x81, 0x82, 0x84, 0x88
1800 };
1801
1802 static const unsigned char maskUpper[4] = { 0x10, 0x20, 0x40, 0x80 };
1803 static const unsigned char maskLower[4] = { 0x01, 0x02, 0x04, 0x08 };
1804
1805 // Loop through possible ncbi4na bytes and
1806 // build masks that convert it to unambiguous na
1807 for(i = 0; i < 256; i++) {
1808 cVal = i;
1809 uCnt = 0;
1810
1811 // Case where both upper and lower nible > 0
1812 if(((cVal & '\x0f') != 0) && ((cVal & '\xf0') != 0))
1813 for(j = 0; j < 16; j++) {
1814 cRslt = cVal & mask[j];
1815 if(cRslt == mask[j])
1816 aMask->m_Table[i].cMask[uCnt++] = mask[j];
1817 }
1818
1819 // Case where upper nible = 0 and lower nible > 0
1820 else if((cVal & '\x0f') != 0)
1821 for(j = 0; j < 4; j++)
1822 {
1823 cRslt = cVal & maskLower[j];
1824 if(cRslt == maskLower[j])
1825 aMask->m_Table[i].cMask[uCnt++] = maskLower[j];
1826 }
1827
1828
1829 // Case where lower nible = 0 and upper nible > 0
1830 else if((cVal & '\xf0') != 0)
1831 for(j = 0; j < 4; j++)
1832 {
1833 cRslt = cVal & maskUpper[j];
1834 if(cRslt == maskUpper[j])
1835 aMask->m_Table[i].cMask[uCnt++] = maskUpper[j];
1836 }
1837
1838 // Both upper and lower nibles = 0
1839 else
1840 aMask->m_Table[i].cMask[uCnt++] = '\x00';
1841
1842 // Number of distict masks for ncbi4na byte i
1843 aMask->m_Table[i].nMasks = uCnt;
1844
1845 // Fill out the remainder of cMask array with copies
1846 // of first uCnt masks
1847 for(j = uCnt; j < 16 && uCnt > 0; j++)
1848 aMask->m_Table[i].cMask[j] = aMask->m_Table[i].cMask[j % uCnt];
1849
1850 }
1851
1852 return aMask;
1853 }
1854
1855
1856 // Function to initialize m_DetectAmbigNcbi4naNcbi2na used for
1857 // ambiguity detection
InitAmbigNcbi4naNcbi2na()1858 CRef<CSeqportUtil_implementation::CAmbig_detect> CSeqportUtil_implementation::InitAmbigNcbi4naNcbi2na()
1859 {
1860
1861 unsigned char low, high, ambig;
1862
1863 // Create am new CAmbig_detect object
1864 CRef<CAmbig_detect> ambig_detect(new CAmbig_detect(256,0));
1865
1866 // Loop through low and high order nibles and assign
1867 // values as follows: 0 - no ambiguity, 1 - low order nible ambigiguous
1868 // 2 - high order ambiguous, 3 -- both high and low ambiguous.
1869
1870 // Loop for low order nible
1871 for(low = 0; low < 16; low++) {
1872 // Determine if low order nible is ambiguous
1873 if((low == 1) || (low ==2) || (low == 4) || (low == 8))
1874 ambig = 0; // Not ambiguous
1875 else
1876 ambig = 1; // Ambiguous
1877
1878 // Loop for high order nible
1879 for(high = 0; high < 16; high++) {
1880
1881 // Determine if high order nible is ambiguous
1882 if((high != 1) && (high != 2) && (high != 4) && (high != 8))
1883 ambig += 2; // Ambiguous
1884
1885 // Set ambiguity value
1886 ambig_detect->m_Table[16*high + low] = ambig;
1887
1888 // Reset ambig
1889 ambig &= '\xfd'; // Set second bit to 0
1890 }
1891 }
1892
1893 return ambig_detect;
1894 }
1895
1896
1897 // Function to initialize m_DetectAmbigIupacnaNcbi2na used for ambiguity
1898 // detection
InitAmbigIupacnaNcbi2na()1899 CRef<CSeqportUtil_implementation::CAmbig_detect> CSeqportUtil_implementation::InitAmbigIupacnaNcbi2na()
1900 {
1901
1902 // Create am new CAmbig_detect object
1903 CRef<CAmbig_detect> ambig_detect(new CAmbig_detect(256,0));
1904
1905 // 0 implies no ambiguity. 1 implies ambiguity
1906 // Initialize to 0
1907 for(unsigned int i = 0; i<256; i++)
1908 ambig_detect->m_Table[i] = 0;
1909
1910 // Set iupacna characters that are ambiguous when converted
1911 // to ncib2na
1912 ambig_detect->m_Table[66] = 1; // B
1913 ambig_detect->m_Table[68] = 1; // D
1914 ambig_detect->m_Table[72] = 1; // H
1915 ambig_detect->m_Table[75] = 1; // K
1916 ambig_detect->m_Table[77] = 1; // M
1917 ambig_detect->m_Table[78] = 1; // N
1918 ambig_detect->m_Table[82] = 1; // R
1919 ambig_detect->m_Table[83] = 1; // S
1920 ambig_detect->m_Table[86] = 1; // V
1921 ambig_detect->m_Table[87] = 1; // W
1922 ambig_detect->m_Table[89] = 1; // Y
1923
1924 return ambig_detect;
1925 }
1926
1927 /*
1928 struct SSeqDataToSeqUtil
1929 {
1930 CSeq_data::E_Choice seq_data_coding;
1931 CSeqConvert::TCoding seq_convert_coding;
1932 };
1933
1934
1935 static SSeqDataToSeqUtil s_SeqDataToSeqUtilMap[] = {
1936 { CSeq_data::e_Iupacna, CSeqUtil::e_Iupacna },
1937 { CSeq_data::e_Iupacaa, CSeqUtil::e_Iupacna },
1938 { CSeq_data::e_Ncbi2na, CSeqUtil::e_Ncbi2na },
1939 { CSeq_data::e_Ncbi4na, CSeqUtil::e_Ncbi4na },
1940 { CSeq_data::e_Ncbi8na, CSeqUtil::e_Ncbi8na },
1941 { CSeq_data::e_Ncbi8aa, CSeqUtil::e_Ncbi8aa },
1942 { CSeq_data::e_Ncbieaa, CSeqUtil::e_Ncbieaa },
1943 { CSeq_data::e_Ncbistdaa, CSeqUtil::e_Ncbistdaa }
1944 };
1945 */
1946
1947 static CSeqUtil::TCoding s_SeqDataToSeqUtil[] = {
1948 CSeqUtil::e_not_set,
1949 CSeqUtil::e_Iupacna,
1950 CSeqUtil::e_Iupacaa,
1951 CSeqUtil::e_Ncbi2na,
1952 CSeqUtil::e_Ncbi4na,
1953 CSeqUtil::e_Ncbi8na,
1954 CSeqUtil::e_not_set,
1955 CSeqUtil::e_Ncbi8aa,
1956 CSeqUtil::e_Ncbieaa,
1957 CSeqUtil::e_not_set,
1958 CSeqUtil::e_Ncbistdaa
1959 };
1960
1961
1962 // Convert from one coding scheme to another. The following
1963 // 12 conversions are supported: ncbi2na<=>ncbi4na;
1964 // ncbi2na<=>iupacna; ncbi4na<=>iupacna; ncbieaa<=>ncbistdaa;
1965 // ncbieaa<=>iupacaa; ncbistdaa<=>iupacaa. Convert is
1966 // really just a dispatch function--it calls the appropriate
1967 // priviate conversion function.
x_ConvertAmbig(const CSeq_data & in_seq,CSeq_data * out_seq,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength,CRandom::TValue seed,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig) const1968 TSeqPos CSeqportUtil_implementation::x_ConvertAmbig
1969 (const CSeq_data& in_seq,
1970 CSeq_data* out_seq,
1971 CSeq_data::E_Choice to_code,
1972 TSeqPos uBeginIdx,
1973 TSeqPos uLength,
1974 CRandom::TValue seed,
1975 TSeqPos total_length,
1976 TSeqPos* out_seq_length,
1977 vector<Uint4>* blast_ambig)
1978 const
1979 {
1980 CSeq_data::E_Choice from_code = in_seq.Which();
1981
1982 if(to_code == CSeq_data::e_not_set || from_code == CSeq_data::e_not_set)
1983 throw std::runtime_error("to_code or from_code not set");
1984
1985 if ( to_code != CSeq_data::e_Ncbi2na ) {
1986 throw std::runtime_error("to_code is not Ncbi2na");
1987 }
1988
1989 switch (from_code) {
1990 case CSeq_data::e_Iupacna:
1991 return MapIupacnaToNcbi2na(in_seq, out_seq, uBeginIdx, uLength, true,
1992 seed, total_length, out_seq_length,
1993 blast_ambig);
1994 case CSeq_data::e_Ncbi4na:
1995 return MapNcbi4naToNcbi2na(in_seq, out_seq, uBeginIdx, uLength, true,
1996 seed, total_length, out_seq_length,
1997 blast_ambig);
1998 default:
1999 throw runtime_error("Requested conversion not implemented");
2000 }
2001 }
2002
2003 // Convert from one coding scheme to another. The following
2004 // 12 conversions are supported: ncbi2na<=>ncbi4na;
2005 // ncbi2na<=>iupacna; ncbi4na<=>iupacna; ncbieaa<=>ncbistdaa;
2006 // ncbieaa<=>iupacaa; ncbistdaa<=>iupacaa. Convert is
2007 // really just a dispatch function--it calls the appropriate
2008 // priviate conversion function.
Convert(const CSeq_data & in_seq,CSeq_data * out_seq,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength,bool bAmbig,CRandom::TValue seed,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig) const2009 TSeqPos CSeqportUtil_implementation::Convert
2010 (const CSeq_data& in_seq,
2011 CSeq_data* out_seq,
2012 CSeq_data::E_Choice to_code,
2013 TSeqPos uBeginIdx,
2014 TSeqPos uLength,
2015 bool bAmbig,
2016 CRandom::TValue seed,
2017 TSeqPos total_length,
2018 TSeqPos* out_seq_length,
2019 vector<Uint4>* blast_ambig)
2020 const
2021 {
2022 CSeq_data::E_Choice from_code = in_seq.Which();
2023
2024 // adjust uLength
2025 if ( uLength == 0 ) {
2026 uLength = numeric_limits<TSeqPos>::max();
2027 }
2028
2029 if(to_code == CSeq_data::e_not_set || from_code == CSeq_data::e_not_set) {
2030 throw std::runtime_error("to_code or from_code not set");
2031 }
2032 if ( s_SeqDataToSeqUtil[to_code] == CSeqUtil::e_not_set ||
2033 s_SeqDataToSeqUtil[from_code] == CSeqUtil::e_not_set ) {
2034 throw runtime_error("Requested conversion not implemented");
2035 }
2036
2037 // Note: for now use old code to convert to ncbi2na with random
2038 // conversion of ambiguous characters.
2039 if ( (to_code == CSeq_data::e_Ncbi2na) && (bAmbig == true) ) {
2040 return x_ConvertAmbig(in_seq, out_seq, to_code, uBeginIdx, uLength,
2041 seed, total_length, out_seq_length, blast_ambig);
2042 }
2043
2044 const string* in_str = 0;
2045 const vector<char>* in_vec = 0;
2046
2047 x_GetSeqFromSeqData(in_seq, &in_str, &in_vec);
2048
2049 TSeqPos retval = 0;
2050 if ( in_str != 0 ) {
2051 string result;
2052 retval = CSeqConvert::Convert(*in_str, s_SeqDataToSeqUtil[from_code],
2053 uBeginIdx, uLength,
2054 result, s_SeqDataToSeqUtil[to_code]);
2055 CSeq_data temp(result, to_code);
2056 out_seq->Assign(temp);
2057 } else if ( in_vec != 0 ) {
2058 vector<char> result;
2059 retval = CSeqConvert::Convert(*in_vec, s_SeqDataToSeqUtil[from_code],
2060 uBeginIdx, uLength,
2061 result, s_SeqDataToSeqUtil[to_code]);
2062 CSeq_data temp(result, to_code);
2063 out_seq->Assign(temp);
2064 }
2065 return retval;
2066 }
2067
2068
2069 // Provide maximum packing without loss of information
Pack(CSeq_data * in_seq,TSeqPos uLength) const2070 TSeqPos CSeqportUtil_implementation::Pack
2071 (CSeq_data* in_seq,
2072 TSeqPos uLength)
2073 const
2074 {
2075 _ASSERT(in_seq != 0);
2076
2077 CSeq_data::E_Choice from_code = in_seq->Which();
2078 _ASSERT(from_code != CSeq_data::e_not_set);
2079
2080 if ( s_SeqDataToSeqUtil[from_code] == CSeqUtil::e_not_set ) {
2081 throw runtime_error("Unable tp pack requested coding");
2082 }
2083
2084
2085 // nothing to pack for proteins
2086 switch ( from_code ) {
2087 case CSeq_data::e_Iupacaa:
2088 return in_seq->GetIupacaa().Get().size();
2089 case CSeq_data::e_Ncbi8aa:
2090 return in_seq->GetNcbi8aa().Get().size();
2091 case CSeq_data::e_Ncbieaa:
2092 return in_seq->GetNcbieaa().Get().size();
2093 case CSeq_data::e_Ncbipaa:
2094 return in_seq->GetNcbipaa().Get().size();
2095 case CSeq_data::e_Ncbistdaa:
2096 return in_seq->GetNcbistdaa().Get().size();
2097 default:
2098 break;
2099 }
2100 // nothing to convert
2101 if ( from_code == CSeq_data::e_Ncbi2na &&
2102 in_seq->GetNcbi2na().Get().size() * 4 <= uLength ) {
2103 return in_seq->GetNcbi2na().Get().size() * 4;
2104 }
2105
2106 const string* in_str = 0;
2107 const vector<char>* in_vec = 0;
2108
2109 x_GetSeqFromSeqData(*in_seq, &in_str, &in_vec);
2110
2111 vector<char> out_vec;
2112 CSeqUtil::TCoding coding = CSeqUtil::e_not_set;
2113
2114 TSeqPos retval = 0;
2115 if ( in_str != 0 ) {
2116 retval =
2117 CSeqConvert::Pack(*in_str, s_SeqDataToSeqUtil[from_code],
2118 out_vec, coding, uLength);
2119 } else if ( in_vec != 0 ) {
2120 retval =
2121 CSeqConvert::Pack(*in_vec, s_SeqDataToSeqUtil[from_code],
2122 out_vec, coding, uLength);
2123 }
2124
2125 switch (coding) {
2126 case CSeqUtil::e_Ncbi2na:
2127 in_seq->SetNcbi2na().Set(out_vec);
2128 break;
2129 case CSeqUtil::e_Ncbi4na:
2130 in_seq->SetNcbi4na().Set(out_vec);
2131 break;
2132 default:
2133 _TROUBLE;
2134 }
2135
2136 return retval;
2137 }
2138
2139
2140 // Method to quickly validate that a CSeq_data object contains valid data.
2141 // FastValidate is a dispatch function that calls the appropriate
2142 // private fast validation function.
FastValidate(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2143 bool CSeqportUtil_implementation::FastValidate
2144 (const CSeq_data& in_seq,
2145 TSeqPos uBeginIdx,
2146 TSeqPos uLength)
2147 const
2148 {
2149 switch (in_seq.Which()) {
2150 case CSeq_data::e_Ncbi2na:
2151 return true; // ncbi2na sequences are always valid
2152 case CSeq_data::e_Ncbi4na:
2153 return true; // ncbi4na sequences are always valid
2154 case CSeq_data::e_Iupacna:
2155 return FastValidateIupacna(in_seq, uBeginIdx, uLength);
2156 case CSeq_data::e_Ncbieaa:
2157 return FastValidateNcbieaa(in_seq, uBeginIdx, uLength);
2158 case CSeq_data::e_Ncbistdaa:
2159 return FastValidateNcbistdaa(in_seq, uBeginIdx, uLength);
2160 case CSeq_data::e_Iupacaa:
2161 return FastValidateIupacaa(in_seq, uBeginIdx, uLength);
2162 default:
2163 throw runtime_error("Sequence could not be validated");
2164 }
2165 }
2166
2167
2168 // Function to perform full validation. Validate is a
2169 // dispatch function that calls the appropriate private
2170 // validation function.
Validate(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const2171 void CSeqportUtil_implementation::Validate
2172 (const CSeq_data& in_seq,
2173 vector<TSeqPos>* badIdx,
2174 TSeqPos uBeginIdx,
2175 TSeqPos uLength)
2176 const
2177 {
2178 switch (in_seq.Which()) {
2179 case CSeq_data::e_Ncbi2na:
2180 return; // ncbi2na sequences are always valid
2181 case CSeq_data::e_Ncbi4na:
2182 return; // ncbi4na sequences are always valid
2183 case CSeq_data::e_Iupacna:
2184 ValidateIupacna(in_seq, badIdx, uBeginIdx, uLength);
2185 break;
2186 case CSeq_data::e_Ncbieaa:
2187 ValidateNcbieaa(in_seq, badIdx, uBeginIdx, uLength);
2188 break;
2189 case CSeq_data::e_Ncbistdaa:
2190 ValidateNcbistdaa(in_seq, badIdx, uBeginIdx, uLength);
2191 break;
2192 case CSeq_data::e_Iupacaa:
2193 ValidateIupacaa(in_seq, badIdx, uBeginIdx, uLength);
2194 break;
2195 default:
2196 throw runtime_error("Sequence could not be validated");
2197 }
2198 }
2199
2200
2201 // Function to find ambiguous bases and vector of indices of
2202 // ambiguous bases in CSeq_data objects. GetAmbigs is a
2203 // dispatch function that calls the appropriate private get
2204 // ambigs function.
GetAmbigs(const CSeq_data & in_seq,CSeq_data * out_seq,vector<TSeqPos> * out_indices,CSeq_data::E_Choice to_code,TSeqPos uBeginIdx,TSeqPos uLength) const2205 TSeqPos CSeqportUtil_implementation::GetAmbigs
2206 (const CSeq_data& in_seq,
2207 CSeq_data* out_seq,
2208 vector<TSeqPos>* out_indices,
2209 CSeq_data::E_Choice to_code,
2210 TSeqPos uBeginIdx,
2211 TSeqPos uLength)
2212 const
2213 {
2214
2215 // Determine and call applicable GetAmbig method.
2216 switch (in_seq.Which()) {
2217 case CSeq_data::e_Ncbi4na:
2218 switch (to_code) {
2219 case CSeq_data::e_Ncbi2na:
2220 return GetAmbigs_ncbi4na_ncbi2na(in_seq, out_seq, out_indices,
2221 uBeginIdx, uLength);
2222 default:
2223 return 0;
2224 }
2225 case CSeq_data::e_Iupacna:
2226 switch (to_code) {
2227 case CSeq_data::e_Ncbi2na:
2228 return GetAmbigs_iupacna_ncbi2na(in_seq, out_seq, out_indices,
2229 uBeginIdx, uLength);
2230 default:
2231 return 0;
2232 }
2233 default:
2234 return 0;
2235 }
2236 }
2237
2238
2239 // Get a copy of in_seq from uBeginIdx through uBeginIdx + uLength-1
2240 // and put in out_seq. See comments in alphabet.hpp for more information.
2241 // GetCopy is a dispatch function.
GetCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2242 TSeqPos CSeqportUtil_implementation::GetCopy
2243 (const CSeq_data& in_seq,
2244 CSeq_data* out_seq,
2245 TSeqPos uBeginIdx,
2246 TSeqPos uLength)
2247 const
2248 {
2249 // Do processing based on in_seq type
2250 switch (in_seq.Which()) {
2251 case CSeq_data::e_Ncbi2na:
2252 return GetNcbi2naCopy(in_seq, out_seq, uBeginIdx, uLength);
2253 case CSeq_data::e_Ncbi4na:
2254 return GetNcbi4naCopy(in_seq, out_seq, uBeginIdx, uLength);
2255 case CSeq_data::e_Iupacna:
2256 return GetIupacnaCopy(in_seq, out_seq, uBeginIdx, uLength);
2257 case CSeq_data::e_Ncbieaa:
2258 return GetNcbieaaCopy(in_seq, out_seq, uBeginIdx, uLength);
2259 case CSeq_data::e_Ncbistdaa:
2260 return GetNcbistdaaCopy(in_seq, out_seq, uBeginIdx, uLength);
2261 case CSeq_data::e_Iupacaa:
2262 return GetIupacaaCopy(in_seq, out_seq, uBeginIdx, uLength);
2263 default:
2264 throw runtime_error
2265 ("GetCopy() is not implemented for the requested sequence type");
2266 }
2267 }
2268
2269
2270
2271
2272 // Method to keep only a contiguous piece of a sequence beginning
2273 // at uBeginIdx and uLength residues long. Keep is a
2274 // dispatch function.
Keep(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2275 TSeqPos CSeqportUtil_implementation::Keep
2276 (CSeq_data* in_seq,
2277 TSeqPos uBeginIdx,
2278 TSeqPos uLength)
2279 const
2280 {
2281 // Do proceessing based upon in_seq type
2282 switch (in_seq->Which()) {
2283 case CSeq_data::e_Ncbi2na:
2284 return KeepNcbi2na(in_seq, uBeginIdx, uLength);
2285 case CSeq_data::e_Ncbi4na:
2286 return KeepNcbi4na(in_seq, uBeginIdx, uLength);
2287 case CSeq_data::e_Iupacna:
2288 return KeepIupacna(in_seq, uBeginIdx, uLength);
2289 case CSeq_data::e_Ncbieaa:
2290 return KeepNcbieaa(in_seq, uBeginIdx, uLength);
2291 case CSeq_data::e_Ncbistdaa:
2292 return KeepNcbistdaa(in_seq, uBeginIdx, uLength);
2293 case CSeq_data::e_Iupacaa:
2294 return KeepIupacaa(in_seq, uBeginIdx, uLength);
2295 default:
2296 throw runtime_error("Cannot perform Keep on in_seq type.");
2297 }
2298 }
2299
2300
2301 // Append in_seq2 to in_seq1 and put result in out_seq. This
2302 // is a dispatch function.
Append(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const2303 TSeqPos CSeqportUtil_implementation::Append
2304 (CSeq_data* out_seq,
2305 const CSeq_data& in_seq1,
2306 TSeqPos uBeginIdx1,
2307 TSeqPos uLength1,
2308 const CSeq_data& in_seq2,
2309 TSeqPos uBeginIdx2,
2310 TSeqPos uLength2)
2311 const
2312 {
2313 // Check that in_seqs or of same type
2314 if(in_seq1.Which() != in_seq2.Which())
2315 throw runtime_error("Append in_seq types do not match.");
2316
2317 // Check that out_seq is not null
2318 if(!out_seq) {
2319 return 0;
2320 }
2321
2322 // Call applicable append method base on in_seq types
2323 switch (in_seq1.Which()) {
2324 case CSeq_data::e_Iupacna:
2325 return AppendIupacna(out_seq, in_seq1, uBeginIdx1, uLength1,
2326 in_seq2, uBeginIdx2, uLength2);
2327 case CSeq_data::e_Ncbi2na:
2328 return AppendNcbi2na(out_seq, in_seq1, uBeginIdx1, uLength1,
2329 in_seq2, uBeginIdx2, uLength2);
2330 case CSeq_data::e_Ncbi4na:
2331 return AppendNcbi4na(out_seq, in_seq1, uBeginIdx1, uLength1,
2332 in_seq2, uBeginIdx2, uLength2);
2333 case CSeq_data::e_Ncbieaa:
2334 return AppendNcbieaa(out_seq, in_seq1, uBeginIdx1, uLength1,
2335 in_seq2, uBeginIdx2, uLength2);
2336 case CSeq_data::e_Ncbistdaa:
2337 return AppendNcbistdaa(out_seq, in_seq1, uBeginIdx1, uLength1,
2338 in_seq2, uBeginIdx2, uLength2);
2339 case CSeq_data::e_Iupacaa:
2340 return AppendIupacaa(out_seq, in_seq1, uBeginIdx1, uLength1,
2341 in_seq2, uBeginIdx2, uLength2);
2342 default:
2343 throw runtime_error("Append for in_seq type not supported.");
2344 }
2345 }
2346
2347
2348 // Methods to complement na sequences. These are
2349 // dispatch functions.
2350
2351 // Method to complement na sequence in place
Complement(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2352 TSeqPos CSeqportUtil_implementation::Complement
2353 (CSeq_data* in_seq,
2354 TSeqPos uBeginIdx,
2355 TSeqPos uLength)
2356 const
2357 {
2358 _ASSERT(in_seq != 0);
2359
2360 CSeq_data complement;
2361 TSeqPos retval = Complement(*in_seq, &complement, uBeginIdx, uLength);
2362 in_seq->Assign(complement);
2363
2364 return retval;
2365 }
2366
2367
2368 // Method to complement na sequence in a copy out_seq
Complement(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2369 TSeqPos CSeqportUtil_implementation::Complement
2370 (const CSeq_data& in_seq,
2371 CSeq_data* out_seq,
2372 TSeqPos uBeginIdx,
2373 TSeqPos uLength)
2374 const
2375 {
2376 _ASSERT(out_seq != 0);
2377
2378 if ( uLength == 0 ) {
2379 uLength = numeric_limits<TSeqPos>::max();
2380 }
2381 CSeq_data::E_Choice in_code = in_seq.Which();
2382 _ASSERT(in_code != CSeq_data::e_not_set);
2383
2384 const string* in_str = 0;
2385 const vector<char>* in_vec = 0;
2386 x_GetSeqFromSeqData(in_seq, &in_str, &in_vec);
2387
2388 TSeqPos retval = 0;
2389 if ( in_str ) {
2390 string out_str;
2391 retval = CSeqManip::Complement(*in_str, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_str);
2392 CSeq_data temp(out_str, in_code);
2393 out_seq->Assign(temp);
2394 } else if (in_vec != 0) {
2395 vector<char> out_vec;
2396 retval = CSeqManip::Complement(*in_vec, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_vec);
2397 CSeq_data temp(out_vec, in_code);
2398 out_seq->Assign(temp);
2399 }
2400 return retval;
2401 }
2402
2403
2404 // Methods to reverse na sequences. These are
2405 // dispatch functions.
2406
2407 // Method to reverse na sequence in place
Reverse(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2408 TSeqPos CSeqportUtil_implementation::Reverse
2409 (CSeq_data* in_seq,
2410 TSeqPos uBeginIdx,
2411 TSeqPos uLength)
2412 const
2413 {
2414 CSeq_data temp;
2415 TSeqPos retval = Reverse(*in_seq, &temp, uBeginIdx, uLength);
2416 in_seq->Assign(temp);
2417
2418 return retval;
2419 }
2420
2421
2422 // Method to reverse na sequence in a copy out_seq
Reverse(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2423 TSeqPos CSeqportUtil_implementation::Reverse
2424 (const CSeq_data& in_seq,
2425 CSeq_data* out_seq,
2426 TSeqPos uBeginIdx,
2427 TSeqPos uLength)
2428 const
2429 {
2430 _ASSERT(out_seq != 0);
2431
2432 if ( uLength == 0 ) {
2433 uLength = numeric_limits<TSeqPos>::max();
2434 }
2435
2436 CSeq_data::E_Choice in_code = in_seq.Which();
2437 _ASSERT(in_code != CSeq_data::e_not_set);
2438
2439 const string* in_str = 0;
2440 const vector<char>* in_vec = 0;
2441 x_GetSeqFromSeqData(in_seq, &in_str, &in_vec);
2442
2443 TSeqPos retval = 0;
2444 if ( in_str ) {
2445 string out_str;
2446 retval = CSeqManip::Reverse(*in_str, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_str);
2447 CSeq_data temp(out_str, in_code);
2448 out_seq->Assign(temp);
2449 } else if (in_vec != NULL) {
2450 vector<char> out_vec;
2451 retval = CSeqManip::Reverse(*in_vec, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_vec);
2452 CSeq_data temp(out_vec, in_code);
2453 out_seq->Assign(temp);
2454 }
2455 return retval;
2456 }
2457
2458
2459
2460 // Methods to reverse-complement a sequence. These are
2461 // dispatch functions.
2462
2463 // Method to reverse-complement na sequence in place
ReverseComplement(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2464 TSeqPos CSeqportUtil_implementation::ReverseComplement
2465 (CSeq_data* in_seq,
2466 TSeqPos uBeginIdx,
2467 TSeqPos uLength)
2468 const
2469 {
2470 _ASSERT(in_seq != 0);
2471
2472 CSeq_data::E_Choice in_code = in_seq->Which();
2473 _ASSERT(in_code != CSeq_data::e_not_set);
2474
2475 string* in_str = 0;
2476 vector<char>* in_vec = 0;
2477 x_GetSeqFromSeqData(*in_seq, &in_str, &in_vec);
2478
2479 if ( in_str ) {
2480 return CSeqManip::ReverseComplement(*in_str, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength);
2481 } else if (in_vec != NULL) {
2482 return CSeqManip::ReverseComplement(*in_vec, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength);
2483 } else {
2484 return 0;
2485 }
2486 }
2487
2488
2489 // Method to reverse-complement na sequence in a copy out_seq
ReverseComplement(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const2490 TSeqPos CSeqportUtil_implementation::ReverseComplement
2491 (const CSeq_data& in_seq,
2492 CSeq_data* out_seq,
2493 TSeqPos uBeginIdx,
2494 TSeqPos uLength)
2495 const
2496 {
2497 _ASSERT(out_seq != 0);
2498
2499 if ( uLength == 0 ) {
2500 uLength = numeric_limits<TSeqPos>::max();
2501 }
2502
2503 CSeq_data::E_Choice in_code = in_seq.Which();
2504 _ASSERT(in_code != CSeq_data::e_not_set);
2505
2506 const string* in_str = 0;
2507 const vector<char>* in_vec = 0;
2508 x_GetSeqFromSeqData(in_seq, &in_str, &in_vec);
2509
2510 TSeqPos retval = 0;
2511 if ( in_str ) {
2512 string out_str;
2513 retval = CSeqManip::ReverseComplement(*in_str, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_str);
2514 CSeq_data temp(out_str, in_code);
2515 out_seq->Assign(temp);
2516 } else if (in_vec != NULL) {
2517 vector<char> out_vec;
2518 retval = CSeqManip::ReverseComplement(*in_vec, s_SeqDataToSeqUtil[in_code], uBeginIdx, uLength, out_vec);
2519 CSeq_data temp(out_vec, in_code);
2520 out_seq->Assign(temp);
2521 }
2522
2523 return retval;
2524 }
2525
2526
2527 // Implement private worker functions called by public
2528 // dispatch functions.
2529
2530 // Methods to convert between coding schemes
2531
2532 /*
2533 // Convert in_seq from ncbi2na (1 byte) to iupacna (4 bytes)
2534 // and put result in out_seq
2535 TSeqPos CSeqportUtil_implementation::MapNcbi2naToIupacna
2536 (const CSeq_data& in_seq,
2537 CSeq_data* out_seq,
2538 TSeqPos uBeginIdx,
2539 TSeqPos uLength)
2540 const
2541 {
2542 // Save uBeginIdx and uLength for later use
2543 TSeqPos uBeginSav = uBeginIdx;
2544 TSeqPos uLenSav = uLength;
2545
2546 // Get vector holding the in sequence
2547 const vector<char>& in_seq_data = in_seq.GetNcbi2na().Get();
2548
2549 // Get string where the out sequence will go
2550 out_seq->Reset();
2551 string& out_seq_data = out_seq->SetIupacna().Set();
2552
2553 // Validate uBeginSav
2554 if(uBeginSav >= 4*in_seq_data.size())
2555 return 0;
2556
2557 // Adjust uLenSav
2558 if((uLenSav == 0 )|| ((uLenSav + uBeginSav )> 4*in_seq_data.size()))
2559 uLenSav = 4*in_seq_data.size() - uBeginSav;
2560
2561
2562 // Adjust uBeginIdx and uLength, if necessary
2563 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 4, 1);
2564
2565 // Declare iterator for in_seq
2566 vector<char>::const_iterator i_in;
2567
2568 // Allocate string memory for result of conversion
2569 out_seq_data.resize(uLenSav);
2570
2571 // Get pointer to data of out_seq_data (a string)
2572 string::iterator i_out = out_seq_data.begin()-1;
2573
2574 // Determine begin and end bytes of in_seq_data
2575 vector<char>::const_iterator i_in_begin =
2576 in_seq_data.begin() + uBeginIdx/4;
2577 vector<char>::const_iterator i_in_end = i_in_begin + uLength/4;
2578 if((uLength % 4) != 0) ++i_in_end;
2579 --i_in_end;
2580
2581 // Handle first input sequence byte
2582 unsigned int uVal =
2583 m_FastNcbi2naIupacna->m_Table[static_cast<unsigned char>(*i_in_begin)];
2584 char *pchar, *pval;
2585 pval = reinterpret_cast<char*>(&uVal);
2586 for(pchar = pval + uBeginSav - uBeginIdx; pchar < pval + 4; ++pchar)
2587 *(++i_out) = *pchar;
2588
2589 if(i_in_begin == i_in_end)
2590 return uLenSav;
2591 ++i_in_begin;
2592
2593 // Loop through in_seq_data and convert to out_seq
2594 for(i_in = i_in_begin; i_in != i_in_end; ++i_in) {
2595 uVal =
2596 m_FastNcbi2naIupacna->m_Table[static_cast<unsigned char>(*i_in)];
2597 pchar = reinterpret_cast<char*>(&uVal);
2598 (*(++i_out)) = (*(pchar++));
2599 (*(++i_out)) = (*(pchar++));
2600 (*(++i_out)) = (*(pchar++));
2601 (*(++i_out)) = (*(pchar++));
2602 }
2603
2604 // Handle last byte of input data
2605 uVal =
2606 m_FastNcbi2naIupacna->m_Table[static_cast<unsigned char>(*i_in_end)];
2607 pval = reinterpret_cast<char*>(&uVal);
2608 TSeqPos uOverhang = (uBeginSav + uLenSav) % 4;
2609 uOverhang = (uOverhang ==0) ? 4 : uOverhang;
2610 for(pchar = pval; pchar < pval + uOverhang; ++pchar) {
2611 (*(++i_out)) = *pchar;
2612 }
2613
2614 return uLenSav;
2615 }
2616
2617
2618 // Convert in_seq from ncbi2na (1 byte) to ncbi4na (2 bytes)
2619 // and put result in out_seq
2620 TSeqPos CSeqportUtil_implementation::MapNcbi2naToNcbi4na
2621 (const CSeq_data& in_seq,
2622 CSeq_data* out_seq,
2623 TSeqPos uBeginIdx,
2624 TSeqPos uLength)
2625 const
2626 {
2627 // Get vector holding the in sequence
2628 const vector<char>& in_seq_data = in_seq.GetNcbi2na().Get();
2629
2630 // Get vector where out sequence will go
2631 out_seq->Reset();
2632 vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
2633
2634 // Save uBeginIdx and uLength for later use as they
2635 // are modified below
2636 TSeqPos uBeginSav = uBeginIdx;
2637 TSeqPos uLenSav = uLength;
2638
2639 // Check that uBeginSav is not beyond end of in_seq_data
2640 if(uBeginSav >= 4*in_seq_data.size())
2641 return 0;
2642
2643 // Adjust uLenSav
2644 if((uLenSav == 0) || ((uBeginSav + uLenSav) > 4*in_seq_data.size()))
2645 uLenSav = 4*in_seq_data.size() - uBeginSav;
2646
2647
2648 // Adjust uBeginIdx and uLength, if necessary
2649 TSeqPos uOverhang =
2650 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 4, 2);
2651
2652 // Declare iterator for in_seq
2653 vector<char>::const_iterator i_in;
2654
2655 // Allocate memory for out_seq_data
2656 TSeqPos uInBytes = (uLength + uOverhang)/4;
2657 if(((uLength + uOverhang) % 4) != 0) uInBytes++;
2658 vector<char>::size_type nOutBytes = 2*uInBytes;
2659 out_seq_data.resize(nOutBytes);
2660
2661 // Get an iterator of out_seq_data
2662 vector<char>::iterator i_out = out_seq_data.begin()-1;
2663
2664 // Determine begin and end bytes of in_seq_data
2665 vector<char>::const_iterator i_in_begin =
2666 in_seq_data.begin() + uBeginIdx/4;
2667 vector<char>::const_iterator i_in_end = i_in_begin + uInBytes;
2668
2669 // Loop through in_seq_data and convert to out_seq_data
2670 for(i_in = i_in_begin; i_in != i_in_end; ++i_in) {
2671 unsigned short uVal =
2672 m_FastNcbi2naNcbi4na->m_Table[static_cast<unsigned char>(*i_in)];
2673 char* pch = reinterpret_cast<char*>(&uVal);
2674 (*(++i_out)) = (*(pch++));
2675 (*(++i_out)) = (*(pch++));
2676 }
2677 TSeqPos keepidx = uBeginSav - uBeginIdx;
2678 KeepNcbi4na(out_seq, keepidx, uLenSav);
2679
2680 return uLenSav;
2681 }
2682
2683
2684 // Convert in_seq from ncbi4na (1 byte) to iupacna (2 bytes)
2685 // and put result in out_seq
2686 TSeqPos CSeqportUtil_implementation::MapNcbi4naToIupacna
2687 (const CSeq_data& in_seq,
2688 CSeq_data* out_seq,
2689 TSeqPos uBeginIdx,
2690 TSeqPos uLength)
2691 const
2692 {
2693 // Save uBeginIdx and uLength for later use
2694 TSeqPos uBeginSav = uBeginIdx;
2695 TSeqPos uLenSav = uLength;
2696
2697 // Get vector holding the in sequence
2698 const vector<char>& in_seq_data = in_seq.GetNcbi4na().Get();
2699
2700 // Get string where the out sequence will go
2701 out_seq->Reset();
2702 string& out_seq_data = out_seq->SetIupacna().Set();
2703
2704 // Validate uBeginSav
2705 if(uBeginSav >= 2*in_seq_data.size())
2706 return 0;
2707
2708 // Adjust uLenSav
2709 if((uLenSav == 0 )|| ((uLenSav + uBeginSav )> 2*in_seq_data.size()))
2710 uLenSav = 2*in_seq_data.size() - uBeginSav;
2711
2712
2713 // Adjust uBeginIdx and uLength, if necessary
2714 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 2, 1);
2715
2716 // Declare iterator for in_seq
2717 vector<char>::const_iterator i_in;
2718
2719 // Allocate string memory for result of conversion
2720 out_seq_data.resize(uLenSav);
2721
2722 // Get pointer to data of out_seq_data (a string)
2723 string::iterator i_out = out_seq_data.begin() - 1;
2724
2725 // Determine begin and end bytes of in_seq_data
2726 vector<char>::const_iterator i_in_begin =
2727 in_seq_data.begin() + uBeginIdx/2;
2728 vector<char>::const_iterator i_in_end = i_in_begin + uLength/2;
2729 if((uLength % 2) != 0) ++i_in_end;
2730 --i_in_end;
2731
2732 // Handle first input sequence byte
2733 unsigned short uVal =
2734 m_FastNcbi4naIupacna->m_Table[static_cast<unsigned char>(*i_in_begin)];
2735 char *pchar, *pval;
2736 pval = reinterpret_cast<char*>(&uVal);
2737 for(pchar = pval + uBeginSav - uBeginIdx; pchar < pval + 2; ++pchar)
2738 *(++i_out) = *pchar;
2739
2740 if(i_in_begin == i_in_end)
2741 return uLenSav;
2742 ++i_in_begin;
2743
2744 // Loop through in_seq_data and convert to out_seq
2745 for(i_in = i_in_begin; i_in != i_in_end; ++i_in) {
2746 uVal =
2747 m_FastNcbi4naIupacna->m_Table[static_cast<unsigned char>(*i_in)];
2748 pchar = reinterpret_cast<char*>(&uVal);
2749 (*(++i_out)) = (*(pchar++));
2750 (*(++i_out)) = (*(pchar++));
2751 }
2752
2753 // Handle last byte of input data
2754 uVal =
2755 m_FastNcbi4naIupacna->m_Table[static_cast<unsigned char>(*i_in_end)];
2756 pval = reinterpret_cast<char*>(&uVal);
2757 TSeqPos uOverhang = (uBeginSav + uLenSav) % 2;
2758 uOverhang = (uOverhang ==0) ? 2 : uOverhang;
2759 for(pchar = pval; pchar < pval + uOverhang; ++pchar)
2760 (*(++i_out)) = *pchar;
2761
2762 return uLenSav;
2763 }
2764 */
2765
2766 // Table for quick check of whether an ncbi4na residue represents an ambiguity.
2767 // The 0 value is not considered an ambiguity, as it represents the end of
2768 // buffer.
2769 static const char kAmbig4na[16] =
2770 { 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 };
2771
2772 class CAmbiguityContext {
2773 public:
2774 CAmbiguityContext(vector<Uint4>& amb_buff, int seq_length);
2775 // Make sure the vector is not freed in the destructor
~CAmbiguityContext()2776 ~CAmbiguityContext() {}
2777 void UpdateBuffer();
2778 void AddAmbiguity(char in_byte, TSeqPos& seq_pos);
2779 void Finish();
2780 private:
2781 vector<Uint4>& m_vAmbBuf; ///< Ambiguity buffer to fill
2782 char m_LastAmbChar; ///< Last previous ambiguity character
2783 TSeqPos m_AmbCount;
2784 TSeqPos m_AmbStart;
2785 int m_BuffPos;
2786 bool m_bLongFormat;
2787 TSeqPos m_MaxAmbCount;
2788 };
2789
CAmbiguityContext(vector<Uint4> & amb_buff,int seq_length)2790 CAmbiguityContext::CAmbiguityContext(vector<Uint4>& amb_buff, int seq_length)
2791 : m_vAmbBuf(amb_buff)
2792 {
2793 m_AmbCount = 0;
2794 m_AmbStart = 0;
2795 m_BuffPos = 0;
2796 m_LastAmbChar = 0;
2797 m_bLongFormat = (seq_length >= 0x00ffffff);
2798 m_MaxAmbCount = (m_bLongFormat ? 0x00000fff : 0x0000000f);
2799 // If "long format", set the top bit in the length element of the
2800 // ambiguity vector, but only if the input vector is empty. Otherwise,
2801 // assume that this initialization has already been done in the previous
2802 // invocation.
2803 if (m_vAmbBuf.size() == 0) {
2804 Uint4 amb_len = (m_bLongFormat ? 0x80000000 : 0);
2805 m_vAmbBuf.push_back(amb_len);
2806 }
2807 }
2808
2809
UpdateBuffer()2810 void CAmbiguityContext::UpdateBuffer()
2811 {
2812 // If there are no more unprocessed ambiguities, return.
2813 if (!m_LastAmbChar)
2814 return;
2815
2816 Uint4 amb_element = m_LastAmbChar << 28;
2817 // In long format length occupies bits 16-27, and sequence position is stored
2818 // in the next integer element. In short format length occupies bits 24-27,
2819 // and sequence position is stored in the same integer element.
2820 if (m_bLongFormat) {
2821 amb_element |= (m_AmbCount << 16);
2822 m_vAmbBuf.push_back(amb_element);
2823 m_vAmbBuf.push_back(m_AmbStart);
2824 } else {
2825 amb_element |= (m_AmbCount << 24);
2826 amb_element |= m_AmbStart;
2827 m_vAmbBuf.push_back(amb_element);
2828 }
2829 }
2830
AddAmbiguity(char in_byte,TSeqPos & seq_pos)2831 void CAmbiguityContext::AddAmbiguity(char in_byte, TSeqPos& seq_pos)
2832 {
2833 char res[2];
2834
2835 res[0] = (in_byte >> 4) & 0x0f;
2836 res[1] = in_byte & 0x0f;
2837
2838 for (int i = 0; i < 2; ++i, ++seq_pos) {
2839 if (kAmbig4na[(int)res[i]]) {
2840 if ((res[i] != m_LastAmbChar) || (m_AmbCount >= m_MaxAmbCount)) {
2841 // Finish the previous ambiguity element, start new;
2842 UpdateBuffer();
2843 m_LastAmbChar = res[i];
2844 m_AmbCount = 0;
2845 m_AmbStart = seq_pos;
2846 } else {
2847 // Just increment the count for the last ambiguity
2848 ++m_AmbCount;
2849 }
2850 } else {
2851 // No ambiguity: finish the previous ambiguity element, if any,
2852 // reset the m_LastAmbChar and count.
2853 UpdateBuffer();
2854 m_LastAmbChar = 0;
2855 m_AmbCount = 0;
2856 }
2857 }
2858 }
2859
Finish()2860 void CAmbiguityContext::Finish()
2861 {
2862 UpdateBuffer();
2863 // In the first element of the vector, preserve the top bit, and reset the
2864 // remainder to the number of ambiguity entries.
2865 m_vAmbBuf[0] =
2866 (m_vAmbBuf[0] & 0x80000000) | ((m_vAmbBuf.size() - 1) & 0x7fffffff);
2867 }
2868
2869 // Function to convert iupacna (4 bytes) to ncbi2na (1 byte)
MapIupacnaToNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength,bool bAmbig,CRandom::TValue seed,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig) const2870 TSeqPos CSeqportUtil_implementation::MapIupacnaToNcbi2na
2871 (const CSeq_data& in_seq,
2872 CSeq_data* out_seq,
2873 TSeqPos uBeginIdx,
2874 TSeqPos uLength,
2875 bool bAmbig,
2876 CRandom::TValue seed,
2877 TSeqPos total_length,
2878 TSeqPos* out_seq_length,
2879 vector<Uint4>* blast_ambig)
2880 const
2881 {
2882 // Get string holding the in_seq
2883 const string& in_seq_data = in_seq.GetIupacna().Get();
2884
2885 // Out sequence may contain unfinished byte from the previous segment
2886 if (out_seq_length != nullptr && *out_seq_length == 0)
2887 out_seq->Reset();
2888 // Get vector where the out sequence will go
2889 vector<char>& out_seq_data = out_seq->SetNcbi2na().Set();
2890
2891 // If uBeginIdx is after end of in_seq, return
2892 if(uBeginIdx >= in_seq_data.size())
2893 return 0;
2894
2895 // Determine return value
2896 TSeqPos uLenSav = uLength;
2897 if((uLenSav == 0) || ((uLenSav + uBeginIdx)) > in_seq_data.size())
2898 uLenSav = in_seq_data.size() - uBeginIdx;
2899
2900
2901 // Adjust uBeginIdx and uLength, if necessary and get uOverhang
2902 TSeqPos uOverhang =
2903 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 4);
2904
2905 // Check if the output sequence data has already been filled
2906 // with some previous data, e.g. previous segment of a delta
2907 // sequence.
2908 TSeqPos out_seq_pos = 0;
2909 if (out_seq_length) {
2910 out_seq_pos = *out_seq_length;
2911 *out_seq_length += uLenSav;
2912 }
2913 TSeqPos rbit = 2*(out_seq_pos % 4);
2914 TSeqPos lbit = 8 - rbit;
2915
2916 // Allocate vector memory for result of conversion
2917 // Note memory for overhang is added below.
2918 vector<char>::size_type nBytes = (out_seq_pos + uLenSav + 3) / 4;
2919 out_seq_data.resize(nBytes);
2920
2921 // Instantiate an ambiguity context object, if BLAST-style
2922 // ambiguity output is requested.
2923 auto_ptr<CAmbiguityContext> amb_context(NULL);
2924 if (blast_ambig) {
2925 amb_context.reset(new CAmbiguityContext(*blast_ambig, total_length));
2926 }
2927
2928 // Declare iterator for out_seq_data and determine begin and end
2929 vector<char>::iterator i_out;
2930 vector<char>::iterator i_out_begin = out_seq_data.begin() + out_seq_pos/4;
2931 vector<char>::iterator i_out_end = i_out_begin + uLength/4;
2932
2933 // Determine begin of in_seq_data
2934 string::const_iterator i_in = in_seq_data.begin() + uBeginIdx;
2935
2936 char new_byte;
2937 const int kOneByteMask = 0xff;
2938
2939 if(bAmbig)
2940 {
2941 // Do random disambiguation
2942 unsigned char c1, c2;
2943 CRandom::TValue rv;
2944
2945 // Declare a random number generator and set seed
2946 CRandom rg;
2947 rg.SetSeed(seed);
2948
2949 // Do disambiguation by converting Iupacna to Ncbi4na
2950 // deterministically and then converting from Ncbi4na to Ncbi2na
2951 // with random disambiguation
2952
2953 // Loop through the out_seq_data converting 4 Iupacna bytes to
2954 // one Ncbi2na byte. in_seq_data.size() % 4 bytes at end of
2955 // input handled separately below.
2956 for(i_out = i_out_begin; i_out != i_out_end; )
2957 {
2958 // Determine first Ncbi4na byte from 1st two Iupacna bytes
2959 c1 =
2960 m_FastIupacnaNcbi4na->m_Table
2961 [0][static_cast<unsigned char>(*i_in)] |
2962 m_FastIupacnaNcbi4na->m_Table
2963 [1][static_cast<unsigned char>(*(i_in+1))];
2964
2965 // Determine second Ncbi4na byte from 2nd two Iupacna bytes
2966 c2 =
2967 m_FastIupacnaNcbi4na->m_Table
2968 [0][static_cast<unsigned char>(*(i_in+2))]|
2969 m_FastIupacnaNcbi4na->m_Table
2970 [1][static_cast<unsigned char>(*(i_in+3))];
2971
2972 if (blast_ambig) {
2973 amb_context->AddAmbiguity(c1, out_seq_pos);
2974 amb_context->AddAmbiguity(c2, out_seq_pos);
2975 }
2976
2977 // Randomly pick disambiguated Ncbi4na bytes
2978 rv = rg.GetRand() % 16;
2979 c1 &= m_Masks->m_Table[c1].cMask[rv];
2980 rv = rg.GetRand() % 16;
2981 c2 &= m_Masks->m_Table[c2].cMask[rv];
2982
2983 // Convert from Ncbi4na to Ncbi2na
2984 // Calculate the new byte. Assign parts of it to the
2985 // remainder of the current output byte, and the
2986 // front part of the next output byte, advancing the
2987 // output iterator in the process.
2988 new_byte = m_FastNcbi4naNcbi2na->m_Table[0][c1] |
2989 m_FastNcbi4naNcbi2na->m_Table[1][c2];
2990 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
2991 ++i_out;
2992 // Fill part of next byte only if it is necessary, i.e. when
2993 // rbit is not 0.
2994 if (rbit)
2995 (*i_out) = ((new_byte & kOneByteMask) << lbit);
2996
2997 // Increment input sequence iterator.
2998 i_in+=4;
2999 }
3000
3001 // Handle overhang at end of in_seq
3002 switch (uOverhang) {
3003 case 1:
3004 c1 =
3005 m_FastIupacnaNcbi4na->m_Table
3006 [0][static_cast<unsigned char>(*i_in)];
3007 if (blast_ambig)
3008 amb_context->AddAmbiguity(c1, out_seq_pos);
3009 rv = rg.GetRand() % 16;
3010 c1 &= m_Masks->m_Table[c1].cMask[rv];
3011 new_byte = m_FastNcbi4naNcbi2na->m_Table[0][c1] & 0xC0;
3012 break;
3013 case 2:
3014 c1 =
3015 m_FastIupacnaNcbi4na->m_Table
3016 [0][static_cast<unsigned char>(*i_in)] |
3017 m_FastIupacnaNcbi4na->m_Table
3018 [1][static_cast<unsigned char>(*(i_in+1))];
3019 if (blast_ambig)
3020 amb_context->AddAmbiguity(c1, out_seq_pos);
3021 rv = rg.GetRand() % 16;
3022 c1 &= m_Masks->m_Table[c1].cMask[rv];
3023 new_byte = m_FastNcbi4naNcbi2na->m_Table[0][c1] & 0xF0;
3024 break;
3025 case 3:
3026 c1 =
3027 m_FastIupacnaNcbi4na->m_Table
3028 [0][static_cast<unsigned char>(*i_in)] |
3029 m_FastIupacnaNcbi4na->m_Table
3030 [1][static_cast<unsigned char>(*(i_in+1))];
3031 c2 =
3032 m_FastIupacnaNcbi4na->m_Table
3033 [0][static_cast<unsigned char>(*(i_in+2))];
3034 if (blast_ambig) {
3035 amb_context->AddAmbiguity(c1, out_seq_pos);
3036 amb_context->AddAmbiguity(c2, out_seq_pos);
3037 }
3038 rv = rg.GetRand() % 16;
3039 c1 &= m_Masks->m_Table[c1].cMask[rv];
3040 rv = rg.GetRand() % 16;
3041 c2 &= m_Masks->m_Table[c2].cMask[rv];
3042 new_byte = (m_FastNcbi4naNcbi2na->m_Table[0][c1] & 0xF0) |
3043 (m_FastNcbi4naNcbi2na->m_Table[1][c2] & 0x0C);
3044 break;
3045 default:
3046 // This is a bogus assignment, just to suppress a
3047 // compiler warning. The value will not actually be
3048 // used (see the "uOverhang > 0" condition below)
3049 new_byte = 0;
3050 break;
3051 }
3052
3053 // Assign respective parts of the new byte to the remaining parts
3054 // of the output sequence. Output iterator only needs to be
3055 // incremented if the overhang is greater than the unfilled
3056 // remainder of the last output byte.
3057 if (uOverhang > 0) {
3058 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3059 if (2*uOverhang > lbit) {
3060 ++i_out;
3061 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3062 }
3063 }
3064
3065 if (blast_ambig)
3066 amb_context->Finish();
3067 }
3068 else
3069 {
3070 // Pack uLength input characters into out_seq_data
3071 for(i_out = i_out_begin; i_out != i_out_end; )
3072 {
3073 new_byte =
3074 m_FastIupacnaNcbi2na->m_Table
3075 [0][static_cast<unsigned char>(*(i_in))] |
3076 m_FastIupacnaNcbi2na->m_Table
3077 [1][static_cast<unsigned char>(*(i_in+1))] |
3078 m_FastIupacnaNcbi2na->m_Table
3079 [2][static_cast<unsigned char>(*(i_in+2))] |
3080 m_FastIupacnaNcbi2na->m_Table
3081 [3][static_cast<unsigned char>(*(i_in+3))];
3082 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3083 ++i_out;
3084 // Fill part of next byte only if it is necessary, i.e. when
3085 // rbit is not 0.
3086 if (rbit)
3087 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3088 i_in+=4;
3089 }
3090
3091 // Handle overhang
3092 if(uOverhang > 0) {
3093 new_byte = '\x00';
3094 for(TSeqPos i = 0; i < uOverhang; i++) {
3095 new_byte |=
3096 m_FastIupacnaNcbi2na->m_Table
3097 [i][static_cast<unsigned char>(*(i_in+i))];
3098 }
3099 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3100 if (2*uOverhang > lbit) {
3101 ++i_out;
3102 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3103 }
3104 }
3105 }
3106 return uLenSav;
3107 }
3108 /*
3109
3110 // Function to convert iupacna (2 bytes) to ncbi4na (1 byte)
3111 TSeqPos CSeqportUtil_implementation::MapIupacnaToNcbi4na
3112 (const CSeq_data& in_seq,
3113 CSeq_data* out_seq,
3114 TSeqPos uBeginIdx,
3115 TSeqPos uLength)
3116 const
3117 {
3118 // Get string holding the in_seq
3119 const string& in_seq_data = in_seq.GetIupacna().Get();
3120
3121 // Get vector where the out sequence will go
3122 out_seq->Reset();
3123 vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
3124
3125 // If uBeginIdx beyond end of in_seq, return
3126 if(uBeginIdx >= in_seq_data.size())
3127 return 0;
3128
3129 // Determine return value
3130 TSeqPos uLenSav = uLength;
3131 if((uLenSav == 0) || (uLenSav + uBeginIdx) > in_seq_data.size())
3132 uLenSav = in_seq_data.size() - uBeginIdx;
3133
3134 // Adjust uBeginIdx and uLength and get uOverhang
3135 TSeqPos uOverhang =
3136 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 2);
3137
3138 // Allocate vector memory for result of conversion
3139 // Note memory for overhang is added below.
3140 vector<char>::size_type nBytes = uLength/2;
3141 out_seq_data.resize(nBytes);
3142
3143 // Declare iterator for out_seq_data and determine begin and end
3144 vector<char>::iterator i_out;
3145 vector<char>::iterator i_out_begin = out_seq_data.begin();
3146 vector<char>::iterator i_out_end = i_out_begin + uLength/2;
3147
3148 // Determine begin of in_seq_data offset by 1
3149 string::const_iterator i_in = in_seq_data.begin() + uBeginIdx;
3150
3151 // Pack uLength input characters into out_seq_data
3152 for(i_out = i_out_begin; i_out != i_out_end; ++i_out) {
3153 (*i_out) =
3154 m_FastIupacnaNcbi4na->m_Table
3155 [0][static_cast<unsigned char>(*(i_in))] |
3156 m_FastIupacnaNcbi4na->m_Table
3157 [1][static_cast<unsigned char>(*(i_in+1))];
3158 i_in+=2;
3159 }
3160
3161 // Handle overhang
3162 char ch = '\x00';
3163 if (uOverhang > 0) {
3164 ch |=
3165 m_FastIupacnaNcbi4na->
3166 m_Table[0][static_cast<unsigned char>(*i_in)];
3167 out_seq_data.push_back(ch);
3168 }
3169
3170 return uLenSav;
3171 }
3172 */
3173
3174
3175 // Function to convert ncbi4na (2 bytes) to ncbi2na (1 byte)
MapNcbi4naToNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength,bool bAmbig,CRandom::TValue seed,TSeqPos total_length,TSeqPos * out_seq_length,vector<Uint4> * blast_ambig) const3176 TSeqPos CSeqportUtil_implementation::MapNcbi4naToNcbi2na(
3177 const CSeq_data& in_seq, CSeq_data* out_seq,
3178 TSeqPos uBeginIdx, TSeqPos uLength,
3179 bool bAmbig, CRandom::TValue seed,
3180 TSeqPos total_length,
3181 TSeqPos* out_seq_length, vector<Uint4>* blast_ambig)
3182 const
3183 {
3184 // Get vector holding the in_seq
3185 const vector<char>& in_seq_data = in_seq.GetNcbi4na().Get();
3186
3187 // Out sequence may contain unfinished byte from a previous segment.
3188 if (out_seq_length != nullptr && *out_seq_length == 0)
3189 out_seq->Reset();
3190 // Get vector where the out sequence will go
3191 vector<char>& out_seq_data = out_seq->SetNcbi2na().Set();
3192
3193 // Save uBeginIdx and uLength as they will be modified below
3194 TSeqPos uBeginSav = uBeginIdx;
3195 TSeqPos uLenSav = uLength;
3196
3197
3198 // Check that uBeginSav is not beyond end of in_seq
3199 if(uBeginSav >= 2*in_seq_data.size())
3200 return 0;
3201
3202 // Adjust uLenSav if needed
3203 if((uLenSav == 0) || ((uBeginSav + uLenSav) > 2*in_seq_data.size()))
3204 uLenSav = 2*in_seq_data.size() - uBeginSav;
3205
3206 // Adjust uBeginIdx and uLength and get uOverhang
3207 TSeqPos uOverhang =
3208 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 2, 4);
3209
3210 // Check if the output sequence data has already been filled
3211 // with some previous data, e.g. previous segment of a delta
3212 // sequence.
3213 TSeqPos out_seq_pos = 0;
3214 if (out_seq_length) {
3215 out_seq_pos = *out_seq_length;
3216 *out_seq_length += uLenSav;
3217 }
3218 TSeqPos rbit = 2*(out_seq_pos % 4);
3219 TSeqPos lbit = 8 - rbit;
3220
3221 // Allocate vector memory for result of conversion
3222 // Note memory for overhang is added below.
3223 vector<char>::size_type nBytes = (out_seq_pos + uLenSav + 3) / 4;
3224 out_seq_data.resize(nBytes);
3225
3226 // Instantiate an ambiguity context object, if BLAST-style
3227 // ambiguity output is requested.
3228 auto_ptr<CAmbiguityContext> amb_context(NULL);
3229 if (blast_ambig) {
3230 amb_context.reset(new CAmbiguityContext(*blast_ambig, total_length));
3231 }
3232
3233 // Declare iterator for out_seq_data and determine begin and end
3234 vector<char>::iterator i_out;
3235 vector<char>::iterator i_out_begin = out_seq_data.begin() + out_seq_pos/4;
3236 vector<char>::iterator i_out_end = i_out_begin + uLength/4;
3237
3238 // Make sure that the first byte of out_seq_data does not contain garbage
3239 // in the bits that are not yet supposed to be filled.
3240 *i_out_begin &= (0xff << lbit);
3241
3242 // Determine begin of in_seq_data
3243 vector<char>::const_iterator i_in = in_seq_data.begin() + uBeginIdx/2;
3244
3245 char new_byte;
3246 const int kOneByteMask = 0xff;
3247
3248 if(bAmbig) { // Do random disambiguation
3249 // Declare a random number generator and set seed
3250 CRandom rg;
3251 rg.SetSeed(seed);
3252
3253 // Pack uLength input bytes into out_seq_data
3254 for(i_out = i_out_begin; i_out != i_out_end; ) {
3255 // Disambiguate
3256 unsigned char c1 = static_cast<unsigned char>(*i_in);
3257 unsigned char c2 = static_cast<unsigned char>(*(i_in+1));
3258
3259 if (blast_ambig) {
3260 amb_context->AddAmbiguity(c1, out_seq_pos);
3261 amb_context->AddAmbiguity(c2, out_seq_pos);
3262 }
3263 CRandom::TValue rv = rg.GetRand() % 16;
3264 c1 &= m_Masks->m_Table[c1].cMask[rv];
3265 rv = rg.GetRand() % 16;
3266 c2 &= m_Masks->m_Table[c2].cMask[rv];
3267
3268 // Convert
3269 new_byte = m_FastNcbi4naNcbi2na->m_Table[0][c1] |
3270 m_FastNcbi4naNcbi2na->m_Table[1][c2];
3271 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3272 ++i_out;
3273 // Fill part of next byte only if it is necessary, i.e. when
3274 // rbit is not 0.
3275 if (rbit)
3276 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3277 i_in+=2;
3278 }
3279
3280 // Handle overhang
3281 new_byte = '\x00';
3282
3283 if(uOverhang > 0) {
3284 // Disambiguate
3285 unsigned char c1 = static_cast<unsigned char>(*i_in);
3286 // If only one residue, make sure that the second half of the byte
3287 // is 0.
3288 if (uOverhang == 1)
3289 c1 &= 0xf0;
3290 if (blast_ambig)
3291 amb_context->AddAmbiguity(c1, out_seq_pos);
3292 CRandom::TValue rv = rg.GetRand() % 16;
3293 c1 &= m_Masks->m_Table[c1].cMask[rv];
3294
3295 // Convert
3296 new_byte |= m_FastNcbi4naNcbi2na->m_Table[0][c1];
3297 }
3298
3299 if(uOverhang == 3) {
3300 // Disambiguate; make sure that the second half of the byte
3301 // is 0.
3302 unsigned char c1 = static_cast<unsigned char>(*(++i_in)) & 0xf0;
3303 if (blast_ambig)
3304 amb_context->AddAmbiguity(c1, out_seq_pos);
3305 CRandom::TValue rv = rg.GetRand() % 16;
3306 c1 &= m_Masks->m_Table[c1].cMask[rv];
3307
3308 // Convert
3309 new_byte |= m_FastNcbi4naNcbi2na->m_Table[1][c1];
3310 }
3311
3312 if(uOverhang > 0) {
3313 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3314 if (2*uOverhang > lbit) {
3315 ++i_out;
3316 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3317 }
3318 }
3319
3320 if (blast_ambig)
3321 amb_context->Finish();
3322 } else { // Do not do random disambiguation
3323
3324 // Pack uLength input bytes into out_seq_data
3325 for(i_out = i_out_begin; i_out != i_out_end; ) {
3326 new_byte =
3327 m_FastNcbi4naNcbi2na->m_Table
3328 [0][static_cast<unsigned char>(*i_in)] |
3329 m_FastNcbi4naNcbi2na->m_Table
3330 [1][static_cast<unsigned char>(*(i_in+1))];
3331 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3332 ++i_out;
3333 // Fill part of next byte only if it is necessary, i.e. when
3334 // rbit is not 0.
3335 if (rbit)
3336 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3337 i_in+=2;
3338 }
3339
3340 // Handle overhang
3341 if(uOverhang > 0) {
3342 new_byte = '\x00';
3343 new_byte |= m_FastNcbi4naNcbi2na->m_Table
3344 [0][static_cast<unsigned char>(*i_in)];
3345
3346 if(uOverhang == 3)
3347 new_byte |= m_FastNcbi4naNcbi2na->m_Table
3348 [1][static_cast<unsigned char>(*(++i_in))];
3349
3350 (*i_out) |= ((new_byte & kOneByteMask) >> rbit);
3351 if (2*uOverhang > lbit) {
3352 ++i_out;
3353 (*i_out) = ((new_byte & kOneByteMask) << lbit);
3354 }
3355 }
3356 }
3357
3358 TSeqPos keepidx = uBeginSav - uBeginIdx;
3359 KeepNcbi2na(out_seq, keepidx, uLenSav);
3360
3361 return uLenSav;
3362 }
3363
3364 /*
3365 // Function to convert iupacaa (byte) to ncbieaa (byte)
3366 TSeqPos CSeqportUtil_implementation::MapIupacaaToNcbieaa
3367 (const CSeq_data& in_seq,
3368 CSeq_data* out_seq,
3369 TSeqPos uBeginIdx,
3370 TSeqPos uLength)
3371 const
3372 {
3373 // Get read-only reference to in_seq data
3374 const string& in_seq_data = in_seq.GetIupacaa().Get();
3375
3376 // Get read & write reference to out_seq data
3377 out_seq->Reset();
3378 string& out_seq_data = out_seq->SetNcbieaa().Set();
3379
3380 // If uBeginIdx beyond end of in_seq, return
3381 if(uBeginIdx >= in_seq_data.size())
3382 return 0;
3383
3384 // Adjust uBeginIdx and uLength, if necessary
3385 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3386
3387 // Allocate memory for out_seq
3388 out_seq_data.resize(uLength);
3389
3390 // Declare iterator for out_seq_data
3391 string::iterator i_out = out_seq_data.begin();
3392
3393 // Declare iterator for in_seq_data and determine begin and end
3394 string::const_iterator i_in;
3395 string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3396 string::const_iterator i_in_end = i_in_begin + uLength;
3397
3398 // Loop through input and convert to output
3399 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3400 (*(i_out++)) =
3401 m_IupacaaNcbieaa->m_Table[static_cast<unsigned char>(*i_in)];
3402
3403 return uLength;
3404 }
3405
3406
3407 // Function to convert ncbieaa (byte) to iupacaa (byte)
3408 TSeqPos CSeqportUtil_implementation::MapNcbieaaToIupacaa
3409 (const CSeq_data& in_seq,
3410 CSeq_data* out_seq,
3411 TSeqPos uBeginIdx,
3412 TSeqPos uLength)
3413 const
3414 {
3415 // Get read-only reference to in_seq data
3416 const string& in_seq_data = in_seq.GetNcbieaa().Get();
3417
3418 // Get read & write reference to out_seq data
3419 out_seq->Reset();
3420 string& out_seq_data = out_seq->SetIupacaa().Set();
3421
3422 // If uBeginIdx beyond end of in_seq, return
3423 if(uBeginIdx >= in_seq_data.size())
3424 return 0;
3425
3426 // Adjust uBeginIdx and uLength, if necessary
3427 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3428
3429 // Allocate memory for out_seq
3430 out_seq_data.resize(uLength);
3431
3432 // Declare iterator for out_seq_data
3433 string::iterator i_out = out_seq_data.begin();
3434
3435 // Declare iterator for in_seq_data and determine begin and end
3436 string::const_iterator i_in;
3437 string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3438 string::const_iterator i_in_end = i_in_begin + uLength;
3439
3440 // Loop through input and convert to output
3441 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3442 (*(i_out++)) =
3443 m_NcbieaaIupacaa->m_Table[static_cast<unsigned char>(*i_in)];
3444
3445 return uLength;
3446 }
3447
3448
3449 // Function to convert iupacaa (byte) to ncbistdaa (byte)
3450 TSeqPos CSeqportUtil_implementation::MapIupacaaToNcbistdaa
3451 (const CSeq_data& in_seq,
3452 CSeq_data* out_seq,
3453 TSeqPos uBeginIdx,
3454 TSeqPos uLength)
3455 const
3456 {
3457 // Get read-only reference to in_seq data
3458 const string& in_seq_data = in_seq.GetIupacaa().Get();
3459
3460 // Get read & write reference to out_seq data
3461 out_seq->Reset();
3462 vector<char>& out_seq_data = out_seq->SetNcbistdaa().Set();
3463
3464 // If uBeginIdx beyond end of in_seq, return
3465 if(uBeginIdx >= in_seq_data.size())
3466 return 0;
3467
3468 // Adjust uBeginIdx and uLength, if necessary
3469 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3470
3471 // Allocate memory for out_seq
3472 out_seq_data.resize(uLength);
3473
3474 // Declare iterator for out_seq_data
3475 vector<char>::iterator i_out = out_seq_data.begin();
3476
3477 // Declare iterator for in_seq_data and determine begin and end
3478 string::const_iterator i_in;
3479 string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3480 string::const_iterator i_in_end = i_in_begin + uLength;
3481
3482 // Loop through input and convert to output
3483 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3484 (*(i_out++)) =
3485 m_IupacaaNcbistdaa->m_Table[static_cast<unsigned char>(*i_in)];
3486
3487 return uLength;
3488 }
3489
3490
3491
3492
3493
3494 // Function to convert ncbieaa (byte) to ncbistdaa (byte)
3495 TSeqPos CSeqportUtil_implementation::MapNcbieaaToNcbistdaa
3496 (const CSeq_data& in_seq,
3497 CSeq_data* out_seq,
3498 TSeqPos uBeginIdx,
3499 TSeqPos uLength)
3500 const
3501 {
3502 // Get read-only reference to in_seq data
3503 const string& in_seq_data = in_seq.GetNcbieaa().Get();
3504
3505 // Get read & write reference to out_seq data
3506 out_seq->Reset();
3507 vector<char>& out_seq_data = out_seq->SetNcbistdaa().Set();
3508
3509 // If uBeginIdx beyond end of in_seq, return
3510 if(uBeginIdx >= in_seq_data.size())
3511 return 0;
3512
3513 // Adjust uBeginIdx and uLength, if necessary
3514 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3515
3516 // Allocate memory for out_seq
3517 out_seq_data.resize(uLength);
3518
3519 // Declare iterator for out_seq_data
3520 vector<char>::iterator i_out = out_seq_data.begin();
3521
3522 // Declare iterator for in_seq_data and determine begin and end
3523 string::const_iterator i_in;
3524 string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3525 string::const_iterator i_in_end = i_in_begin + uLength;
3526
3527 // Loop through input and convert to output
3528 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3529 (*(i_out++)) =
3530 m_NcbieaaNcbistdaa->m_Table[static_cast<unsigned char>(*i_in)];
3531
3532 return uLength;
3533 }
3534
3535
3536 // Function to convert ncbistdaa (byte) to ncbieaa (byte)
3537 TSeqPos CSeqportUtil_implementation::MapNcbistdaaToNcbieaa
3538 (const CSeq_data& in_seq,
3539 CSeq_data* out_seq,
3540 TSeqPos uBeginIdx,
3541 TSeqPos uLength)
3542 const
3543 {
3544 // Get read-only reference to in_seq data
3545 const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
3546
3547 // Get read & write reference to out_seq data
3548 out_seq->Reset();
3549 string& out_seq_data = out_seq->SetNcbieaa().Set();
3550
3551 // If uBeginIdx beyond end of in_seq, return
3552 if(uBeginIdx >= in_seq_data.size())
3553 return 0;
3554
3555 // Adjust uBeginIdx and uLength if necessary
3556 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3557
3558 // Allocate memory for out_seq
3559 out_seq_data.resize(uLength);
3560
3561 // Get iterator for out_seq_data
3562 string::iterator i_out = out_seq_data.begin();
3563
3564 // Declare iterator for in_seq_data and determine begin and end
3565 vector<char>::const_iterator i_in;
3566 vector<char>::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3567 vector<char>::const_iterator i_in_end = i_in_begin + uLength;
3568
3569 // Loop through input and convert to output
3570 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3571 *(i_out++) =
3572 m_NcbistdaaNcbieaa->m_Table[static_cast<unsigned char>(*i_in)];
3573
3574 return uLength;
3575 }
3576
3577
3578 // Function to convert ncbistdaa (byte) to iupacaa (byte)
3579 TSeqPos CSeqportUtil_implementation::MapNcbistdaaToIupacaa
3580 (const CSeq_data& in_seq,
3581 CSeq_data* out_seq,
3582 TSeqPos uBeginIdx,
3583 TSeqPos uLength)
3584 const
3585 {
3586 // Get read-only reference to in_seq data
3587 const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
3588
3589 // Get read & write reference to out_seq data
3590 out_seq->Reset();
3591 string& out_seq_data = out_seq->SetIupacaa().Set();
3592
3593 // If uBeginIdx beyond end of in_seq, return
3594 if(uBeginIdx >= in_seq_data.size())
3595 return 0;
3596
3597 // Adjust uBeginIdx and uLength
3598 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3599
3600 // Allocate memory for out_seq
3601 out_seq_data.resize(uLength);
3602
3603 // Get iterator for out_seq_data
3604 string::iterator i_out = out_seq_data.begin();
3605
3606 // Declare iterator for in_seq_data and determine begin and end
3607 vector<char>::const_iterator i_in;
3608 vector<char>::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
3609 vector<char>::const_iterator i_in_end = i_in_begin + uLength;
3610
3611 // Loop through input and convert to output
3612 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3613 (*(i_out++)) =
3614 m_NcbistdaaIupacaa->m_Table[static_cast<unsigned char>(*i_in)];
3615
3616 return uLength;
3617 }
3618 */
3619
3620 // Fast validation of iupacna sequence
FastValidateIupacna(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3621 bool CSeqportUtil_implementation::FastValidateIupacna
3622 (const CSeq_data& in_seq,
3623 TSeqPos uBeginIdx,
3624 TSeqPos uLength)
3625 const
3626 {
3627 // Get read-only reference to in_seq data
3628 const string& in_seq_data = in_seq.GetIupacna().Get();
3629
3630 // Check that uBeginIdx is not beyond end of in_seq
3631 if(uBeginIdx >= in_seq_data.size())
3632 return true;
3633
3634 // Adjust uBeginIdx, uLength
3635 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3636
3637 // Declare in iterator on in_seq and determine begin and end
3638 string::const_iterator itor;
3639 string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3640 string::const_iterator e_itor = b_itor + uLength;
3641
3642 // Perform Fast Validation
3643 unsigned char ch = '\x00';
3644 for(itor = b_itor; itor != e_itor; ++itor)
3645 ch |= m_Iupacna->m_Table[static_cast<unsigned char>(*itor)];
3646
3647 // Return true if valid, otherwise false
3648 return (ch != 255);
3649 }
3650
3651
FastValidateNcbieaa(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3652 bool CSeqportUtil_implementation::FastValidateNcbieaa
3653 (const CSeq_data& in_seq,
3654 TSeqPos uBeginIdx,
3655 TSeqPos uLength)
3656 const
3657 {
3658 // Get read-only reference to in_seq data
3659 const string& in_seq_data = in_seq.GetNcbieaa().Get();
3660
3661 // Check that uBeginIdx is not beyond end of in_seq
3662 if(uBeginIdx >= in_seq_data.size())
3663 return true;
3664
3665 // Adjust uBeginIdx, uLength
3666 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3667
3668 // Declare in iterator on in_seq and determine begin and end
3669 string::const_iterator itor;
3670 string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3671 string::const_iterator e_itor = b_itor + uLength;
3672
3673 // Perform Fast Validation
3674 unsigned char ch = '\x00';
3675 for(itor = b_itor; itor != e_itor; ++itor)
3676 ch |= m_Ncbieaa->m_Table[static_cast<unsigned char>(*itor)];
3677
3678 // Return true if valid, otherwise false
3679 return (ch != 255);
3680
3681 }
3682
3683
FastValidateNcbistdaa(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3684 bool CSeqportUtil_implementation::FastValidateNcbistdaa
3685 (const CSeq_data& in_seq,
3686 TSeqPos uBeginIdx,
3687 TSeqPos uLength)
3688 const
3689 {
3690 // Get read-only reference to in_seq data
3691 const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
3692
3693 // Check that uBeginIdx is not beyond end of in_seq
3694 if(uBeginIdx >= in_seq_data.size())
3695 return true;
3696
3697 // Adjust uBeginIdx, uLength
3698 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3699
3700 // Declare in iterator on in_seq and determine begin and end
3701 vector<char>::const_iterator itor;
3702 vector<char>::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3703 vector<char>::const_iterator e_itor = b_itor + uLength;
3704
3705 // Perform Fast Validation
3706 unsigned char ch = '\x00';
3707 for(itor = b_itor; itor != e_itor; ++itor)
3708 ch |= m_Ncbistdaa->m_Table[static_cast<unsigned char>(*itor)];
3709
3710 // Return true if valid, otherwise false
3711 return (ch != 255);
3712
3713 }
3714
3715
FastValidateIupacaa(const CSeq_data & in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3716 bool CSeqportUtil_implementation::FastValidateIupacaa
3717 (const CSeq_data& in_seq,
3718 TSeqPos uBeginIdx,
3719 TSeqPos uLength)
3720 const
3721 {
3722 // Get read-only reference to in_seq data
3723 const string& in_seq_data = in_seq.GetIupacaa().Get();
3724
3725 // Check that uBeginIdx is not beyond end of in_seq
3726 if(uBeginIdx >= in_seq_data.size())
3727 return true;
3728
3729 // Adjust uBeginIdx, uLength
3730 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3731
3732 // Declare in iterator on in_seq and determine begin and end
3733 string::const_iterator itor;
3734 string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3735 string::const_iterator e_itor = b_itor + uLength;
3736
3737 // Perform Fast Validation
3738 unsigned char ch = '\x00';
3739 for(itor=b_itor; itor!=e_itor; ++itor)
3740 ch |= m_Iupacaa->m_Table[static_cast<unsigned char>(*itor)];
3741
3742 // Return true if valid, otherwise false
3743 return (ch != 255);
3744 }
3745
3746
ValidateIupacna(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const3747 void CSeqportUtil_implementation::ValidateIupacna
3748 (const CSeq_data& in_seq,
3749 vector<TSeqPos>* badIdx,
3750 TSeqPos uBeginIdx,
3751 TSeqPos uLength)
3752 const
3753 {
3754 // Get read-only reference to in_seq data
3755 const string& in_seq_data = in_seq.GetIupacna().Get();
3756
3757 // clear out_indices
3758 badIdx->clear();
3759
3760 // Check that uBeginIdx is not beyond end of in_seq
3761 if(uBeginIdx >= in_seq_data.size())
3762 return;
3763
3764 // Adjust uBeginIdx, uLength
3765 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3766
3767 // Declare in iterator on in_seq and determine begin and end
3768 string::const_iterator itor;
3769 string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3770 string::const_iterator e_itor = b_itor + uLength;
3771
3772 // Perform Validation
3773 TSeqPos nIdx = uBeginIdx;
3774 for(itor = b_itor; itor != e_itor; ++itor)
3775 if(m_Iupacna->m_Table[static_cast<unsigned char>(*itor)] == char(255))
3776 badIdx->push_back(nIdx++);
3777 else
3778 nIdx++;
3779
3780 // Return list of bad indices
3781 return;
3782 }
3783
3784
ValidateNcbieaa(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const3785 void CSeqportUtil_implementation::ValidateNcbieaa
3786 (const CSeq_data& in_seq,
3787 vector<TSeqPos>* badIdx,
3788 TSeqPos uBeginIdx,
3789 TSeqPos uLength)
3790 const
3791 {
3792 // Get read-only reference to in_seq data
3793 const string& in_seq_data = in_seq.GetNcbieaa().Get();
3794
3795 // clear badIdx
3796 badIdx->clear();
3797
3798 // Check that uBeginIdx is not beyond end of in_seq
3799 if(uBeginIdx >= in_seq_data.size())
3800 return;
3801
3802 // Adjust uBeginIdx, uLength
3803 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3804
3805 // Declare in iterator on in_seq and determine begin and end
3806 string::const_iterator itor;
3807 string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3808 string::const_iterator e_itor = b_itor + uLength;
3809
3810 // Perform Validation
3811 TSeqPos nIdx = uBeginIdx;
3812 for(itor = b_itor; itor != e_itor; ++itor)
3813 if(m_Ncbieaa->m_Table[static_cast<unsigned char>(*itor)] == char(255))
3814 badIdx->push_back(nIdx++);
3815 else
3816 nIdx++;
3817
3818 // Return vector of bad indices
3819 return;
3820 }
3821
3822
ValidateNcbistdaa(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const3823 void CSeqportUtil_implementation::ValidateNcbistdaa
3824 (const CSeq_data& in_seq,
3825 vector<TSeqPos>* badIdx,
3826 TSeqPos uBeginIdx,
3827 TSeqPos uLength)
3828 const
3829 {
3830 // Get read-only reference to in_seq data
3831 const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
3832
3833 // Create a vector to return
3834 badIdx->clear();
3835
3836 // Check that uBeginIdx is not beyond end of in_seq
3837 if(uBeginIdx >= in_seq_data.size())
3838 return;
3839
3840 // Adjust uBeginIdx, uLength
3841 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3842
3843 // Declare in iterator on in_seq and determine begin and end
3844 vector<char>::const_iterator itor;
3845 vector<char>::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3846 vector<char>::const_iterator e_itor = b_itor + uLength;
3847
3848 // Perform Validation
3849 TSeqPos nIdx = uBeginIdx;
3850 for(itor=b_itor; itor!=e_itor; ++itor)
3851 if(m_Ncbistdaa->m_Table[static_cast<unsigned char>(*itor)]==char(255))
3852 badIdx->push_back(nIdx++);
3853 else
3854 nIdx++;
3855
3856 // Return vector of bad indices
3857 return;
3858 }
3859
3860
ValidateIupacaa(const CSeq_data & in_seq,vector<TSeqPos> * badIdx,TSeqPos uBeginIdx,TSeqPos uLength) const3861 void CSeqportUtil_implementation::ValidateIupacaa
3862 (const CSeq_data& in_seq,
3863 vector<TSeqPos>* badIdx,
3864 TSeqPos uBeginIdx,
3865 TSeqPos uLength)
3866 const
3867 {
3868 // Get read-only reference to in_seq data
3869 const string& in_seq_data = in_seq.GetIupacaa().Get();
3870
3871 // Create a vector to return
3872 badIdx->clear();
3873
3874 // Check that uBeginIdx is not beyond end of in_seq
3875 if(uBeginIdx >= in_seq_data.size())
3876 return;
3877
3878 // Adjust uBeginIdx, uLength
3879 Adjust(&uBeginIdx, &uLength, in_seq_data.size(), 1, 1);
3880
3881 // Declare in iterator on in_seq and determine begin and end
3882 string::const_iterator itor;
3883 string::const_iterator b_itor = in_seq_data.begin() + uBeginIdx;
3884 string::const_iterator e_itor = b_itor + uLength;
3885
3886 // Perform Validation
3887 TSeqPos nIdx = uBeginIdx;
3888 for(itor=b_itor; itor!=e_itor; ++itor)
3889 if(m_Iupacaa->m_Table[static_cast<unsigned char>(*itor)] == char(255))
3890 badIdx->push_back(nIdx++);
3891 else
3892 nIdx++;
3893
3894 // Return vector of bad indices
3895 return;
3896 }
3897
3898
3899 // Function to make copy of ncbi2na type sequences
GetNcbi2naCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3900 TSeqPos CSeqportUtil_implementation::GetNcbi2naCopy
3901 (const CSeq_data& in_seq,
3902 CSeq_data* out_seq,
3903 TSeqPos uBeginIdx,
3904 TSeqPos uLength)
3905 const
3906 {
3907 // Get reference to out_seq data
3908 out_seq->Reset();
3909 vector<char>& out_seq_data = out_seq->SetNcbi2na().Set();
3910
3911 // Get reference to in_seq data
3912 const vector<char>& in_seq_data = in_seq.GetNcbi2na().Get();
3913
3914 // Return if uBeginIdx is after end of in_seq
3915 if(uBeginIdx >= 4 * in_seq_data.size())
3916 return 0;
3917
3918 // Set uLength to actual valid length in out_seq
3919 if( (uLength ==0) || ((uBeginIdx + uLength) > (4*in_seq_data.size() )) )
3920 uLength = 4*in_seq_data.size() - uBeginIdx;
3921
3922 // Allocate memory for out_seq data
3923 if((uLength % 4) == 0)
3924 out_seq_data.resize(uLength/4);
3925 else
3926 out_seq_data.resize(uLength/4 + 1);
3927
3928 // Get iterator on out_seq_data
3929 vector<char>::iterator i_out = out_seq_data.begin() - 1;
3930
3931 // Calculate amounts to shift bits
3932 unsigned int lShift, rShift;
3933 lShift = 2*(uBeginIdx % 4);
3934 rShift = 8 - lShift;
3935
3936 // Get interators on in_seq
3937 vector<char>::const_iterator i_in;
3938 vector<char>::const_iterator i_in_begin =
3939 in_seq_data.begin() + uBeginIdx/4;
3940
3941 // Determine number of input bytes to process
3942 SIZE_TYPE uNumBytes = uLength/4;
3943 if((uLength % 4) != 0)
3944 ++uNumBytes;
3945
3946 // Prevent access beyond end of in_seq_data
3947 bool bDoLastByte = false;
3948 if((uBeginIdx/4 + uNumBytes) >= in_seq_data.size())
3949 {
3950 uNumBytes = in_seq_data.size() - uBeginIdx/4 - 1;
3951 bDoLastByte = true;
3952 }
3953 vector<char>::const_iterator i_in_end = i_in_begin + uNumBytes;
3954
3955 // Loop through input sequence and copy to output sequence
3956 if(lShift > 0)
3957 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3958 (*(++i_out)) =
3959 ((*i_in) << lShift) | (((*(i_in+1)) & 255) >> rShift);
3960 else
3961 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
3962 (*(++i_out)) = (*i_in);
3963
3964 // Handle last input byte if necessary
3965 if(bDoLastByte)
3966 (*(++i_out)) = (*i_in) << lShift;
3967
3968 return uLength;
3969 }
3970
3971
3972 // Function to make copy of ncbi4na type sequences
GetNcbi4naCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const3973 TSeqPos CSeqportUtil_implementation::GetNcbi4naCopy
3974 (const CSeq_data& in_seq,
3975 CSeq_data* out_seq,
3976 TSeqPos uBeginIdx,
3977 TSeqPos uLength)
3978 const
3979 {
3980 // Get reference to out_seq data
3981 out_seq->Reset();
3982 vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
3983
3984 // Get reference to in_seq data
3985 const vector<char>& in_seq_data = in_seq.GetNcbi4na().Get();
3986
3987 // Return if uBeginIdx is after end of in_seq
3988 if(uBeginIdx >= 2 * in_seq_data.size())
3989 return 0;
3990
3991 // Set uLength to actual valid length in out_seq
3992 if( (uLength ==0) || ((uBeginIdx + uLength) > (2*in_seq_data.size() )) )
3993 uLength = 2*in_seq_data.size() - uBeginIdx;
3994
3995 // Allocate memory for out_seq data
3996 if((uLength % 2) == 0)
3997 out_seq_data.resize(uLength/2);
3998 else
3999 out_seq_data.resize(uLength/2 + 1);
4000
4001
4002 // Get iterator on out_seq_data
4003 vector<char>::iterator i_out = out_seq_data.begin() - 1;
4004
4005 // Calculate amounts to shift bits
4006 unsigned int lShift, rShift;
4007 lShift = 4*(uBeginIdx % 2);
4008 rShift = 8 - lShift;
4009
4010 // Get interators on in_seq
4011 vector<char>::const_iterator i_in;
4012 vector<char>::const_iterator i_in_begin =
4013 in_seq_data.begin() + uBeginIdx/2;
4014
4015 // Determine number of input bytes to process
4016 SIZE_TYPE uNumBytes = uLength/2;
4017 if((uLength % 2) != 0)
4018 ++uNumBytes;
4019
4020 // Prevent access beyond end of in_seq_data
4021 bool bDoLastByte = false;
4022 if((uBeginIdx/2 + uNumBytes) >= in_seq_data.size())
4023 {
4024 uNumBytes = in_seq_data.size() - uBeginIdx/2 - 1;
4025 bDoLastByte = true;
4026 }
4027 vector<char>::const_iterator i_in_end = i_in_begin + uNumBytes;
4028
4029 // Loop through input sequence and copy to output sequence
4030 if(lShift > 0)
4031 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4032 (*(++i_out)) =
4033 ((*i_in) << lShift) | (((*(i_in+1)) & 255) >> rShift);
4034 else
4035 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4036 (*(++i_out)) = (*i_in);
4037
4038 // Handle last input byte
4039 if(bDoLastByte)
4040 (*(++i_out)) = (*i_in) << lShift;
4041
4042 return uLength;
4043 }
4044
4045
4046 // Function to make copy of iupacna type sequences
GetIupacnaCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4047 TSeqPos CSeqportUtil_implementation::GetIupacnaCopy
4048 (const CSeq_data& in_seq,
4049 CSeq_data* out_seq,
4050 TSeqPos uBeginIdx,
4051 TSeqPos uLength)
4052 const
4053 {
4054 // Get reference to out_seq data
4055 out_seq->Reset();
4056 string& out_seq_data = out_seq->SetIupacna().Set();
4057
4058 // Get reference to in_seq data
4059 const string& in_seq_data = in_seq.GetIupacna().Get();
4060
4061 // Return if uBeginIdx is after end of in_seq
4062 if(uBeginIdx >= in_seq_data.size())
4063 return 0;
4064
4065 // Set uLength to actual valid length in out_seq
4066 if( (uLength ==0) || ((uBeginIdx + uLength) > (in_seq_data.size() )) )
4067 uLength = in_seq_data.size() - uBeginIdx;
4068
4069 // Allocate memory for out_seq data
4070 out_seq_data.resize(uLength);
4071
4072 // Get iterator on out_seq_data
4073 string::iterator i_out = out_seq_data.begin() - 1;
4074
4075 // Get interators on in_seq
4076 string::const_iterator i_in;
4077 string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4078 string::const_iterator i_in_end = i_in_begin + uLength;
4079
4080 // Loop through input sequence and copy to output sequence
4081 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4082 (*(++i_out)) = (*i_in);
4083
4084 return uLength;
4085 }
4086
4087
4088 // Function to make copy of ncbieaa type sequences
GetNcbieaaCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4089 TSeqPos CSeqportUtil_implementation::GetNcbieaaCopy
4090 (const CSeq_data& in_seq,
4091 CSeq_data* out_seq,
4092 TSeqPos uBeginIdx,
4093 TSeqPos uLength)
4094 const
4095 {
4096 // Get reference to out_seq data
4097 out_seq->Reset();
4098 string& out_seq_data = out_seq->SetNcbieaa().Set();
4099
4100 // Get reference to in_seq data
4101 const string& in_seq_data = in_seq.GetNcbieaa().Get();
4102
4103 // Return if uBeginIdx is after end of in_seq
4104 if(uBeginIdx >= in_seq_data.size())
4105 return 0;
4106
4107 // Set uLength to actual valid length in out_seq
4108 if( (uLength ==0) || ((uBeginIdx + uLength) > (in_seq_data.size() )) )
4109 uLength = in_seq_data.size() - uBeginIdx;
4110
4111 // Allocate memory for out_seq data
4112 out_seq_data.resize(uLength);
4113
4114 // Get iterator on out_seq_data
4115 string::iterator i_out = out_seq_data.begin() - 1;
4116
4117 // Get interators on in_seq
4118 string::const_iterator i_in;
4119 string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4120 string::const_iterator i_in_end = i_in_begin + uLength;
4121
4122 // Loop through input sequence and copy to output sequence
4123 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4124 (*(++i_out)) = (*i_in);
4125
4126 return uLength;
4127 }
4128
4129
4130 // Function to make copy of ncbistdaa type sequences
GetNcbistdaaCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4131 TSeqPos CSeqportUtil_implementation::GetNcbistdaaCopy
4132 (const CSeq_data& in_seq,
4133 CSeq_data* out_seq,
4134 TSeqPos uBeginIdx,
4135 TSeqPos uLength)
4136 const
4137 {
4138 // Get reference to out_seq data
4139 out_seq->Reset();
4140 vector<char>& out_seq_data = out_seq->SetNcbistdaa().Set();
4141
4142 // Get reference to in_seq data
4143 const vector<char>& in_seq_data = in_seq.GetNcbistdaa().Get();
4144
4145 // Return if uBeginIdx is after end of in_seq
4146 if(uBeginIdx >= in_seq_data.size())
4147 return 0;
4148
4149 // Set uLength to actual valid length in out_seq
4150 if( (uLength ==0) || ((uBeginIdx + uLength) > (in_seq_data.size() )) )
4151 uLength = in_seq_data.size() - uBeginIdx;
4152
4153 // Allocate memory for out_seq data
4154 out_seq_data.resize(uLength);
4155
4156 // Get iterator on out_seq_data
4157 vector<char>::iterator i_out = out_seq_data.begin() - 1;
4158
4159 // Get interators on in_seq
4160 vector<char>::const_iterator i_in;
4161 vector<char>::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4162 vector<char>::const_iterator i_in_end = i_in_begin + uLength;
4163
4164 // Loop through input sequence and copy to output sequence
4165 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4166 (*(++i_out)) = (*i_in);
4167
4168 return uLength;
4169 }
4170
4171
4172 // Function to make copy of iupacaa type sequences
GetIupacaaCopy(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4173 TSeqPos CSeqportUtil_implementation::GetIupacaaCopy
4174 (const CSeq_data& in_seq,
4175 CSeq_data* out_seq,
4176 TSeqPos uBeginIdx,
4177 TSeqPos uLength)
4178 const
4179 {
4180 // Get reference to out_seq data
4181 out_seq->Reset();
4182 string& out_seq_data = out_seq->SetIupacaa().Set();
4183
4184 // Get reference to in_seq data
4185 const string& in_seq_data = in_seq.GetIupacaa().Get();
4186
4187 // Return if uBeginIdx is after end of in_seq
4188 if(uBeginIdx >= in_seq_data.size())
4189 return 0;
4190
4191 // Set uLength to actual valid length in out_seq
4192 if( (uLength ==0) || ((uBeginIdx + uLength) > (in_seq_data.size() )) )
4193 uLength = in_seq_data.size() - uBeginIdx;
4194
4195 // Allocate memory for out_seq data
4196 out_seq_data.resize(uLength);
4197
4198 // Get iterator on out_seq_data
4199 string::iterator i_out = out_seq_data.begin() - 1;
4200
4201 // Get interators on in_seq
4202 string::const_iterator i_in;
4203 string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4204 string::const_iterator i_in_end = i_in_begin + uLength;
4205
4206 // Loop through input sequence and copy to output sequence
4207 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4208 (*(++i_out)) = (*i_in);
4209
4210 return uLength;
4211 }
4212
4213
4214 // Function to adjust uBeginIdx to lie on an in_seq byte boundary
4215 // and uLength to lie on on an out_seq byte boundary. Returns
4216 // overhang
Adjust(TSeqPos * uBeginIdx,TSeqPos * uLength,TSeqPos uInSeqBytes,TSeqPos uInSeqsPerByte,TSeqPos uOutSeqsPerByte) const4217 TSeqPos CSeqportUtil_implementation::Adjust
4218 (TSeqPos* uBeginIdx,
4219 TSeqPos* uLength,
4220 TSeqPos uInSeqBytes,
4221 TSeqPos uInSeqsPerByte,
4222 TSeqPos uOutSeqsPerByte)
4223 const
4224 {
4225 // Adjust uBeginIdx and uLength to acceptable values
4226
4227 // If uLength = 0, assume convert to end of sequence
4228 if(*uLength == 0)
4229 *uLength = uInSeqsPerByte * uInSeqBytes;
4230
4231 // Ensure that uBeginIdx does not start at or after end of in_seq_data
4232 if(*uBeginIdx >= uInSeqsPerByte * uInSeqBytes)
4233 *uBeginIdx = uInSeqsPerByte * uInSeqBytes - uInSeqsPerByte;
4234
4235 // Ensure that uBeginIdx is a multiple of uInSeqsPerByte and adjust uLength
4236 *uLength += *uBeginIdx % uInSeqsPerByte;
4237 *uBeginIdx = uInSeqsPerByte * (*uBeginIdx/uInSeqsPerByte);
4238
4239 // Adjust uLength so as not to go beyond end of in_seq_data
4240 if(*uLength > uInSeqsPerByte * uInSeqBytes - *uBeginIdx)
4241 *uLength = uInSeqsPerByte * uInSeqBytes - *uBeginIdx;
4242
4243 // Adjust uLength down to multiple of uOutSeqsPerByte
4244 // and calculate overhang (overhang handled separately at end)
4245 TSeqPos uOverhang = *uLength % uOutSeqsPerByte;
4246 *uLength = uOutSeqsPerByte * (*uLength / uOutSeqsPerByte);
4247
4248 return uOverhang;
4249
4250 }
4251
4252
4253 // Loops through an ncbi4na input sequence and determines
4254 // the ambiguities that would result from conversion to an ncbi2na sequence
4255 // On return, out_seq contains the ncbi4na bases that become ambiguous and
4256 // out_indices contains the indices of the abiguous bases in in_seq
GetAmbigs_ncbi4na_ncbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,vector<TSeqPos> * out_indices,TSeqPos uBeginIdx,TSeqPos uLength) const4257 TSeqPos CSeqportUtil_implementation::GetAmbigs_ncbi4na_ncbi2na
4258 (const CSeq_data& in_seq,
4259 CSeq_data* out_seq,
4260 vector<TSeqPos>* out_indices,
4261 TSeqPos uBeginIdx,
4262 TSeqPos uLength)
4263 const
4264 {
4265 // Get read-only reference to in_seq data
4266 const vector<char>& in_seq_data = in_seq.GetNcbi4na().Get();
4267
4268 // Get read & write reference to out_seq data
4269 out_seq->Reset();
4270 vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
4271
4272 // Adjust uBeginIdx and uLength, if necessary
4273 if(uBeginIdx >= 2*in_seq_data.size())
4274 return 0;
4275
4276 if((uLength == 0) || (((uBeginIdx + uLength) > 2*in_seq_data.size())))
4277 uLength = 2*in_seq_data.size() - uBeginIdx;
4278
4279 // Save uBeginIdx and adjust uBeginIdx = 0 mod 2
4280 TSeqPos uBeginSav = uBeginIdx;
4281 TSeqPos uLenSav = uLength;
4282 uLength += uBeginIdx % 2;
4283 uBeginIdx = 2*(uBeginIdx/2);
4284
4285 // Allocate memory for out_seq_data and out_indices
4286 // Note, these will be shrunk at the end to correspond
4287 // to actual memory needed. Note, in test cases, over 50% of the
4288 // time spent in this method is spent in the next two
4289 // statements and 3/4 of that is spent in the second statement.
4290 out_seq_data.resize(uLength/2 + (uLength % 2));
4291 out_indices->resize(uLength);
4292
4293 // Variable to track number of ambigs
4294 TSeqPos uNumAmbigs = 0;
4295
4296 // Get iterators to input sequence
4297 vector<char>::const_iterator i_in;
4298 vector<char>::const_iterator i_in_begin =
4299 in_seq_data.begin() + uBeginIdx/2;
4300 vector<char>::const_iterator i_in_end =
4301 i_in_begin + uLength/2 + (uLength % 2);
4302
4303 // Get iterators to out_seq_data and out_indices
4304 vector<char>::iterator i_out_seq = out_seq_data.begin();
4305 vector<TSeqPos>::iterator i_out_idx = out_indices->begin();
4306
4307 // Index of current input seq base
4308 TSeqPos uIdx = uBeginIdx;
4309
4310 // Loop through input sequence looking for ambiguities
4311 for(i_in = i_in_begin; i_in != i_in_end; ++i_in) {
4312 switch (m_DetectAmbigNcbi4naNcbi2na->m_Table
4313 [static_cast<unsigned char>(*i_in)]) {
4314
4315 case 1: // Low order input nible ambiguous
4316
4317 // Put low order input nible in low order output nible
4318 if(uNumAmbigs & 1)
4319 {
4320 (*i_out_seq) |= (*i_in) & '\x0f';
4321 ++i_out_seq;
4322 }
4323
4324 // Put low order input nible in high order output nible
4325 else
4326 (*i_out_seq) = (*i_in) << 4;
4327
4328 // Record input index that was ambiguous
4329 (*i_out_idx) = uIdx + 1;
4330 ++i_out_idx;
4331
4332 // Increment number of ambiguities
4333 uNumAmbigs++;
4334 break;
4335
4336 case 2: // High order input nible ambiguous
4337
4338 // Put high order input nible in low order output nible
4339 if(uNumAmbigs & 1)
4340 {
4341 (*i_out_seq) |= ((*i_in) >> 4) & '\x0f';
4342 ++i_out_seq;
4343 }
4344
4345 // Put high order input nible in high order output nible
4346 else
4347 (*i_out_seq) = (*i_in) & '\xf0';
4348
4349 // Record input index that was ambiguous
4350 (*i_out_idx) = uIdx;
4351 ++i_out_idx;
4352
4353 // Increment number of ambiguities
4354 uNumAmbigs++;
4355 break;
4356
4357 case 3: // Both input nibles ambiguous
4358
4359 // Put high order input nible in low order
4360 // output nible, move to the next output byte
4361 // and put the low order input nibble in the
4362 // high order output nible.
4363 if(uNumAmbigs & 1)
4364 {
4365 (*i_out_seq) |= ((*i_in) >> 4) & '\x0f';
4366 (*(++i_out_seq)) = (*i_in) << 4;
4367 }
4368
4369 // Put high order input nible in high order
4370 // output nible, put low order input nible
4371 // in low order output nible, and move to
4372 // next output byte
4373 else
4374 {
4375 (*i_out_seq) = (*i_in);
4376 ++i_out_seq;
4377 }
4378
4379 // Record indices that were ambiguous
4380 (*i_out_idx) = uIdx;
4381 (*(++i_out_idx)) = uIdx + 1;
4382 ++i_out_idx;
4383
4384 // Increment the number of ambiguities
4385 uNumAmbigs+=2;
4386 break;
4387 }
4388
4389 // Increment next input byte.
4390 uIdx += 2;
4391 }
4392
4393 // Shrink out_seq_data and out_indices to actual sizes needed
4394 out_indices->resize(uNumAmbigs);
4395 out_seq_data.resize(uNumAmbigs/2 + uNumAmbigs % 2);
4396
4397 // Check to ensure that ambigs outside of requested range are not included
4398 TSeqPos uKeepBeg = 0;
4399 TSeqPos uKeepLen = 0;
4400 if((*out_indices)[0] < uBeginSav)
4401 {
4402 uKeepBeg = 1;
4403 out_indices->erase(out_indices->begin(), out_indices->begin() + 1);
4404 }
4405
4406 if((*out_indices)[out_indices->size()-1] >= uBeginSav + uLenSav)
4407 {
4408 out_indices->pop_back();
4409 uKeepLen = out_indices->size();
4410 }
4411
4412 if((uKeepBeg != 0) || (uKeepLen != 0))
4413 uNumAmbigs = KeepNcbi4na(out_seq, uKeepBeg, uKeepLen);
4414
4415 return uNumAmbigs;
4416 }
4417
4418
4419 // Loops through an iupacna input sequence and determines
4420 // the ambiguities that would result from conversion to an ncbi2na sequence.
4421 // On return, out_seq contains the iupacna bases that become ambiguous and
4422 // out_indices contains the indices of the abiguous bases in in_seq. The
4423 // return is the number of ambiguities found.
GetAmbigs_iupacna_ncbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,vector<TSeqPos> * out_indices,TSeqPos uBeginIdx,TSeqPos uLength) const4424 TSeqPos CSeqportUtil_implementation::GetAmbigs_iupacna_ncbi2na
4425 (const CSeq_data& in_seq,
4426 CSeq_data* out_seq,
4427 vector<TSeqPos>* out_indices,
4428 TSeqPos uBeginIdx,
4429 TSeqPos uLength)
4430 const
4431 {
4432 // Get read-only reference to in_seq data
4433 const string& in_seq_data = in_seq.GetIupacna().Get();
4434
4435 // Get read & write reference to out_seq data
4436 out_seq->Reset();
4437 string& out_seq_data = out_seq->SetIupacna().Set();
4438
4439 // Validate/adjust uBeginIdx and uLength
4440 if(uBeginIdx >= in_seq_data.size())
4441 return 0;
4442
4443 if((uLength == 0) || ((uBeginIdx + uLength) > in_seq_data.size()))
4444 uLength = in_seq_data.size() - uBeginIdx;
4445
4446 // Allocate memory for out_seq_data and out_indices
4447 // Note, these will be shrunk at the end to correspond
4448 // to actual memory needed.
4449 out_seq_data.resize(uLength);
4450 out_indices->resize(uLength);
4451
4452 // Variable to track number of ambigs
4453 TSeqPos uNumAmbigs = 0;
4454
4455 // Get iterators to input sequence
4456 string::const_iterator i_in;
4457 string::const_iterator i_in_begin = in_seq_data.begin() + uBeginIdx;
4458 string::const_iterator i_in_end = i_in_begin + uLength;
4459
4460 // Get iterators to out_seq_data and out_indices
4461 string::iterator i_out_seq = out_seq_data.begin();
4462 vector<TSeqPos>::iterator i_out_idx = out_indices->begin();
4463
4464 // Index of current input seq base
4465 TSeqPos uIdx = uBeginIdx;
4466
4467 // Loop through input sequence looking for ambiguities
4468 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
4469 {
4470 if(m_DetectAmbigIupacnaNcbi2na->m_Table
4471 [static_cast<unsigned char>(*i_in)] == 1)
4472 {
4473 (*i_out_seq) = (*i_in);
4474 ++i_out_seq;
4475 (*i_out_idx) = uIdx;
4476 ++i_out_idx;
4477 ++uNumAmbigs;
4478 }
4479
4480 ++uIdx;
4481 }
4482
4483 out_seq_data.resize(uNumAmbigs);
4484 out_indices->resize(uNumAmbigs);
4485
4486 return uNumAmbigs;
4487 }
4488
4489
4490 // Method to implement Keep for Ncbi2na. Returns length of
4491 // kept sequence
KeepNcbi2na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4492 TSeqPos CSeqportUtil_implementation::KeepNcbi2na
4493 (CSeq_data* in_seq,
4494 TSeqPos uBeginIdx,
4495 TSeqPos uLength)
4496 const
4497 {
4498 // Get a reference to in_seq
4499 vector<char>& in_seq_data = in_seq->SetNcbi2na().Set();
4500
4501 // If uBeginIdx past the end of in_seq, return empty in_seq
4502 if(uBeginIdx >= in_seq_data.size()*4)
4503 {
4504 in_seq_data.clear();
4505 return 0;
4506 }
4507
4508 // If uLength == 0, Keep from uBeginIdx to end of in_seq
4509 if(uLength == 0)
4510 uLength = 4*in_seq_data.size() - uBeginIdx;
4511
4512
4513 // If uLength goes beyond the end of the sequence, trim
4514 // it back to the end of the sequence
4515 if(uLength > (4*in_seq_data.size() - uBeginIdx))
4516 uLength = 4*in_seq_data.size() - uBeginIdx;
4517
4518 // If entire sequence is being requested, just return
4519 if((uBeginIdx == 0) && (uLength >= 4*in_seq_data.size()))
4520 return uLength;
4521
4522 // Determine index in in_seq_data that holds uBeginIdx residue
4523 TSeqPos uStart = uBeginIdx/4;
4524
4525 // Determine index within start byte
4526 TSeqPos uStartInByte = 2 * (uBeginIdx % 4);
4527
4528 // Calculate masks
4529 unsigned char rightMask = 0xff << uStartInByte;
4530 unsigned char leftMask = ~rightMask;
4531
4532 // Determine index in in_seq_data that holds uBeginIdx + uLength
4533 // residue
4534 TSeqPos uEnd = (uBeginIdx + uLength - 1)/4;
4535
4536 // Get iterator for writting
4537 vector<char>::iterator i_write;
4538
4539 // Determine begin and end of read
4540 vector<char>::iterator i_read = in_seq_data.begin() + uStart;
4541 vector<char>::iterator i_read_end = in_seq_data.begin() + uEnd;
4542
4543 // Loop through in_seq_data and copy data of desire
4544 // sub sequence to begining of in_seq_data
4545 for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4546 (*i_write) = (((*i_read) << uStartInByte) | leftMask) &
4547 (((*(i_read+1)) >> (8-uStartInByte)) | rightMask);
4548 ++i_read;
4549 }
4550
4551 // Handle last byte
4552 (*i_write) = (*i_read) << uStartInByte;
4553
4554 // Shrink in_seq to to size needed
4555 TSeqPos uSize = uLength/4;
4556 if((uLength % 4) != 0)
4557 uSize++;
4558 in_seq_data.resize(uSize);
4559
4560 return uLength;
4561 }
4562
4563
4564 // Method to implement Keep for Ncbi4na. Returns length of
4565 // kept sequence.
KeepNcbi4na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4566 TSeqPos CSeqportUtil_implementation::KeepNcbi4na
4567 (CSeq_data* in_seq,
4568 TSeqPos uBeginIdx,
4569 TSeqPos uLength)
4570 const
4571 {
4572 // Get a reference to in_seq
4573 vector<char>& in_seq_data = in_seq->SetNcbi4na().Set();
4574
4575 // If uBeginIdx past the end of in_seq, return empty in_seq
4576 if(uBeginIdx >= in_seq_data.size()*2)
4577 {
4578 in_seq_data.clear();
4579 return 0;
4580 }
4581
4582 // If uLength == 0, Keep from uBeginIdx to end of in_seq
4583 if(uLength == 0)
4584 uLength = 2*in_seq_data.size() - uBeginIdx;
4585
4586
4587 // If uLength goes beyond the end of the sequence, trim
4588 // it back to the end of the sequence
4589 if(uLength > (2*in_seq_data.size() - uBeginIdx))
4590 uLength = 2*in_seq_data.size() - uBeginIdx;
4591
4592 // If entire sequence is being requested, just return
4593 if((uBeginIdx == 0) && (uLength >= 2*in_seq_data.size()))
4594 return uLength;
4595
4596 // Determine index in in_seq_data that holds uBeginIdx residue
4597 TSeqPos uStart = uBeginIdx/2;
4598
4599 // Determine index within start byte
4600 unsigned int uStartInByte = 4 * (uBeginIdx % 2);
4601
4602 // Calculate masks
4603 unsigned char rightMask = 0xff << uStartInByte;
4604 unsigned char leftMask = ~rightMask;
4605
4606 // Determine index in in_seq_data that holds uBeginIdx + uLength
4607 // residue
4608 TSeqPos uEnd = (uBeginIdx + uLength - 1)/2;
4609
4610 // Get iterator for writting
4611 vector<char>::iterator i_write;
4612
4613 // Determine begin and end of read
4614 vector<char>::iterator i_read = in_seq_data.begin() + uStart;
4615 vector<char>::iterator i_read_end = in_seq_data.begin() + uEnd;
4616
4617 // Loop through in_seq_data and copy data of desire
4618 // sub sequence to begining of in_seq_data
4619 for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4620 (*i_write) = (((*i_read) << uStartInByte) | leftMask) &
4621 (((*(i_read+1)) >> (8-uStartInByte)) | rightMask);
4622 ++i_read;
4623 }
4624
4625 // Handle last byte
4626 (*i_write) = (*i_read) << uStartInByte;
4627
4628 // Shrink in_seq to to size needed
4629 TSeqPos uSize = uLength/2;
4630 if((uLength % 2) != 0)
4631 uSize++;
4632 in_seq_data.resize(uSize);
4633
4634 return uLength;
4635 }
4636
4637
4638 // Method to implement Keep for Iupacna. Return length
4639 // of kept sequence
KeepIupacna(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4640 TSeqPos CSeqportUtil_implementation::KeepIupacna
4641 (CSeq_data* in_seq,
4642 TSeqPos uBeginIdx,
4643 TSeqPos uLength)
4644 const
4645 {
4646 // Get a reference to in_seq
4647 string& in_seq_data = in_seq->SetIupacna().Set();
4648
4649
4650 // If uBeginIdx past end of in_seq, return empty in_seq
4651 if(uBeginIdx >= in_seq_data.size())
4652 {
4653 in_seq_data.erase();
4654 return 0;
4655 }
4656
4657 // If uLength is 0, Keep from uBeginIdx to end of in_seq
4658 if(uLength == 0)
4659 uLength = in_seq_data.size() - uBeginIdx;
4660
4661 // Check that uLength does not go beyond end of in_seq
4662 if((uBeginIdx + uLength) > in_seq_data.size())
4663 uLength = in_seq_data.size() - uBeginIdx;
4664
4665 // If uBeginIdx == 0 and uLength == in_seq_data.size()
4666 // just return as the entire sequence is being requested
4667 if((uBeginIdx == 0) && (uLength >= in_seq_data.size()))
4668 return uLength;
4669
4670 // Get two iterators on in_seq, one read and one write
4671 string::iterator i_read;
4672 string::iterator i_write;
4673
4674 // Determine begin and end of read
4675 i_read = in_seq_data.begin() + uBeginIdx;
4676 string::iterator i_read_end = i_read + uLength;
4677
4678 // Loop through in_seq for uLength bases
4679 // and shift uBeginIdx to beginning
4680 for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write)
4681 {
4682 (*i_write) = (*i_read);
4683 ++i_read;
4684 }
4685
4686 // Resize in_seq_data to uLength
4687 in_seq_data.resize(uLength);
4688
4689 return uLength;
4690 }
4691
4692
4693 // Method to implement Keep for Ncbieaa
KeepNcbieaa(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4694 TSeqPos CSeqportUtil_implementation::KeepNcbieaa
4695 (CSeq_data* in_seq,
4696 TSeqPos uBeginIdx,
4697 TSeqPos uLength)
4698 const
4699 {
4700 // Get a reference to in_seq
4701 string& in_seq_data = in_seq->SetNcbieaa().Set();
4702
4703
4704 // If uBeginIdx past end of in_seq, return empty in_seq
4705 if(uBeginIdx >= in_seq_data.size())
4706 {
4707 in_seq_data.erase();
4708 return 0;
4709 }
4710
4711 // If uLength is 0, Keep from uBeginIdx to end of in_seq
4712 if(uLength == 0)
4713 uLength = in_seq_data.size() - uBeginIdx;
4714
4715 // Check that uLength does not go beyond end of in_seq
4716 if((uBeginIdx + uLength) > in_seq_data.size())
4717 uLength = in_seq_data.size() - uBeginIdx;
4718
4719 // If uBeginIdx == 0 and uLength == in_seq_data.size()
4720 // just return as the entire sequence is being requested
4721 if((uBeginIdx == 0) && (uLength >= in_seq_data.size()))
4722 return uLength;
4723
4724 // Get two iterators on in_seq, one read and one write
4725 string::iterator i_read;
4726 string::iterator i_write;
4727
4728 // Determine begin and end of read
4729 i_read = in_seq_data.begin() + uBeginIdx;
4730 string::iterator i_read_end = i_read + uLength;
4731
4732 // Loop through in_seq for uLength bases
4733 // and shift uBeginIdx to beginning
4734 for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4735 (*i_write) = (*i_read);
4736 ++i_read;
4737 }
4738
4739 // Resize in_seq_data to uLength
4740 in_seq_data.resize(uLength);
4741
4742 return uLength;
4743 }
4744
4745
4746 // Method to implement Keep for Ncbistdaa
KeepNcbistdaa(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4747 TSeqPos CSeqportUtil_implementation::KeepNcbistdaa
4748 (CSeq_data* in_seq,
4749 TSeqPos uBeginIdx,
4750 TSeqPos uLength)
4751 const
4752 {
4753 // Get a reference to in_seq
4754 vector<char>& in_seq_data = in_seq->SetNcbistdaa().Set();
4755
4756 // If uBeginIdx past end of in_seq, return empty in_seq
4757 if(uBeginIdx >= in_seq_data.size())
4758 {
4759 in_seq_data.clear();
4760 return 0;
4761 }
4762
4763 // If uLength is 0, Keep from uBeginIdx to end of in_seq
4764 if(uLength == 0)
4765 uLength = in_seq_data.size() - uBeginIdx;
4766
4767 // Check that uLength does not go beyond end of in_seq
4768 if((uBeginIdx + uLength) > in_seq_data.size())
4769 uLength = in_seq_data.size() - uBeginIdx;
4770
4771 // If uBeginIdx == 0 and uLength == in_seq_data.size()
4772 // just return as the entire sequence is being requested
4773 if((uBeginIdx == 0) && (uLength >= in_seq_data.size()))
4774 return uLength;
4775
4776 // Get two iterators on in_seq, one read and one write
4777 vector<char>::iterator i_read;
4778 vector<char>::iterator i_write;
4779
4780 // Determine begin and end of read
4781 i_read = in_seq_data.begin() + uBeginIdx;
4782 vector<char>::iterator i_read_end = i_read + uLength;
4783
4784 // Loop through in_seq for uLength bases
4785 // and shift uBeginIdx to beginning
4786 for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4787 (*i_write) = (*i_read);
4788 ++i_read;
4789 }
4790
4791 // Resize in_seq_data to uLength
4792 in_seq_data.resize(uLength);
4793
4794 return uLength;
4795 }
4796
4797
4798 // Method to implement Keep for Iupacaa
KeepIupacaa(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4799 TSeqPos CSeqportUtil_implementation::KeepIupacaa
4800 (CSeq_data* in_seq,
4801 TSeqPos uBeginIdx,
4802 TSeqPos uLength)
4803 const
4804 {
4805 // Get a reference to in_seq
4806 string& in_seq_data = in_seq->SetIupacaa().Set();
4807
4808
4809 // If uBeginIdx past end of in_seq, return empty in_seq
4810 if (uBeginIdx >= in_seq_data.size()) {
4811 in_seq_data.erase();
4812 return 0;
4813 }
4814
4815 // If uLength is 0, Keep from uBeginIdx to end of in_seq
4816 if(uLength == 0)
4817 uLength = in_seq_data.size() - uBeginIdx;
4818
4819 // Check that uLength does not go beyond end of in_seq
4820 if((uBeginIdx + uLength) > in_seq_data.size())
4821 uLength = in_seq_data.size() - uBeginIdx;
4822
4823 // If uBeginIdx == 0 and uLength == in_seq_data.size()
4824 // just return as the entire sequence is being requested
4825 if((uBeginIdx == 0) && (uLength >= in_seq_data.size()))
4826 return uLength;
4827
4828 // Get two iterators on in_seq, one read and one write
4829 string::iterator i_read;
4830 string::iterator i_write;
4831
4832 // Determine begin and end of read
4833 i_read = in_seq_data.begin() + uBeginIdx;
4834 string::iterator i_read_end = i_read + uLength;
4835
4836 // Loop through in_seq for uLength bases
4837 // and shift uBeginIdx to beginning
4838 for(i_write = in_seq_data.begin(); i_read != i_read_end; ++i_write) {
4839 (*i_write) = (*i_read);
4840 ++i_read;
4841 }
4842
4843 // Resize in_seq_data to uLength
4844 in_seq_data.resize(uLength);
4845
4846 return uLength;
4847 }
4848
4849
4850
4851 // Methods to complement na sequences
4852
4853 // In place methods
ComplementIupacna(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4854 TSeqPos CSeqportUtil_implementation::ComplementIupacna
4855 (CSeq_data* in_seq,
4856 TSeqPos uBeginIdx,
4857 TSeqPos uLength)
4858 const
4859 {
4860 // Keep just the part of in_seq that will be complemented
4861 TSeqPos uKept = KeepIupacna(in_seq, uBeginIdx, uLength);
4862
4863 // Get in_seq data
4864 string& in_seq_data = in_seq->SetIupacna().Set();
4865
4866 // Get an iterator to in_seq_data
4867 string::iterator i_data;
4868
4869 // Get end of iteration--needed for performance
4870 string::iterator i_data_end = in_seq_data.end();
4871
4872 // Loop through the input sequence and complement it
4873 for(i_data = in_seq_data.begin(); i_data != i_data_end; ++i_data)
4874 (*i_data) =
4875 m_Iupacna_complement->m_Table[static_cast<unsigned char>(*i_data)];
4876
4877 return uKept;
4878 }
4879
4880
ComplementNcbi2na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4881 TSeqPos CSeqportUtil_implementation::ComplementNcbi2na
4882 (CSeq_data* in_seq,
4883 TSeqPos uBeginIdx,
4884 TSeqPos uLength)
4885 const
4886 {
4887 // Keep just the part of in_seq that will be complemented
4888 TSeqPos uKept = KeepNcbi2na(in_seq, uBeginIdx, uLength);
4889
4890 // Get in_seq data
4891 vector<char>& in_seq_data = in_seq->SetNcbi2na().Set();
4892
4893 // Get an iterator to in_seq_data
4894 vector<char>::iterator i_data;
4895
4896 // Get end of iteration
4897 vector<char>::iterator i_data_end = in_seq_data.end();
4898
4899 // Loop through the input sequence and complement it
4900 for(i_data = in_seq_data.begin(); i_data != i_data_end; ++i_data)
4901 (*i_data) =
4902 m_Ncbi2naComplement->m_Table[static_cast<unsigned char>(*i_data)];
4903
4904 return uKept;
4905 }
4906
4907
ComplementNcbi4na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4908 TSeqPos CSeqportUtil_implementation::ComplementNcbi4na
4909 (CSeq_data* in_seq,
4910 TSeqPos uBeginIdx,
4911 TSeqPos uLength)
4912 const
4913 {
4914 // Keep just the part of in_seq that will be complemented
4915 TSeqPos uKept = KeepNcbi4na(in_seq, uBeginIdx, uLength);
4916
4917 // Get in_seq data
4918 vector<char>& in_seq_data = in_seq->SetNcbi4na().Set();
4919
4920 // Get an iterator to in_seq_data
4921 vector<char>::iterator i_data;
4922
4923 // Get end of iteration--done for performance
4924 vector<char>::iterator i_data_end = in_seq_data.end();
4925
4926 // Loop through the input sequence and complement it
4927 for(i_data = in_seq_data.begin(); i_data != i_data_end; ++i_data)
4928 (*i_data) =
4929 m_Ncbi4naComplement->m_Table[static_cast<unsigned char>(*i_data)];
4930
4931 return uKept;
4932 }
4933
4934
4935 // Complement in copy methods
ComplementIupacna(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4936 TSeqPos CSeqportUtil_implementation::ComplementIupacna
4937 (const CSeq_data& in_seq,
4938 CSeq_data* out_seq,
4939 TSeqPos uBeginIdx,
4940 TSeqPos uLength)
4941 const
4942 {
4943 TSeqPos uKept = GetIupacnaCopy(in_seq, out_seq, uBeginIdx, uLength);
4944 TSeqPos uIdx1 = 0, uIdx2 = 0;
4945 ComplementIupacna(out_seq, uIdx1, uIdx2);
4946 return uKept;
4947 }
4948
4949
ComplementNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4950 TSeqPos CSeqportUtil_implementation::ComplementNcbi2na
4951 (const CSeq_data& in_seq,
4952 CSeq_data* out_seq,
4953 TSeqPos uBeginIdx,
4954 TSeqPos uLength)
4955 const
4956 {
4957 TSeqPos uKept = GetNcbi2naCopy(in_seq, out_seq, uBeginIdx, uLength);
4958 TSeqPos uIdx1 = 0, uIdx2 = 0;
4959 ComplementNcbi2na(out_seq, uIdx1, uIdx2);
4960 return uKept;
4961 }
4962
4963
ComplementNcbi4na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4964 TSeqPos CSeqportUtil_implementation::ComplementNcbi4na
4965 (const CSeq_data& in_seq,
4966 CSeq_data* out_seq,
4967 TSeqPos uBeginIdx,
4968 TSeqPos uLength)
4969 const
4970 {
4971 TSeqPos uKept = GetNcbi4naCopy(in_seq, out_seq, uBeginIdx, uLength);
4972 TSeqPos uIdx1 = 0, uIdx2 = 0;
4973 ComplementNcbi4na(out_seq, uIdx1, uIdx2);
4974 return uKept;
4975 }
4976
4977
4978 // Methods to reverse na sequences
4979
4980 // In place methods
ReverseIupacna(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const4981 TSeqPos CSeqportUtil_implementation::ReverseIupacna
4982 (CSeq_data* in_seq,
4983 TSeqPos uBeginIdx,
4984 TSeqPos uLength)
4985 const
4986 {
4987 // Keep just the part of in_seq that will be reversed
4988 TSeqPos uKept = KeepIupacna(in_seq, uBeginIdx, uLength);
4989
4990 // Get in_seq data
4991 string& in_seq_data = in_seq->SetIupacna().Set();
4992
4993 // Reverse the order of the string
4994 reverse(in_seq_data.begin(), in_seq_data.end());
4995
4996 return uKept;
4997 }
4998
4999
ReverseNcbi2na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5000 TSeqPos CSeqportUtil_implementation::ReverseNcbi2na
5001 (CSeq_data* in_seq,
5002 TSeqPos uBeginIdx,
5003 TSeqPos uLength)
5004 const
5005 {
5006 // Get a reference to in_seq data
5007 vector<char>& in_seq_data = in_seq->SetNcbi2na().Set();
5008
5009 // Validate and adjust uBeginIdx and uLength
5010 if(uBeginIdx >= 4*in_seq_data.size())
5011 {
5012 in_seq_data.erase(in_seq_data.begin(), in_seq_data.end());
5013 return 0;
5014 }
5015
5016 // If uLength is zero, set to end of sequence
5017 if(uLength == 0)
5018 uLength = 4*in_seq_data.size() - uBeginIdx;
5019
5020 // Ensure that uLength not beyond end of sequence
5021 if((uBeginIdx + uLength) > (4 * in_seq_data.size()))
5022 uLength = 4*in_seq_data.size() - uBeginIdx;
5023
5024 // Determine start and end bytes
5025 TSeqPos uStart = uBeginIdx/4;
5026 TSeqPos uEnd = uStart + (uLength - 1 +(uBeginIdx % 4))/4 + 1;
5027
5028 // Declare an iterator and get end of sequence
5029 vector<char>::iterator i_in;
5030 vector<char>::iterator i_in_begin = in_seq_data.begin() + uStart;
5031 vector<char>::iterator i_in_end = in_seq_data.begin() + uEnd;
5032
5033 // Loop through in_seq_data and reverse residues in each byte
5034 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
5035 (*i_in) = m_Ncbi2naRev->m_Table[static_cast<unsigned char>(*i_in)];
5036
5037 // Reverse the bytes in the sequence
5038 reverse(i_in_begin, i_in_end);
5039
5040 // Keep just the requested part of the sequence
5041 TSeqPos uJagged = 3 - ((uBeginIdx + uLength - 1) % 4) + 4*uStart;
5042 return KeepNcbi2na(in_seq, uJagged, uLength);
5043 }
5044
5045
ReverseNcbi4na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5046 TSeqPos CSeqportUtil_implementation::ReverseNcbi4na
5047 (CSeq_data* in_seq,
5048 TSeqPos uBeginIdx,
5049 TSeqPos uLength)
5050 const
5051 {
5052 // Get a reference to in_seq data
5053 vector<char>& in_seq_data = in_seq->SetNcbi4na().Set();
5054
5055 // Validate and adjust uBeginIdx and uLength
5056 if(uBeginIdx >= 2*in_seq_data.size())
5057 {
5058 in_seq_data.erase(in_seq_data.begin(), in_seq_data.end());
5059 return 0;
5060 }
5061
5062 // If uLength is zero, set to end of sequence
5063 if(uLength == 0)
5064 uLength = 2*in_seq_data.size() - uBeginIdx;
5065
5066 // Ensure that uLength not beyond end of sequence
5067 if((uBeginIdx + uLength) > (2 * in_seq_data.size()))
5068 uLength = 2*in_seq_data.size() - uBeginIdx;
5069
5070 // Determine start and end bytes
5071 TSeqPos uStart = uBeginIdx/2;
5072 TSeqPos uEnd = uStart + (uLength - 1 +(uBeginIdx % 2))/2 + 1;
5073
5074 // Declare an iterator and get end of sequence
5075 vector<char>::iterator i_in;
5076 vector<char>::iterator i_in_begin = in_seq_data.begin() + uStart;
5077 vector<char>::iterator i_in_end = in_seq_data.begin() + uEnd;
5078
5079 // Loop through in_seq_data and reverse residues in each byte
5080 for(i_in = i_in_begin; i_in != i_in_end; ++i_in)
5081 (*i_in) = m_Ncbi4naRev->m_Table[static_cast<unsigned char>(*i_in)];
5082
5083 // Reverse the bytes in the sequence
5084 reverse(i_in_begin, i_in_end);
5085
5086 // Keep just the requested part of the sequence
5087 TSeqPos uJagged = 1 - ((uBeginIdx + uLength - 1) % 2) + 2*uStart;
5088 return KeepNcbi4na(in_seq, uJagged, uLength);
5089 }
5090
5091
5092 // Reverse in copy methods
ReverseIupacna(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5093 TSeqPos CSeqportUtil_implementation::ReverseIupacna
5094 (const CSeq_data& in_seq,
5095 CSeq_data* out_seq,
5096 TSeqPos uBeginIdx,
5097 TSeqPos uLength)
5098 const
5099 {
5100 GetIupacnaCopy(in_seq, out_seq, uBeginIdx, uLength);
5101
5102 TSeqPos uIdx1 = 0, uIdx2 = uLength;
5103 return ReverseIupacna(out_seq, uIdx1, uIdx2);
5104 }
5105
5106
ReverseNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5107 TSeqPos CSeqportUtil_implementation::ReverseNcbi2na
5108 (const CSeq_data& in_seq,
5109 CSeq_data* out_seq,
5110 TSeqPos uBeginIdx,
5111 TSeqPos uLength)
5112 const
5113 {
5114 GetNcbi2naCopy(in_seq, out_seq, uBeginIdx, uLength);
5115
5116 TSeqPos uIdx1 = 0, uIdx2 = uLength;
5117 return ReverseNcbi2na(out_seq, uIdx1, uIdx2);
5118 }
5119
5120
ReverseNcbi4na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5121 TSeqPos CSeqportUtil_implementation::ReverseNcbi4na
5122 (const CSeq_data& in_seq,
5123 CSeq_data* out_seq,
5124 TSeqPos uBeginIdx,
5125 TSeqPos uLength)
5126 const
5127 {
5128 GetNcbi4naCopy(in_seq, out_seq, uBeginIdx, uLength);
5129
5130 TSeqPos uIdx1 = 0, uIdx2 = uLength;
5131 return ReverseNcbi4na(out_seq, uIdx1, uIdx2);
5132 }
5133
5134
5135 // Methods to reverse-complement an na sequences
5136
5137 // In place methods
ReverseComplementIupacna(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5138 TSeqPos CSeqportUtil_implementation::ReverseComplementIupacna
5139 (CSeq_data* in_seq,
5140 TSeqPos uBeginIdx,
5141 TSeqPos uLength)
5142 const
5143 {
5144 ReverseIupacna(in_seq, uBeginIdx, uLength);
5145
5146 TSeqPos uIdx = 0;
5147 return ComplementIupacna(in_seq, uIdx, uLength);
5148 }
5149
5150
ReverseComplementNcbi2na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5151 TSeqPos CSeqportUtil_implementation::ReverseComplementNcbi2na
5152 (CSeq_data* in_seq,
5153 TSeqPos uBeginIdx,
5154 TSeqPos uLength)
5155 const
5156 {
5157 ReverseNcbi2na(in_seq, uBeginIdx, uLength);
5158
5159 TSeqPos uIdx = 0;
5160 return ComplementNcbi2na(in_seq, uIdx, uLength);
5161 }
5162
5163
ReverseComplementNcbi4na(CSeq_data * in_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5164 TSeqPos CSeqportUtil_implementation::ReverseComplementNcbi4na
5165 (CSeq_data* in_seq,
5166 TSeqPos uBeginIdx,
5167 TSeqPos uLength)
5168 const
5169 {
5170 ReverseNcbi4na(in_seq, uBeginIdx, uLength);
5171
5172 TSeqPos uIdx = 0;
5173 return ComplementNcbi4na(in_seq, uIdx, uLength);
5174 }
5175
5176
5177 // Reverse in copy methods
ReverseComplementIupacna(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5178 TSeqPos CSeqportUtil_implementation::ReverseComplementIupacna
5179 (const CSeq_data& in_seq,
5180 CSeq_data* out_seq,
5181 TSeqPos uBeginIdx,
5182 TSeqPos uLength)
5183 const
5184 {
5185 ReverseIupacna(in_seq, out_seq, uBeginIdx, uLength);
5186
5187 TSeqPos uIdx = 0;
5188 return ComplementIupacna(out_seq, uIdx, uLength);
5189 }
5190
5191
ReverseComplementNcbi2na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5192 TSeqPos CSeqportUtil_implementation::ReverseComplementNcbi2na
5193 (const CSeq_data& in_seq,
5194 CSeq_data* out_seq,
5195 TSeqPos uBeginIdx,
5196 TSeqPos uLength)
5197 const
5198 {
5199 ReverseNcbi2na(in_seq, out_seq, uBeginIdx, uLength);
5200
5201 TSeqPos uIdx = 0;
5202 return ComplementNcbi2na(out_seq, uIdx, uLength);
5203 }
5204
5205
ReverseComplementNcbi4na(const CSeq_data & in_seq,CSeq_data * out_seq,TSeqPos uBeginIdx,TSeqPos uLength) const5206 TSeqPos CSeqportUtil_implementation::ReverseComplementNcbi4na
5207 (const CSeq_data& in_seq,
5208 CSeq_data* out_seq,
5209 TSeqPos uBeginIdx,
5210 TSeqPos uLength)
5211 const
5212 {
5213 ReverseNcbi4na(in_seq, out_seq, uBeginIdx, uLength);
5214
5215 TSeqPos uIdx = 0;
5216 return ComplementNcbi4na(out_seq, uIdx, uLength);
5217 }
5218
5219
5220 // Append methods
AppendIupacna(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5221 TSeqPos CSeqportUtil_implementation::AppendIupacna
5222 (CSeq_data* out_seq,
5223 const CSeq_data& in_seq1,
5224 TSeqPos uBeginIdx1,
5225 TSeqPos uLength1,
5226 const CSeq_data& in_seq2,
5227 TSeqPos uBeginIdx2,
5228 TSeqPos uLength2)
5229 const
5230 {
5231 // Get references to in_seqs
5232 const string& in_seq1_data = in_seq1.GetIupacna().Get();
5233 const string& in_seq2_data = in_seq2.GetIupacna().Get();
5234
5235 // Get a reference to out_seq
5236 out_seq->Reset();
5237 string& out_seq_data = out_seq->SetIupacna().Set();
5238
5239 // Validate and Adjust uBeginIdx_ and uLength_
5240 if((uBeginIdx1 >= in_seq1_data.size()) &&
5241 (uBeginIdx2 >= in_seq2_data.size()))
5242 return 0;
5243
5244 if(((uBeginIdx1 + uLength1) > in_seq1_data.size()) || uLength1 == 0)
5245 uLength1 = in_seq1_data.size() - uBeginIdx1;
5246
5247 if(((uBeginIdx2 + uLength2) > in_seq2_data.size()) || uLength2 == 0)
5248 uLength2 = in_seq2_data.size() - uBeginIdx2;
5249
5250 // Append the strings
5251 out_seq_data.append(in_seq1_data.substr(uBeginIdx1,uLength1));
5252 out_seq_data.append(in_seq2_data.substr(uBeginIdx2,uLength2));
5253
5254 return uLength1 + uLength2;
5255 }
5256
5257
AppendNcbi2na(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5258 TSeqPos CSeqportUtil_implementation::AppendNcbi2na
5259 (CSeq_data* out_seq,
5260 const CSeq_data& in_seq1,
5261 TSeqPos uBeginIdx1,
5262 TSeqPos uLength1,
5263 const CSeq_data& in_seq2,
5264 TSeqPos uBeginIdx2,
5265 TSeqPos uLength2)
5266 const
5267 {
5268 // Get references to in_seqs
5269 const vector<char>& in_seq1_data = in_seq1.GetNcbi2na().Get();
5270 const vector<char>& in_seq2_data = in_seq2.GetNcbi2na().Get();
5271
5272 // Get a reference to out_seq
5273 out_seq->Reset();
5274 vector<char>& out_seq_data = out_seq->SetNcbi2na().Set();
5275
5276 // Handle case where both uBeginidx go beyond in_seq
5277 if((uBeginIdx1 >= 4*in_seq1_data.size()) &&
5278 (uBeginIdx2 >= 4*in_seq2_data.size()))
5279 return 0;
5280
5281 // Handle case where uBeginIdx1 goes beyond end of in_seq1
5282 if(uBeginIdx1 >= 4*in_seq1_data.size())
5283 return GetNcbi2naCopy(in_seq2, out_seq, uBeginIdx2, uLength2);
5284
5285 // Handle case where uBeginIdx2 goes beyond end of in_seq2
5286 if(uBeginIdx2 >= 4*in_seq2_data.size())
5287 return GetNcbi2naCopy(in_seq1, out_seq, uBeginIdx1, uLength1);
5288
5289 // Validate and Adjust uBeginIdx_ and uLength_
5290 if(((uBeginIdx1 + uLength1) > 4*in_seq1_data.size()) || uLength1 == 0)
5291 uLength1 = 4*in_seq1_data.size() - uBeginIdx1;
5292
5293 if(((uBeginIdx2 + uLength2) > 4*in_seq2_data.size()) || uLength2 == 0)
5294 uLength2 = 4*in_seq2_data.size() - uBeginIdx2;
5295
5296
5297 // Resize out_seq_data to hold appended sequence
5298 TSeqPos uTotalLength = uLength1 + uLength2;
5299 if((uTotalLength % 4) == 0)
5300 out_seq_data.resize(uTotalLength/4);
5301 else
5302 out_seq_data.resize(uTotalLength/4 + 1);
5303
5304 // Calculate bit shifts required for in_seq1
5305 unsigned int lShift1 = 2*(uBeginIdx1 % 4);
5306 unsigned int rShift1 = 8 - lShift1;
5307
5308 // Calculate bit shifts required for in_seq2
5309 unsigned int lShift2, rShift2, uCase;
5310 unsigned int uVacantIdx = 2*(uLength1 % 4);
5311 unsigned int uStartIdx = 2*(uBeginIdx2 % 4);
5312 if((uVacantIdx < uStartIdx) && (uVacantIdx > 0))
5313 {
5314 uCase = 0;
5315 lShift2 = uStartIdx - uVacantIdx;
5316 rShift2 = 8 - lShift2;
5317 }
5318 else if((uVacantIdx < uStartIdx) && (uVacantIdx == 0))
5319 {
5320 uCase = 1;
5321 lShift2 = uStartIdx;
5322 rShift2 = 8 - lShift2;
5323 }
5324 else if((uVacantIdx == uStartIdx) && (uVacantIdx > 0))
5325 {
5326 uCase = 2;
5327 lShift2 = 0;
5328 rShift2 = 8;
5329 }
5330 else if((uVacantIdx == uStartIdx) && (uVacantIdx == 0))
5331 {
5332 uCase = 3;
5333 lShift2 = 0;
5334 rShift2 = 8;
5335 }
5336 else
5337 {
5338 uCase = 4;
5339 rShift2 = uVacantIdx - uStartIdx;
5340 lShift2 = 8 - rShift2;
5341 }
5342
5343
5344 // Determine begin and end points for iterators.
5345 TSeqPos uStart1 = uBeginIdx1/4;
5346 TSeqPos uEnd1;
5347 if(((uBeginIdx1 + uLength1) % 4) == 0)
5348 uEnd1 = (uBeginIdx1 + uLength1)/4;
5349 else
5350 uEnd1 = (uBeginIdx1 + uLength1)/4 + 1;
5351
5352 TSeqPos uStart2 = uBeginIdx2/4;
5353 TSeqPos uEnd2;
5354 if(((uBeginIdx2 + uLength2) % 4) == 0)
5355 uEnd2 = (uBeginIdx2 + uLength2)/4;
5356 else
5357 uEnd2 = (uBeginIdx2 + uLength2)/4 + 1;
5358
5359 // Get begin and end positions on in_seqs
5360 vector<char>::const_iterator i_in1_begin = in_seq1_data.begin() + uStart1;
5361 vector<char>::const_iterator i_in1_end = in_seq1_data.begin() + uEnd1 - 1;
5362 vector<char>::const_iterator i_in2_begin = in_seq2_data.begin() + uStart2;
5363 vector<char>::const_iterator i_in2_end = in_seq2_data.begin() + uEnd2;
5364
5365 // Declare iterators
5366 vector<char>::iterator i_out = out_seq_data.begin() - 1;
5367 vector<char>::const_iterator i_in1;
5368 vector<char>::const_iterator i_in2;
5369
5370 // Insert in_seq1 into out_seq
5371 for(i_in1 = i_in1_begin; i_in1 != i_in1_end; ++i_in1)
5372 (*(++i_out)) = ((*i_in1) << lShift1) | ((*(i_in1+1) & 255) >> rShift1);
5373
5374 // Handle last byte for in_seq1 if necessary
5375 TSeqPos uEndOutByte;
5376 if((uLength1 % 4) == 0)
5377 uEndOutByte = uLength1/4 - 1;
5378 else
5379 uEndOutByte = uLength1/4;
5380 if(i_out != (out_seq_data.begin() + uEndOutByte))
5381 (*(++i_out)) = (*i_in1) << lShift1;
5382
5383 // Connect in_seq1 and in_seq2
5384 unsigned char uMask1 = 255 << (8 - 2*(uLength1 % 4));
5385 unsigned char uMask2 = 255 >> (2*(uBeginIdx2 % 4));
5386 TSeqPos uSeq2Inc = 1;
5387
5388 switch (uCase) {
5389 case 0: // 0 < uVacantIdx < uStartIdx
5390 if((i_in2_begin + 1) == i_in2_end)
5391 {
5392 (*i_out) &= uMask1;
5393 (*i_out) |= ((*i_in2_begin) & uMask2) << lShift2;
5394 return uTotalLength;
5395 }
5396 else
5397 {
5398 (*i_out) &= uMask1;
5399 (*i_out) |=
5400 (((*i_in2_begin) & uMask2) << lShift2) |
5401 (((*(i_in2_begin+1)) & 255) >> rShift2);
5402 }
5403 break;
5404 case 1: // 0 == uVacantIdx < uStartIdx
5405 if((i_in2_begin + 1) == i_in2_end)
5406 {
5407 (*(++i_out)) = (*i_in2_begin) << lShift2;
5408 return uTotalLength;
5409 }
5410 else
5411 {
5412 (*(++i_out)) =
5413 ((*i_in2_begin) << lShift2) |
5414 (((*(i_in2_begin+1)) & 255) >> rShift2);
5415 }
5416 break;
5417 case 2: // uVacantIdx == uStartIdx > 0
5418 (*i_out) &= uMask1;
5419 (*i_out) |= (*i_in2_begin) & uMask2;
5420 if((i_in2_begin + 1) == i_in2_end)
5421 return uTotalLength;
5422 break;
5423 case 3: // uVacantIdx == uStartIdx == 0
5424 (*(++i_out)) = (*i_in2_begin);
5425 if((i_in2_begin + 1) == i_in2_end)
5426 return uTotalLength;
5427 break;
5428 case 4: // uVacantIdx > uStartIdx
5429 if((i_in2_begin + 1) == i_in2_end)
5430 {
5431 (*i_out) &= uMask1;
5432 (*i_out) |= ((*i_in2_begin) & uMask2) >> rShift2;
5433 if(++i_out != out_seq_data.end())
5434 (*i_out) = (*i_in2_begin) << lShift2;
5435 return uTotalLength;
5436 }
5437 else
5438 {
5439 (*i_out) &= uMask1;
5440 (*i_out) |=
5441 (((*i_in2_begin) & uMask2) >> rShift2) |
5442 ((*(i_in2_begin+1) & ~uMask2) << lShift2);
5443 uSeq2Inc = 0;
5444 }
5445
5446 }
5447
5448 // Insert in_seq2 into out_seq
5449 for(i_in2 = i_in2_begin+uSeq2Inc; (i_in2 != i_in2_end) &&
5450 ((i_in2+1) != i_in2_end); ++i_in2) {
5451 (*(++i_out)) = ((*i_in2) << lShift2) | ((*(i_in2+1) & 255) >> rShift2);
5452 }
5453
5454 // Handle last byte for in_seq2, if there is one
5455 if((++i_out != out_seq_data.end()) && (i_in2 != i_in2_end))
5456 (*i_out) = (*i_in2) << lShift2;
5457
5458 return uLength1 + uLength2;
5459 }
5460
5461
AppendNcbi4na(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5462 TSeqPos CSeqportUtil_implementation::AppendNcbi4na
5463 (CSeq_data* out_seq,
5464 const CSeq_data& in_seq1,
5465 TSeqPos uBeginIdx1,
5466 TSeqPos uLength1,
5467 const CSeq_data& in_seq2,
5468 TSeqPos uBeginIdx2,
5469 TSeqPos uLength2)
5470 const
5471 {
5472 // Get references to in_seqs
5473 const vector<char>& in_seq1_data = in_seq1.GetNcbi4na().Get();
5474 const vector<char>& in_seq2_data = in_seq2.GetNcbi4na().Get();
5475
5476 // Get a reference to out_seq
5477 out_seq->Reset();
5478 vector<char>& out_seq_data = out_seq->SetNcbi4na().Set();
5479
5480 // Handle both uBeginidx go beyond end of in_seq
5481 if((uBeginIdx1 >= 4*in_seq1_data.size()) &&
5482 (uBeginIdx2 >= 4*in_seq2_data.size()))
5483 return 0;
5484
5485 // Handle case where uBeginIdx1 goes beyond end of in_seq1
5486 if(uBeginIdx1 >= 4*in_seq1_data.size())
5487 return GetNcbi4naCopy(in_seq2, out_seq, uBeginIdx2, uLength2);
5488
5489 // Handle case where uBeginIdx2 goes beyond end of in_seq2
5490 if(uBeginIdx2 >= 4*in_seq2_data.size())
5491 return GetNcbi4naCopy(in_seq1, out_seq, uBeginIdx1, uLength1);
5492
5493 // Validate and Adjust uBeginIdx_ and uLength_
5494 if(((uBeginIdx1 + uLength1) > 2*in_seq1_data.size()) || uLength1 == 0)
5495 uLength1 = 2*in_seq1_data.size() - uBeginIdx1;
5496
5497 if(((uBeginIdx2 + uLength2) > 2*in_seq2_data.size()) || uLength2 == 0)
5498 uLength2 = 2*in_seq2_data.size() - uBeginIdx2;
5499
5500 // Resize out_seq_data to hold appended sequence
5501 TSeqPos uTotalLength = uLength1 + uLength2;
5502 if((uTotalLength % 2) == 0)
5503 out_seq_data.resize(uTotalLength/2);
5504 else
5505 out_seq_data.resize(uTotalLength/2 + 1);
5506
5507 // Calculate bit shifts required for in_seq1
5508 unsigned int lShift1 = 4*(uBeginIdx1 % 2);
5509 unsigned int rShift1 = 8 - lShift1;
5510
5511 // Calculate bit shifts required for in_seq2
5512 unsigned int lShift2, rShift2, uCase;
5513 unsigned int uVacantIdx = 4*(uLength1 % 2);
5514 unsigned int uStartIdx = 4*(uBeginIdx2 % 2);
5515 if((uVacantIdx < uStartIdx))
5516 {
5517 uCase = 1;
5518 lShift2 = uStartIdx;
5519 rShift2 = 8 - lShift2;
5520 }
5521 else if((uVacantIdx == uStartIdx) && (uVacantIdx > 0))
5522 {
5523 uCase = 2;
5524 lShift2 = 0;
5525 rShift2 = 8;
5526 }
5527 else if((uVacantIdx == uStartIdx) && (uVacantIdx == 0))
5528 {
5529 uCase = 3;
5530 lShift2 = 0;
5531 rShift2 = 8;
5532 }
5533 else
5534 {
5535 uCase = 4;
5536 rShift2 = uVacantIdx - uStartIdx;
5537 lShift2 = 8 - rShift2;
5538 }
5539
5540
5541 // Determine begin and end points for iterators.
5542 TSeqPos uStart1 = uBeginIdx1/2;
5543 TSeqPos uEnd1;
5544 if(((uBeginIdx1 + uLength1) % 2) == 0)
5545 uEnd1 = (uBeginIdx1 + uLength1)/2;
5546 else
5547 uEnd1 = (uBeginIdx1 + uLength1)/2 + 1;
5548
5549 TSeqPos uStart2 = uBeginIdx2/2;
5550 TSeqPos uEnd2;
5551 if(((uBeginIdx2 + uLength2) % 2) == 0)
5552 uEnd2 = (uBeginIdx2 + uLength2)/2;
5553 else
5554 uEnd2 = (uBeginIdx2 + uLength2)/2 + 1;
5555
5556 // Get begin and end positions on in_seqs
5557 vector<char>::const_iterator i_in1_begin = in_seq1_data.begin() + uStart1;
5558 vector<char>::const_iterator i_in1_end = in_seq1_data.begin() + uEnd1 - 1;
5559 vector<char>::const_iterator i_in2_begin = in_seq2_data.begin() + uStart2;
5560 vector<char>::const_iterator i_in2_end = in_seq2_data.begin() + uEnd2;
5561
5562 // Declare iterators
5563 vector<char>::iterator i_out = out_seq_data.begin() - 1;
5564 vector<char>::const_iterator i_in1;
5565 vector<char>::const_iterator i_in2;
5566
5567 // Insert in_seq1 into out_seq
5568 for(i_in1 = i_in1_begin; i_in1 != i_in1_end; ++i_in1)
5569 (*(++i_out)) = ((*i_in1) << lShift1) | ((*(i_in1+1) & 255) >> rShift1);
5570
5571 // Handle last byte for in_seq1 if necessary
5572 TSeqPos uEndOutByte;
5573 if((uLength1 % 2) == 0)
5574 uEndOutByte = uLength1/2 - 1;
5575 else
5576 uEndOutByte = uLength1/2;
5577 if(i_out != (out_seq_data.begin() + uEndOutByte))
5578 (*(++i_out)) = (*i_in1) << lShift1;
5579
5580 // Connect in_seq1 and in_seq2
5581 unsigned char uMask1 = 255 << (8 - 4*(uLength1 % 2));
5582 unsigned char uMask2 = 255 >> (4*(uBeginIdx2 % 2));
5583 TSeqPos uSeq2Inc = 1;
5584
5585 switch (uCase) {
5586 case 1: // 0 == uVacantIdx < uStartIdx
5587 if((i_in2_begin+1) == i_in2_end)
5588 {
5589 (*(++i_out)) = (*i_in2_begin) << lShift2;
5590 return uTotalLength;
5591 }
5592 else
5593 {
5594 (*(++i_out)) =
5595 ((*i_in2_begin) << lShift2) |
5596 (((*(i_in2_begin+1)) & 255) >> rShift2);
5597 }
5598 break;
5599 case 2: // uVacantIdx == uStartIdx > 0
5600 (*i_out) &= uMask1;
5601 (*i_out) |= (*i_in2_begin) & uMask2;
5602 if((i_in2_begin+1) == i_in2_end)
5603 return uTotalLength;
5604 break;
5605 case 3: // uVacantIdx == uStartIdx == 0
5606 (*(++i_out)) = (*i_in2_begin);
5607 if((i_in2_begin+1) == i_in2_end)
5608 return uTotalLength;
5609 break;
5610 case 4: // uVacantIdx > uStartIdx
5611 if((i_in2_begin+1) == i_in2_end)
5612 {
5613 (*i_out) &= uMask1;
5614 (*i_out) |= ((*i_in2_begin) & uMask2) >> rShift2;
5615 if(++i_out != out_seq_data.end())
5616 (*i_out) = (*i_in2_begin) << lShift2;
5617 return uTotalLength;
5618 }
5619 else
5620 {
5621 (*i_out) &= uMask1;
5622 (*i_out) |=
5623 (((*i_in2_begin) & uMask2) >> rShift2) |
5624 ((*(i_in2_begin+1) & ~uMask2) << lShift2);
5625 uSeq2Inc = 0;
5626 }
5627
5628 }
5629
5630 // Insert in_seq2 into out_seq
5631 for(i_in2 = i_in2_begin+uSeq2Inc; (i_in2 != i_in2_end) &&
5632 ((i_in2+1) != i_in2_end); ++i_in2) {
5633 (*(++i_out)) =
5634 ((*i_in2) << lShift2) | ((*(i_in2+1) & 255) >> rShift2);
5635 }
5636
5637 // Handle last byte for in_seq2, if there is one
5638 if((++i_out != out_seq_data.end()) && (i_in2 != i_in2_end))
5639 (*i_out) = (*i_in2) << lShift2;
5640
5641 return uTotalLength;
5642 }
5643
5644
AppendNcbieaa(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5645 TSeqPos CSeqportUtil_implementation::AppendNcbieaa
5646 (CSeq_data* out_seq,
5647 const CSeq_data& in_seq1,
5648 TSeqPos uBeginIdx1,
5649 TSeqPos uLength1,
5650 const CSeq_data& in_seq2,
5651 TSeqPos uBeginIdx2,
5652 TSeqPos uLength2)
5653 const
5654 {
5655 // Get references to in_seqs
5656 const string& in_seq1_data = in_seq1.GetNcbieaa().Get();
5657 const string& in_seq2_data = in_seq2.GetNcbieaa().Get();
5658
5659 // Get a reference to out_seq
5660 out_seq->Reset();
5661 string& out_seq_data = out_seq->SetNcbieaa().Set();
5662
5663 // Validate and Adjust uBeginIdx_ and uLength_
5664 if((uBeginIdx1 >= in_seq1_data.size()) &&
5665 (uBeginIdx2 >= in_seq2_data.size()))
5666 {
5667 return 0;
5668 }
5669
5670 if(((uBeginIdx1 + uLength1) > in_seq1_data.size()) || uLength1 == 0)
5671 uLength1 = in_seq1_data.size() - uBeginIdx1;
5672
5673 if(((uBeginIdx2 + uLength2) > in_seq2_data.size()) || uLength2 == 0)
5674 uLength2 = in_seq2_data.size() - uBeginIdx2;
5675
5676 // Append the strings
5677 out_seq_data.append(in_seq1_data.substr(uBeginIdx1,uLength1));
5678 out_seq_data.append(in_seq2_data.substr(uBeginIdx2,uLength2));
5679
5680 return uLength1 + uLength2;
5681 }
5682
5683
AppendNcbistdaa(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5684 TSeqPos CSeqportUtil_implementation::AppendNcbistdaa
5685 (CSeq_data* out_seq,
5686 const CSeq_data& in_seq1,
5687 TSeqPos uBeginIdx1,
5688 TSeqPos uLength1,
5689 const CSeq_data& in_seq2,
5690 TSeqPos uBeginIdx2,
5691 TSeqPos uLength2)
5692 const
5693 {
5694 // Get references to in_seqs
5695 const vector<char>& in_seq1_data = in_seq1.GetNcbistdaa().Get();
5696 const vector<char>& in_seq2_data = in_seq2.GetNcbistdaa().Get();
5697
5698 // Get a reference to out_seq
5699 out_seq->Reset();
5700 vector<char>& out_seq_data = out_seq->SetNcbistdaa().Set();
5701
5702 // Validate and Adjust uBeginIdx_ and uLength_
5703 if((uBeginIdx1 >= in_seq1_data.size()) &&
5704 (uBeginIdx2 >= in_seq2_data.size()))
5705 return 0;
5706
5707 if(((uBeginIdx1 + uLength1) > in_seq1_data.size()) || uLength1 == 0)
5708 uLength1 = in_seq1_data.size() - uBeginIdx1;
5709
5710 if(((uBeginIdx2 + uLength2) > in_seq2_data.size()) || uLength2 == 0)
5711 uLength2 = in_seq2_data.size() - uBeginIdx2;
5712
5713 // Get begin and end positions on in_seqs
5714 vector<char>::const_iterator i_in1_begin =
5715 in_seq1_data.begin() + uBeginIdx1;
5716 vector<char>::const_iterator i_in1_end = i_in1_begin + uLength1;
5717 vector<char>::const_iterator i_in2_begin =
5718 in_seq2_data.begin() + uBeginIdx2;
5719 vector<char>::const_iterator i_in2_end = i_in2_begin + uLength2;
5720
5721 // Insert the in_seqs into out_seq
5722 out_seq_data.insert(out_seq_data.end(), i_in1_begin, i_in1_end);
5723 out_seq_data.insert(out_seq_data.end(), i_in2_begin, i_in2_end);
5724
5725 return uLength1 + uLength2;
5726 }
5727
5728
AppendIupacaa(CSeq_data * out_seq,const CSeq_data & in_seq1,TSeqPos uBeginIdx1,TSeqPos uLength1,const CSeq_data & in_seq2,TSeqPos uBeginIdx2,TSeqPos uLength2) const5729 TSeqPos CSeqportUtil_implementation::AppendIupacaa
5730 (CSeq_data* out_seq,
5731 const CSeq_data& in_seq1,
5732 TSeqPos uBeginIdx1,
5733 TSeqPos uLength1,
5734 const CSeq_data& in_seq2,
5735 TSeqPos uBeginIdx2,
5736 TSeqPos uLength2)
5737 const
5738 {
5739 // Get references to in_seqs
5740 const string& in_seq1_data = in_seq1.GetIupacaa().Get();
5741 const string& in_seq2_data = in_seq2.GetIupacaa().Get();
5742
5743 // Get a reference to out_seq
5744 out_seq->Reset();
5745 string& out_seq_data = out_seq->SetIupacaa().Set();
5746
5747 // Validate and Adjust uBeginIdx_ and uLength_
5748 if((uBeginIdx1 >= in_seq1_data.size()) &&
5749 (uBeginIdx2 >= in_seq2_data.size()))
5750 {
5751 return 0;
5752 }
5753
5754 if(((uBeginIdx1 + uLength1) > in_seq1_data.size()) || uLength1 == 0)
5755 uLength1 = in_seq1_data.size() - uBeginIdx1;
5756
5757 if(((uBeginIdx2 + uLength2) > in_seq2_data.size()) || uLength2 == 0)
5758 uLength2 = in_seq2_data.size() - uBeginIdx2;
5759
5760 // Append the strings
5761 out_seq_data.append(in_seq1_data.substr(uBeginIdx1,uLength1));
5762 out_seq_data.append(in_seq2_data.substr(uBeginIdx2,uLength2));
5763
5764 return uLength1 + uLength2;
5765 }
5766
5767 // Returns the 3 letter Iupacaa3 code for an ncbistdaa index
GetIupacaa3(TIndex ncbistdaa)5768 const string& CSeqportUtil_implementation::GetIupacaa3
5769 (TIndex ncbistdaa)
5770 {
5771 return GetCodeOrName(eSeq_code_type_iupacaa3, ncbistdaa, true);
5772 }
5773
5774 // Returns true if code type is available
IsCodeAvailable(CSeq_data::E_Choice code_type)5775 bool CSeqportUtil_implementation::IsCodeAvailable
5776 (CSeq_data::E_Choice code_type)
5777 {
5778 if (code_type == CSeq_data::e_not_set) {
5779 return false;
5780 } else {
5781 return IsCodeAvailable(EChoiceToESeq(code_type));
5782 }
5783 }
5784
5785 // Return true if code type is available
IsCodeAvailable(ESeq_code_type code_type)5786 bool CSeqportUtil_implementation::IsCodeAvailable (ESeq_code_type code_type)
5787 {
5788 // Iterate through Seq-code-set looking for code type
5789 ITERATE (CSeq_code_set::TCodes, i_ct, m_SeqCodeSet->GetCodes()) {
5790 if((*i_ct)->GetCode() == code_type) {
5791 return true;
5792 }
5793 }
5794 return false;
5795 }
5796
5797 // Return a pair containing the first index (start-at) and last index
5798 // for code_type.
GetCodeIndexFromTo(CSeq_data::E_Choice code_type)5799 CSeqportUtil::TPair CSeqportUtil_implementation::GetCodeIndexFromTo
5800 (CSeq_data::E_Choice code_type)
5801 {
5802 return GetCodeIndexFromTo(EChoiceToESeq(code_type));
5803 }
5804
5805 // Return a pair containing the first index (start-at) and last index
5806 // for code_type.
GetCodeIndexFromTo(ESeq_code_type code_type)5807 CSeqportUtil::TPair CSeqportUtil_implementation::GetCodeIndexFromTo
5808 (ESeq_code_type code_type)
5809 {
5810 // Iterate through Seq-code-set looking for code type
5811 TPair p;
5812 ITERATE (CSeq_code_set::TCodes, i_ct, m_SeqCodeSet->GetCodes()) {
5813 if((*i_ct)->GetCode() == code_type) {
5814 if ( (*i_ct)->IsSetStart_at() ) {
5815 p.first = static_cast<TIndex>((*i_ct)->GetStart_at());
5816 } else {
5817 p.first = 0;
5818 }
5819 p.second = p.first + static_cast<TIndex>((*i_ct)->GetNum() - 1);
5820 return p;
5821 }
5822 }
5823 throw CSeqportUtil::CBadType("GetCodeIndexFromTo");
5824 }
5825
5826 // Converts CSeq_data::E_Choice type to ESeq_code_type
5827 // and calls overloaded GetCodeOrName()
GetCodeOrName(CSeq_data::E_Choice code_type,TIndex idx,bool get_code)5828 const string& CSeqportUtil_implementation::GetCodeOrName
5829 (CSeq_data::E_Choice code_type,
5830 TIndex idx,
5831 bool get_code)
5832 {
5833 return GetCodeOrName(EChoiceToESeq(code_type), idx, get_code);
5834 }
5835
5836 // Returns the code (symbol) of type code_type for index idx.
GetCodeOrName(ESeq_code_type code_type,TIndex idx,bool get_code)5837 const string& CSeqportUtil_implementation::GetCodeOrName
5838 (ESeq_code_type code_type,
5839 TIndex idx,
5840 bool get_code)
5841 {
5842 if ( !m_IndexString[get_code][code_type-1].size() ) {
5843 throw CSeqportUtil::CBadType("GetCodeOrName");
5844 }
5845 idx -= m_StartAt[code_type-1];
5846 if (idx >= m_IndexString[get_code][code_type-1].size()) {
5847 throw CSeqportUtil::CBadIndex(idx, "GetCodeOrName");
5848 }
5849 return m_IndexString[get_code][code_type-1][idx];
5850
5851 }
5852
5853 // Converts CSeq_data::E_Choice type to ESeq_code_type and call
5854 // overloaded GetIndex();
GetIndex(CSeq_data::E_Choice code_type,const string & code)5855 CSeqportUtil::TIndex CSeqportUtil_implementation::GetIndex
5856 (CSeq_data::E_Choice code_type,
5857 const string& code)
5858 {
5859 return GetIndex(EChoiceToESeq(code_type), code);
5860 }
5861
5862 // Get the index for code of type code_type. If not found, return -1
GetIndex(ESeq_code_type code_type,const string & code)5863 CSeqportUtil::TIndex CSeqportUtil_implementation::GetIndex
5864 (ESeq_code_type code_type,
5865 const string& code)
5866 {
5867 // Iterator to a map mapping a string code to a code index
5868 map<string, TIndex>::const_iterator pos;
5869
5870 if ( !m_StringIndex[code_type-1].size() ) {
5871 throw CSeqportUtil::CBadType("GetIndex");
5872 }
5873 pos = m_StringIndex[code_type-1].find(code);
5874 if (pos != m_StringIndex[code_type-1].end()) {
5875 return pos->second;
5876 } else {
5877 throw CSeqportUtil::CBadSymbol(code, "GetIndex");
5878 }
5879
5880 }
5881
5882 // Gets complement of index for code type. Returns -1 if code
5883 // type does not exist
GetIndexComplement(CSeq_data::E_Choice code_type,TIndex idx)5884 CSeqportUtil::TIndex CSeqportUtil_implementation::GetIndexComplement
5885 (CSeq_data::E_Choice code_type,
5886 TIndex idx)
5887 {
5888 return GetIndexComplement(EChoiceToESeq(code_type), idx);
5889 }
5890
5891 // Returns the complement of the index for code_type. If code_type
5892 // does not exist, or complements for code_type do not exist,
5893 // returns -1
GetIndexComplement(ESeq_code_type code_type,TIndex idx)5894 CSeqportUtil::TIndex CSeqportUtil_implementation::GetIndexComplement
5895 (ESeq_code_type code_type,
5896 TIndex idx)
5897 {
5898
5899 // Check that code is available
5900 if (!m_IndexComplement[code_type-1].size()) {
5901 throw CSeqportUtil::CBadType("GetIndexComplement");
5902 }
5903
5904 // Check that idx is in range of code indices
5905 idx -= m_StartAt[code_type-1];
5906 if ( idx >= m_IndexComplement[code_type-1].size() ) {
5907 throw CSeqportUtil::CBadIndex(idx, "GetIndexComplement");
5908 }
5909
5910 // Return the index of the complement
5911 return m_IndexComplement[code_type-1][idx];
5912 }
5913
GetMapToIndex(CSeq_data::E_Choice from_type,CSeq_data::E_Choice to_type,TIndex from_idx)5914 CSeqportUtil::TIndex CSeqportUtil_implementation::GetMapToIndex
5915 (CSeq_data::E_Choice from_type,
5916 CSeq_data::E_Choice to_type,
5917 TIndex from_idx)
5918 {
5919 return GetMapToIndex(EChoiceToESeq(from_type),
5920 EChoiceToESeq(to_type),
5921 from_idx);
5922 }
5923
GetMapToIndex(ESeq_code_type from_type,ESeq_code_type to_type,TIndex from_idx)5924 CSeqportUtil::TIndex CSeqportUtil_implementation::GetMapToIndex
5925 (ESeq_code_type from_type,
5926 ESeq_code_type to_type,
5927 TIndex from_idx)
5928 {
5929 CMap_table* Map = 0;
5930
5931 if (from_type == eSeq_code_type_iupacna) {
5932 if (to_type == eSeq_code_type_ncbi2na) {
5933 Map = m_IupacnaNcbi2na.GetPointer();
5934 } else if (to_type == eSeq_code_type_ncbi4na) {
5935 Map = m_IupacnaNcbi4na.GetPointer();
5936 }
5937 } else if (from_type == eSeq_code_type_ncbi4na) {
5938 if (to_type == eSeq_code_type_iupacna) {
5939 Map = m_Ncbi4naIupacna.GetPointer();
5940 } else if (to_type == eSeq_code_type_ncbi2na) {
5941 Map = m_Ncbi4naNcbi2na.GetPointer();
5942 }
5943 } else if (from_type == eSeq_code_type_ncbi2na) {
5944 if (to_type == eSeq_code_type_iupacna) {
5945 Map = m_Ncbi2naIupacna.GetPointer();
5946 } else if (to_type == eSeq_code_type_ncbi4na) {
5947 Map = m_Ncbi2naNcbi4na.GetPointer();
5948 }
5949 } else if (from_type == eSeq_code_type_iupacaa) {
5950 if (to_type == eSeq_code_type_ncbieaa) {
5951 Map = m_IupacaaNcbieaa.GetPointer();
5952 } else if (to_type == eSeq_code_type_ncbistdaa) {
5953 Map = m_IupacaaNcbistdaa.GetPointer();
5954 }
5955 } else if (from_type == eSeq_code_type_ncbieaa) {
5956 if (to_type == eSeq_code_type_iupacaa) {
5957 Map = m_NcbieaaIupacaa.GetPointer();
5958 } else if (to_type == eSeq_code_type_ncbistdaa) {
5959 Map = m_NcbieaaNcbistdaa.GetPointer();
5960 }
5961 } else if (from_type == eSeq_code_type_ncbistdaa) {
5962 if (to_type == eSeq_code_type_iupacaa) {
5963 Map = m_NcbistdaaIupacaa.GetPointer();
5964 } else if (to_type == eSeq_code_type_ncbieaa) {
5965 Map = m_NcbistdaaNcbieaa.GetPointer();
5966 }
5967 }
5968
5969 // Check that requested map is available
5970 if (!Map) {
5971 throw CSeqportUtil::CBadType("GetMapToIndex");
5972 }
5973
5974 // Check that from_idx is within range of from_type
5975 if (from_idx - (*Map).m_StartAt >= (TIndex)(*Map).m_Size) {
5976 throw CSeqportUtil::CBadIndex(from_idx - (*Map).m_StartAt,
5977 "GetMapToIndex");
5978 }
5979
5980 // Return map value
5981 return (*Map).m_Table[from_idx];
5982
5983
5984 }
5985
5986
x_GetSeqFromSeqData(const CSeq_data & data,const string ** str,const vector<char> ** vec) const5987 void CSeqportUtil_implementation::x_GetSeqFromSeqData
5988 (const CSeq_data& data,
5989 const string** str,
5990 const vector<char>** vec)
5991 const
5992 {
5993 *str = 0;
5994 *vec = 0;
5995
5996 switch ( data.Which() ) {
5997 case CSeq_data::e_Iupacna:
5998 *str = &(data.GetIupacna().Get());
5999 break;
6000
6001 case CSeq_data::e_Ncbi2na:
6002 *vec = &(data.GetNcbi2na().Get());
6003 break;
6004
6005 case CSeq_data::e_Ncbi4na:
6006 *vec = &(data.GetNcbi4na().Get());
6007 break;
6008
6009 case CSeq_data::e_Ncbi8na:
6010 *vec = &(data.GetNcbi8na().Get());
6011 break;
6012
6013 case CSeq_data::e_Iupacaa:
6014 *str = &(data.GetIupacaa().Get());
6015 break;
6016
6017 case CSeq_data::e_Ncbi8aa:
6018 *vec = &(data.GetNcbi8aa().Get());
6019 break;
6020
6021 case CSeq_data::e_Ncbieaa:
6022 *str = &(data.GetNcbieaa().Get());
6023 break;
6024
6025 case CSeq_data::e_Ncbistdaa:
6026 *vec = &(data.GetNcbistdaa().Get());
6027 break;
6028
6029 case CSeq_data::e_not_set:
6030 case CSeq_data::e_Ncbipna:
6031 case CSeq_data::e_Ncbipaa:
6032 case CSeq_data::e_Gap:
6033 break;
6034 } // end of switch statement
6035 }
6036
6037
6038 // same as above, but takes a non-const CSeq_data object.
x_GetSeqFromSeqData(CSeq_data & data,string ** str,vector<char> ** vec) const6039 void CSeqportUtil_implementation::x_GetSeqFromSeqData
6040 (CSeq_data& data,
6041 string** str,
6042 vector<char>** vec)
6043 const
6044 {
6045 *str = 0;
6046 *vec = 0;
6047
6048 switch ( data.Which() ) {
6049 case CSeq_data::e_Iupacna:
6050 *str = &(data.SetIupacna().Set());
6051 break;
6052
6053 case CSeq_data::e_Ncbi2na:
6054 *vec = &(data.SetNcbi2na().Set());
6055 break;
6056
6057 case CSeq_data::e_Ncbi4na:
6058 *vec = &(data.SetNcbi4na().Set());
6059 break;
6060
6061 case CSeq_data::e_Ncbi8na:
6062 *vec = &(data.SetNcbi8na().Set());
6063 break;
6064
6065 case CSeq_data::e_Iupacaa:
6066 *str = &(data.SetIupacaa().Set());
6067 break;
6068
6069 case CSeq_data::e_Ncbi8aa:
6070 *vec = &(data.SetNcbi8aa().Set());
6071 break;
6072
6073 case CSeq_data::e_Ncbieaa:
6074 *str = &(data.SetNcbieaa().Set());
6075 break;
6076
6077 case CSeq_data::e_Ncbistdaa:
6078 *vec = &(data.SetNcbistdaa().Set());
6079 break;
6080
6081 case CSeq_data::e_not_set:
6082 case CSeq_data::e_Ncbipna:
6083 case CSeq_data::e_Ncbipaa:
6084 case CSeq_data::e_Gap:
6085 break;
6086 } // end of switch statement
6087 }
6088
6089
6090 /////////////////////////////////////////////////////////////////////////////
6091 // CSeqportUtil_implementation::sm_StrAsnData -- some very long and ugly string
6092 //
6093
6094 // local copy of seqcode.prt sequence alphabet and conversion table ASN.1
6095 const char* CSeqportUtil_implementation::sm_StrAsnData[] =
6096 {
6097 "-- This is the set of NCBI sequence code tables\n",
6098 "-- J.Ostell 10/18/91\n",
6099 "--\n",
6100 "\n",
6101 "Seq-code-set ::= {\n",
6102 " codes { -- codes\n",
6103 " { -- IUPACna\n",
6104 " code iupacna ,\n",
6105 " num 25 , -- continuous 65-89\n",
6106 " one-letter TRUE , -- all one letter codes\n",
6107 " start-at 65 , -- starts with A, ASCII 65\n",
6108 " table {\n",
6109 " { symbol \"A\", name \"Adenine\" },\n",
6110 " { symbol \"B\" , name \"G or T or C\" },\n",
6111 " { symbol \"C\", name \"Cytosine\" },\n",
6112 " { symbol \"D\", name \"G or A or T\" },\n",
6113 " { symbol \"\", name \"\" },\n",
6114 " { symbol \"\", name \"\" },\n",
6115 " { symbol \"G\", name \"Guanine\" },\n",
6116 " { symbol \"H\", name \"A or C or T\" } ,\n",
6117 " { symbol \"\", name \"\" },\n",
6118 " { symbol \"\", name \"\" },\n",
6119 " { symbol \"K\", name \"G or T\" },\n",
6120 " { symbol \"\", name \"\"},\n",
6121 " { symbol \"M\", name \"A or C\" },\n",
6122 " { symbol \"N\", name \"A or G or C or T\" } ,\n",
6123 " { symbol \"\", name \"\" },\n",
6124 " { symbol \"\", name \"\" },\n",
6125 " { symbol \"\", name \"\"},\n",
6126 " { symbol \"R\", name \"G or A\"},\n",
6127 " { symbol \"S\", name \"G or C\"},\n",
6128 " { symbol \"T\", name \"Thymine\"},\n",
6129 " { symbol \"\", name \"\"},\n",
6130 " { symbol \"V\", name \"G or C or A\"},\n",
6131 " { symbol \"W\", name \"A or T\" },\n",
6132 " { symbol \"\", name \"\"},\n",
6133 " { symbol \"Y\", name \"T or C\"}\n",
6134 " } , -- end of table\n",
6135 " comps { -- complements\n",
6136 " 84,\n",
6137 " 86,\n",
6138 " 71,\n",
6139 " 72,\n",
6140 " 69,\n",
6141 " 70,\n",
6142 " 67,\n",
6143 " 68,\n",
6144 " 73,\n",
6145 " 74,\n",
6146 " 77,\n",
6147 " 76,\n",
6148 " 75,\n",
6149 " 78,\n",
6150 " 79,\n",
6151 " 80,\n",
6152 " 81,\n",
6153 " 89,\n",
6154 " 83,\n",
6155 " 65,\n",
6156 " 85,\n",
6157 " 66,\n",
6158 " 87,\n",
6159 " 88,\n",
6160 " 82\n",
6161 " }\n",
6162 " } ,\n",
6163 " { -- IUPACaa\n",
6164 " code iupacaa ,\n",
6165 " num 26 , -- continuous 65-90\n",
6166 " one-letter TRUE , -- all one letter codes\n",
6167 " start-at 65 , -- starts with A, ASCII 65\n",
6168 " table {\n",
6169 " { symbol \"A\", name \"Alanine\" },\n",
6170 " { symbol \"B\" , name \"Asp or Asn\" },\n",
6171 " { symbol \"C\", name \"Cysteine\" },\n",
6172 " { symbol \"D\", name \"Aspartic Acid\" },\n",
6173 " { symbol \"E\", name \"Glutamic Acid\" },\n",
6174 " { symbol \"F\", name \"Phenylalanine\" },\n",
6175 " { symbol \"G\", name \"Glycine\" },\n",
6176 " { symbol \"H\", name \"Histidine\" } ,\n",
6177 " { symbol \"I\", name \"Isoleucine\" },\n",
6178 " { symbol \"J\", name \"Leu or Ile\" },\n",
6179 " { symbol \"K\", name \"Lysine\" },\n",
6180 " { symbol \"L\", name \"Leucine\" },\n",
6181 " { symbol \"M\", name \"Methionine\" },\n",
6182 " { symbol \"N\", name \"Asparagine\" } ,\n",
6183 " { symbol \"O\", name \"Pyrrolysine\" },\n",
6184 " { symbol \"P\", name \"Proline\" },\n",
6185 " { symbol \"Q\", name \"Glutamine\"},\n",
6186 " { symbol \"R\", name \"Arginine\"},\n",
6187 " { symbol \"S\", name \"Serine\"},\n",
6188 " { symbol \"T\", name \"Threonine\"},\n",
6189 " { symbol \"U\", name \"Selenocysteine\"}, -- was empty\n",
6190 " { symbol \"V\", name \"Valine\"},\n",
6191 " { symbol \"W\", name \"Tryptophan\" },\n",
6192 " { symbol \"X\", name \"Undetermined or atypical\"},\n",
6193 " { symbol \"Y\", name \"Tyrosine\"},\n",
6194 " { symbol \"Z\", name \"Glu or Gln\" }\n",
6195 " } -- end of table \n",
6196 " } ,\n",
6197 " { -- IUPACeaa\n",
6198 " code ncbieaa ,\n",
6199 " num 49 , -- continuous 42-90\n",
6200 " one-letter TRUE , -- all one letter codes\n",
6201 " start-at 42 , -- starts with *, ASCII 42\n",
6202 " table {\n",
6203 " { symbol \"*\", name \"Termination\" } ,\n",
6204 " { symbol \"\", name \"\" } ,\n",
6205 " { symbol \"\", name \"\" } ,\n",
6206 " { symbol \"-\", name \"Gap\" } ,\n",
6207 " { symbol \"\", name \"\" } ,\n",
6208 " { symbol \"\", name \"\" } ,\n",
6209 " { symbol \"\", name \"\" } ,\n",
6210 " { symbol \"\", name \"\" } ,\n",
6211 " { symbol \"\", name \"\" } ,\n",
6212 " { symbol \"\", name \"\" } ,\n",
6213 " { symbol \"\", name \"\" } ,\n",
6214 " { symbol \"\", name \"\" } ,\n",
6215 " { symbol \"\", name \"\" } ,\n",
6216 " { symbol \"\", name \"\" } ,\n",
6217 " { symbol \"\", name \"\" } ,\n",
6218 " { symbol \"\", name \"\" } ,\n",
6219 " { symbol \"\", name \"\" } ,\n",
6220 " { symbol \"\", name \"\" } ,\n",
6221 " { symbol \"\", name \"\" } ,\n",
6222 " { symbol \"\", name \"\" } ,\n",
6223 " { symbol \"\", name \"\" } ,\n",
6224 " { symbol \"\", name \"\" } ,\n",
6225 " { symbol \"\", name \"\" } ,\n",
6226 " { symbol \"A\", name \"Alanine\" },\n",
6227 " { symbol \"B\" , name \"Asp or Asn\" },\n",
6228 " { symbol \"C\", name \"Cysteine\" },\n",
6229 " { symbol \"D\", name \"Aspartic Acid\" },\n",
6230 " { symbol \"E\", name \"Glutamic Acid\" },\n",
6231 " { symbol \"F\", name \"Phenylalanine\" },\n",
6232 " { symbol \"G\", name \"Glycine\" },\n",
6233 " { symbol \"H\", name \"Histidine\" } ,\n",
6234 " { symbol \"I\", name \"Isoleucine\" },\n",
6235 " { symbol \"J\", name \"Leu or Ile\" },\n",
6236 " { symbol \"K\", name \"Lysine\" },\n",
6237 " { symbol \"L\", name \"Leucine\" },\n",
6238 " { symbol \"M\", name \"Methionine\" },\n",
6239 " { symbol \"N\", name \"Asparagine\" } ,\n",
6240 " { symbol \"O\", name \"Pyrrolysine\" },\n",
6241 " { symbol \"P\", name \"Proline\" },\n",
6242 " { symbol \"Q\", name \"Glutamine\"},\n",
6243 " { symbol \"R\", name \"Arginine\"},\n",
6244 " { symbol \"S\", name \"Serine\"},\n",
6245 " { symbol \"T\", name \"Threonine\"},\n",
6246 " { symbol \"U\", name \"Selenocysteine\"},\n",
6247 " { symbol \"V\", name \"Valine\"},\n",
6248 " { symbol \"W\", name \"Tryptophan\" },\n",
6249 " { symbol \"X\", name \"Undetermined or atypical\"},\n",
6250 " { symbol \"Y\", name \"Tyrosine\"},\n",
6251 " { symbol \"Z\", name \"Glu or Gln\" }\n",
6252 " } -- end of table \n",
6253 " } ,\n",
6254 " { -- IUPACaa3\n",
6255 " code iupacaa3 ,\n",
6256 " num 28 , -- continuous 0-27\n",
6257 " one-letter FALSE , -- all 3 letter codes\n",
6258 " table {\n",
6259 " { symbol \"---\", name \"Gap\" } ,\n",
6260 " { symbol \"Ala\", name \"Alanine\" },\n",
6261 " { symbol \"Asx\" , name \"Asp or Asn\" },\n",
6262 " { symbol \"Cys\", name \"Cysteine\" },\n",
6263 " { symbol \"Asp\", name \"Aspartic Acid\" },\n",
6264 " { symbol \"Glu\", name \"Glutamic Acid\" },\n",
6265 " { symbol \"Phe\", name \"Phenylalanine\" },\n",
6266 " { symbol \"Gly\", name \"Glycine\" },\n",
6267 " { symbol \"His\", name \"Histidine\" } ,\n",
6268 " { symbol \"Ile\", name \"Isoleucine\" },\n",
6269 " { symbol \"Lys\", name \"Lysine\" },\n",
6270 " { symbol \"Leu\", name \"Leucine\" },\n",
6271 " { symbol \"Met\", name \"Methionine\" },\n",
6272 " { symbol \"Asn\", name \"Asparagine\" } ,\n",
6273 " { symbol \"Pro\", name \"Proline\" },\n",
6274 " { symbol \"Gln\", name \"Glutamine\"},\n",
6275 " { symbol \"Arg\", name \"Arginine\"},\n",
6276 " { symbol \"Ser\", name \"Serine\"},\n",
6277 " { symbol \"Thr\", name \"Threonine\"},\n",
6278 " { symbol \"Val\", name \"Valine\"},\n",
6279 " { symbol \"Trp\", name \"Tryptophan\" },\n",
6280 " { symbol \"Xxx\", name \"Undetermined or atypical\"},\n",
6281 " { symbol \"Tyr\", name \"Tyrosine\"},\n",
6282 " { symbol \"Glx\", name \"Glu or Gln\" },\n",
6283 " { symbol \"Sec\", name \"Selenocysteine\"},\n",
6284 " { symbol \"Ter\", name \"Termination\" },\n",
6285 " { symbol \"Pyl\", name \"Pyrrolysine\"},\n",
6286 " { symbol \"Xle\", name \"Leu or Ile\"}\n",
6287 " } -- end of table \n",
6288 " } ,\n",
6289 " { -- NCBIstdaa\n",
6290 " code ncbistdaa ,\n",
6291 " num 28 , -- continuous 0-27\n",
6292 " one-letter TRUE , -- all one letter codes\n",
6293 " table {\n",
6294 " { symbol \"-\", name \"Gap\" } , -- 0\n",
6295 " { symbol \"A\", name \"Alanine\" }, -- 1\n",
6296 " { symbol \"B\" , name \"Asp or Asn\" }, -- 2\n",
6297 " { symbol \"C\", name \"Cysteine\" }, -- 3\n",
6298 " { symbol \"D\", name \"Aspartic Acid\" }, -- 4\n",
6299 " { symbol \"E\", name \"Glutamic Acid\" }, -- 5\n",
6300 " { symbol \"F\", name \"Phenylalanine\" }, -- 6\n",
6301 " { symbol \"G\", name \"Glycine\" }, -- 7\n",
6302 " { symbol \"H\", name \"Histidine\" } , -- 8\n",
6303 " { symbol \"I\", name \"Isoleucine\" }, -- 9\n",
6304 " { symbol \"K\", name \"Lysine\" }, -- 10\n",
6305 " { symbol \"L\", name \"Leucine\" }, -- 11\n",
6306 " { symbol \"M\", name \"Methionine\" }, -- 12\n",
6307 " { symbol \"N\", name \"Asparagine\" } , -- 13\n",
6308 " { symbol \"P\", name \"Proline\" }, -- 14\n",
6309 " { symbol \"Q\", name \"Glutamine\"}, -- 15\n",
6310 " { symbol \"R\", name \"Arginine\"}, -- 16\n",
6311 " { symbol \"S\", name \"Serine\"}, -- 17\n",
6312 " { symbol \"T\", name \"Threoine\"}, -- 18\n",
6313 " { symbol \"V\", name \"Valine\"}, -- 19\n",
6314 " { symbol \"W\", name \"Tryptophan\" }, -- 20\n",
6315 " { symbol \"X\", name \"Undetermined or atypical\"}, -- 21\n",
6316 " { symbol \"Y\", name \"Tyrosine\"}, -- 22\n",
6317 " { symbol \"Z\", name \"Glu or Gln\" }, -- 23\n",
6318 " { symbol \"U\", name \"Selenocysteine\"}, -- 24 \n",
6319 " { symbol \"*\", name \"Termination\" }, -- 25\n",
6320 " { symbol \"O\", name \"Pyrrolysine\" }, -- 26\n",
6321 " { symbol \"J\", name \"Leu or Ile\" } -- 27\n",
6322 " } -- end of table \n",
6323 " } ,\n",
6324 " { -- NCBI2na\n",
6325 " code ncbi2na ,\n",
6326 " num 4 , -- continuous 0-3\n",
6327 " one-letter TRUE , -- all one letter codes\n",
6328 " table {\n",
6329 " { symbol \"A\", name \"Adenine\" },\n",
6330 " { symbol \"C\", name \"Cytosine\" },\n",
6331 " { symbol \"G\", name \"Guanine\" },\n",
6332 " { symbol \"T\", name \"Thymine/Uracil\"}\n",
6333 " } , -- end of table \n",
6334 " comps { -- complements\n",
6335 " 3,\n",
6336 " 2,\n",
6337 " 1,\n",
6338 " 0\n",
6339 " }\n",
6340 " } ,\n",
6341 " { -- NCBI4na\n",
6342 " code ncbi4na ,\n",
6343 " num 16 , -- continuous 0-15\n",
6344 " one-letter TRUE , -- all one letter codes\n",
6345 " table {\n",
6346 " { symbol \"-\", name \"Gap\" } ,\n",
6347 " { symbol \"A\", name \"Adenine\" },\n",
6348 " { symbol \"C\", name \"Cytosine\" },\n",
6349 " { symbol \"M\", name \"A or C\" },\n",
6350 " { symbol \"G\", name \"Guanine\" },\n",
6351 " { symbol \"R\", name \"G or A\"},\n",
6352 " { symbol \"S\", name \"G or C\"},\n",
6353 " { symbol \"V\", name \"G or C or A\"},\n",
6354 " { symbol \"T\", name \"Thymine/Uracil\"},\n",
6355 " { symbol \"W\", name \"A or T\" },\n",
6356 " { symbol \"Y\", name \"T or C\"} ,\n",
6357 " { symbol \"H\", name \"A or C or T\" } ,\n",
6358 " { symbol \"K\", name \"G or T\" },\n",
6359 " { symbol \"D\", name \"G or A or T\" },\n",
6360 " { symbol \"B\" , name \"G or T or C\" },\n",
6361 " { symbol \"N\", name \"A or G or C or T\" }\n",
6362 " } , -- end of table \n",
6363 " comps { -- complements\n",
6364 " 0 ,\n",
6365 " 8 ,\n",
6366 " 4 ,\n",
6367 " 12,\n",
6368 " 2 ,\n",
6369 " 10,\n",
6370 " 6 ,\n",
6371 " 14,\n",
6372 " 1 ,\n",
6373 " 9 ,\n",
6374 " 5 ,\n",
6375 " 13,\n",
6376 " 3 ,\n",
6377 " 11,\n",
6378 " 7 ,\n",
6379 " 15\n",
6380 " }\n",
6381 " }\n",
6382 " } , -- end of codes\n",
6383 " maps {\n",
6384 " {\n",
6385 " from iupacna ,\n",
6386 " to ncbi2na ,\n",
6387 " num 25 ,\n",
6388 " start-at 65 ,\n",
6389 " table {\n",
6390 " 0, -- A -> A\n",
6391 " 1, -- B -> C\n",
6392 " 1, -- C -> C\n",
6393 " 2, -- D -> G\n",
6394 " 255,\n",
6395 " 255,\n",
6396 " 2, -- G -> G\n",
6397 " 0, -- H -> A\n",
6398 " 255,\n",
6399 " 255,\n",
6400 " 2, -- K -> G\n",
6401 " 255,\n",
6402 " 1, -- M -> C\n",
6403 " 0, -- N -> A\n",
6404 " 255,\n",
6405 " 255,\n",
6406 " 255,\n",
6407 " 2, -- R -> G\n",
6408 " 1, -- S -> C\n",
6409 " 3, -- T -> T\n",
6410 " 255,\n",
6411 " 0, -- V -> A\n",
6412 " 3, -- W -> T\n",
6413 " 255,\n",
6414 " 3 } -- Y -> T\n",
6415 " } ,\n",
6416 " {\n",
6417 " from iupacna ,\n",
6418 " to ncbi4na ,\n",
6419 " num 26 ,\n",
6420 " start-at 64 ,\n",
6421 " table {\n",
6422 " 0, -- @ used by FastaToSeqEntry to convert hyphen to gap\n",
6423 " 1, -- A\n",
6424 " 14, -- B\n",
6425 " 2, -- C\n",
6426 " 13, -- D\n",
6427 " 255,\n",
6428 " 255,\n",
6429 " 4, -- G\n",
6430 " 11, -- H\n",
6431 " 255,\n",
6432 " 255,\n",
6433 " 12, -- K\n",
6434 " 255,\n",
6435 " 3, -- M\n",
6436 " 15, -- N\n",
6437 " 255,\n",
6438 " 255,\n",
6439 " 255,\n",
6440 " 5, -- R\n",
6441 " 6, -- S\n",
6442 " 8, -- T\n",
6443 " 255,\n",
6444 " 7, -- V\n",
6445 " 9, -- W\n",
6446 " 255,\n",
6447 " 10 } -- Y\n",
6448 " } ,\n",
6449 " {\n",
6450 " from ncbi2na ,\n",
6451 " to iupacna ,\n",
6452 " num 4 ,\n",
6453 " table {\n",
6454 " 65, -- A\n",
6455 " 67, -- C\n",
6456 " 71, -- G\n",
6457 " 84 } -- T\n",
6458 " } ,\n",
6459 " {\n",
6460 " from ncbi2na ,\n",
6461 " to ncbi4na ,\n",
6462 " num 4 ,\n",
6463 " table {\n",
6464 " 1, -- A\n",
6465 " 2, -- C\n",
6466 " 4, -- G\n",
6467 " 8 } -- T\n",
6468 " } ,\n",
6469 " {\n",
6470 " from ncbi4na ,\n",
6471 " to iupacna ,\n",
6472 " num 16 ,\n",
6473 " table {\n",
6474 " 78, -- gap -> N\n",
6475 " 65, -- A\n",
6476 " 67, -- C\n",
6477 " 77, -- M\n",
6478 " 71, -- G\n",
6479 " 82, -- R\n",
6480 " 83, -- S\n",
6481 " 86, -- V\n",
6482 " 84, -- T\n",
6483 " 87, -- W\n",
6484 " 89, -- Y\n",
6485 " 72, -- H\n",
6486 " 75, -- K\n",
6487 " 68, -- D\n",
6488 " 66, -- B\n",
6489 " 78 } -- N\n",
6490 " } ,\n",
6491 " {\n",
6492 " from ncbi4na ,\n",
6493 " to ncbi2na ,\n",
6494 " num 16 ,\n",
6495 " table {\n",
6496 " 3, -- gap -> T\n",
6497 " 0, -- A -> A\n",
6498 " 1, -- C -> C\n",
6499 " 1, -- M -> C\n",
6500 " 2, -- G -> G\n",
6501 " 2, -- R -> G\n",
6502 " 1, -- S -> C\n",
6503 " 0, -- V -> A\n",
6504 " 3, -- T -> T\n",
6505 " 3, -- W -> T\n",
6506 " 3, -- Y -> T\n",
6507 " 0, -- H -> A\n",
6508 " 2, -- K -> G\n",
6509 " 2, -- D -> G\n",
6510 " 1, -- B -> C\n",
6511 " 0 } -- N -> A\n",
6512 " } ,\n",
6513 " {\n",
6514 " from iupacaa ,\n",
6515 " to ncbieaa ,\n",
6516 " num 26 ,\n",
6517 " start-at 65 ,\n",
6518 " table {\n",
6519 " 65 , -- they map directly\n",
6520 " 66 ,\n",
6521 " 67 ,\n",
6522 " 68,\n",
6523 " 69,\n",
6524 " 70,\n",
6525 " 71,\n",
6526 " 72,\n",
6527 " 73,\n",
6528 " 74, -- J - was 255\n",
6529 " 75,\n",
6530 " 76,\n",
6531 " 77,\n",
6532 " 78,\n",
6533 " 79, -- O - was 255\n",
6534 " 80,\n",
6535 " 81,\n",
6536 " 82,\n",
6537 " 83,\n",
6538 " 84,\n",
6539 " 85, -- U - was 255\n",
6540 " 86,\n",
6541 " 87,\n",
6542 " 88,\n",
6543 " 89,\n",
6544 " 90 }\n",
6545 " } ,\n",
6546 " {\n",
6547 " from ncbieaa ,\n",
6548 " to iupacaa ,\n",
6549 " num 49 ,\n",
6550 " start-at 42 ,\n",
6551 " table {\n",
6552 " 88 , -- termination -> X\n",
6553 " 255,\n",
6554 " 255,\n",
6555 " 88, -- Gap -> X\n",
6556 " 255,\n",
6557 " 255,\n",
6558 " 255,\n",
6559 " 255,\n",
6560 " 255,\n",
6561 " 255,\n",
6562 " 255,\n",
6563 " 255,\n",
6564 " 255,\n",
6565 " 255,\n",
6566 " 255,\n",
6567 " 255,\n",
6568 " 255,\n",
6569 " 255,\n",
6570 " 255,\n",
6571 " 255,\n",
6572 " 255,\n",
6573 " 255,\n",
6574 " 255,\n",
6575 " 65 , -- from here they map directly\n",
6576 " 66 ,\n",
6577 " 67 ,\n",
6578 " 68,\n",
6579 " 69,\n",
6580 " 70,\n",
6581 " 71,\n",
6582 " 72,\n",
6583 " 73,\n",
6584 " 74, -- J - was 255\n",
6585 " 75,\n",
6586 " 76,\n",
6587 " 77,\n",
6588 " 78,\n",
6589 " 79, -- O - was 255\n",
6590 " 80,\n",
6591 " 81,\n",
6592 " 82,\n",
6593 " 83,\n",
6594 " 84,\n",
6595 " 85, -- U was -> X 88\n",
6596 " 86,\n",
6597 " 87,\n",
6598 " 88,\n",
6599 " 89,\n",
6600 " 90 }\n",
6601 " } ,\n",
6602 " {\n",
6603 " from iupacaa ,\n",
6604 " to ncbistdaa ,\n",
6605 " num 26 ,\n",
6606 " start-at 65 ,\n",
6607 " table {\n",
6608 " 1 , -- they map directly\n",
6609 " 2 ,\n",
6610 " 3 ,\n",
6611 " 4,\n",
6612 " 5,\n",
6613 " 6,\n",
6614 " 7,\n",
6615 " 8,\n",
6616 " 9,\n",
6617 " 27, -- J - was 255\n",
6618 " 10,\n",
6619 " 11,\n",
6620 " 12,\n",
6621 " 13,\n",
6622 " 26, -- O - was 255\n",
6623 " 14,\n",
6624 " 15,\n",
6625 " 16,\n",
6626 " 17,\n",
6627 " 18,\n",
6628 " 24, -- U - was 255\n",
6629 " 19,\n",
6630 " 20,\n",
6631 " 21,\n",
6632 " 22,\n",
6633 " 23 }\n",
6634 " } ,\n",
6635 " {\n",
6636 " from ncbieaa ,\n",
6637 " to ncbistdaa ,\n",
6638 " num 49 ,\n",
6639 " start-at 42 ,\n",
6640 " table {\n",
6641 " 25, -- termination\n",
6642 " 255,\n",
6643 " 255,\n",
6644 " 0, -- Gap\n",
6645 " 255,\n",
6646 " 255,\n",
6647 " 255,\n",
6648 " 255,\n",
6649 " 255,\n",
6650 " 255,\n",
6651 " 255,\n",
6652 " 255,\n",
6653 " 255,\n",
6654 " 255,\n",
6655 " 255,\n",
6656 " 255,\n",
6657 " 255,\n",
6658 " 255,\n",
6659 " 255,\n",
6660 " 255,\n",
6661 " 255,\n",
6662 " 255,\n",
6663 " 255,\n",
6664 " 1 , -- they map directly\n",
6665 " 2 ,\n",
6666 " 3 ,\n",
6667 " 4,\n",
6668 " 5,\n",
6669 " 6,\n",
6670 " 7,\n",
6671 " 8,\n",
6672 " 9,\n",
6673 " 27, -- J - was 255\n",
6674 " 10,\n",
6675 " 11,\n",
6676 " 12,\n",
6677 " 13,\n",
6678 " 26, -- O - was 255\n",
6679 " 14,\n",
6680 " 15,\n",
6681 " 16,\n",
6682 " 17,\n",
6683 " 18,\n",
6684 " 24, -- U\n",
6685 " 19,\n",
6686 " 20,\n",
6687 " 21,\n",
6688 " 22,\n",
6689 " 23 }\n",
6690 " } ,\n",
6691 " {\n",
6692 " from ncbistdaa ,\n",
6693 " to ncbieaa ,\n",
6694 " num 28 ,\n",
6695 " table {\n",
6696 " 45 , -- \"-\"\n",
6697 " 65 , -- they map directly with holes for O and J\n",
6698 " 66 ,\n",
6699 " 67 ,\n",
6700 " 68,\n",
6701 " 69,\n",
6702 " 70,\n",
6703 " 71,\n",
6704 " 72,\n",
6705 " 73,\n",
6706 " 75,\n",
6707 " 76,\n",
6708 " 77,\n",
6709 " 78,\n",
6710 " 80,\n",
6711 " 81,\n",
6712 " 82,\n",
6713 " 83,\n",
6714 " 84,\n",
6715 " 86,\n",
6716 " 87,\n",
6717 " 88,\n",
6718 " 89,\n",
6719 " 90,\n",
6720 " 85, -- U\n",
6721 " 42, -- *\n",
6722 " 79, -- O - new\n",
6723 " 74} -- J - new\n",
6724 " } ,\n",
6725 " {\n",
6726 " from ncbistdaa ,\n",
6727 " to iupacaa ,\n",
6728 " num 28 ,\n",
6729 " table {\n",
6730 " 255 , -- \"-\"\n",
6731 " 65 , -- they map directly with holes for O and J\n",
6732 " 66 ,\n",
6733 " 67 ,\n",
6734 " 68,\n",
6735 " 69,\n",
6736 " 70,\n",
6737 " 71,\n",
6738 " 72,\n",
6739 " 73,\n",
6740 " 75,\n",
6741 " 76,\n",
6742 " 77,\n",
6743 " 78,\n",
6744 " 80,\n",
6745 " 81,\n",
6746 " 82,\n",
6747 " 83,\n",
6748 " 84,\n",
6749 " 86,\n",
6750 " 87,\n",
6751 " 88,\n",
6752 " 89,\n",
6753 " 90,\n",
6754 " 85, -- U - was 88\n",
6755 " 255, -- *\n",
6756 " 79, -- O - new\n",
6757 " 74} -- J - new\n",
6758 " } \n",
6759 " }\n",
6760 "-- end of seq-code-set -- }", // make sure '}' is last symbol of ASN text
6761 0 // to indicate that there is no more data
6762 };
6763
6764
6765 END_objects_SCOPE
6766 END_NCBI_SCOPE
6767