1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef SSTRING_H_
21 #define SSTRING_H_
22
23 #include <string.h>
24 #include <iostream>
25 #include "assert_helpers.h"
26 #include "alphabet.h"
27 #include "random_source.h"
28
29 /**
30 * Four kinds of strings defined here:
31 *
32 * SString:
33 * A fixed-length string using heap memory with size set at construction time
34 * or when install() member is called.
35 *
36 * S2bDnaString:
37 * Like SString, but stores a list uint32_t words where each word is divided
38 * into 16 2-bit slots interpreted as holding one A/C/G/T nucleotide each.
39 *
40 * TODO: S3bDnaString allowing N. S4bDnaString allowing nucleotide masks.
41 *
42 * SStringExpandable:
43 * A string using heap memory where the size of the backing store is
44 * automatically resized as needed. Supports operations like append, insert,
45 * erase, etc.
46 *
47 * SStringFixed:
48 * A fixed-length string using stack memory where size is set at compile
49 * time.
50 *
51 * All string classes have some extra facilities that make it easy to print the
52 * string, including when the string uses an encoded alphabet. See toZBuf()
53 * and toZBufXForm().
54 *
55 * Global lt, eq, and gt template functions are supplied. They are capable of
56 * doing lexicographical comparisons between any of the three categories of
57 * strings defined here.
58 */
59
60 template<typename T>
61 class Class_sstr_len {
62 public:
sstr_len(const T & s)63 static inline size_t sstr_len(const T& s) {
64 return s.length();
65 }
66 };
67
68 template<unsigned N>
69 class Class_sstr_len<const char[N]> {
70 public:
sstr_len(const char s[N])71 static inline size_t sstr_len(const char s[N]) {
72 return strlen(s);
73 }
74 };
75
76 template<>
77 class Class_sstr_len<const char *> {
78 public:
sstr_len(const char * s)79 static inline size_t sstr_len(const char *s) {
80 return strlen(s);
81 }
82 };
83
84 template<>
85 class Class_sstr_len<const unsigned char *> {
86 public:
sstr_len(const unsigned char * s)87 static inline size_t sstr_len(const unsigned char *s) {
88 return strlen((const char *)s);
89 }
90 };
91
92 template<typename T1, typename T2>
sstr_eq(const T1 & s1,const T2 & s2)93 static inline bool sstr_eq(const T1& s1, const T2& s2) {
94 size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
95 size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
96 if(len1 != len2) return false;
97 for(size_t i = 0; i < len1; i++) {
98 if(s1[i] != s2[i]) return false;
99 }
100 return true;
101 }
102
103 template<typename T1, typename T2>
sstr_neq(const T1 & s1,const T2 & s2)104 static inline bool sstr_neq(const T1& s1, const T2& s2) {
105 return !sstr_eq(s1, s2);
106 }
107
108 /**
109 * Return true iff the given suffix of s1 is equal to the given suffix of s2 up
110 * to upto characters.
111 */
112 template<typename T1, typename T2>
113 static inline bool sstr_suf_upto_eq(
114 const T1& s1, size_t suf1,
115 const T2& s2, size_t suf2,
116 size_t upto,
117 bool endlt = true)
118 {
119 assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
120 assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
121 size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
122 size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
123 if(len1 > upto) len1 = upto;
124 if(len2 > upto) len2 = upto;
125 if(len1 != len2) return false;
126 for(size_t i = 0; i < len1; i++) {
127 if(s1[suf1+i] != s2[suf2+i]) {
128 return false;
129 }
130 }
131 return true;
132 }
133
134 /**
135 * Return true iff the given suffix of s1 is equal to the given suffix of s2 up
136 * to upto characters.
137 */
138 template<typename T1, typename T2>
139 static inline bool sstr_suf_upto_neq(
140 const T1& s1, size_t suf1,
141 const T2& s2, size_t suf2,
142 size_t upto,
143 bool endlt = true)
144 {
145 return !sstr_suf_upto_eq(s1, suf1, s2, suf2, upto, endlt);
146 }
147
148 /**
149 * Return true iff s1 is less than s2.
150 */
151 template<typename T1, typename T2>
152 static inline bool sstr_lt(const T1& s1, const T2& s2, bool endlt = true) {
153 size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
154 size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
155 size_t minlen = (len1 < len2 ? len1 : len2);
156 for(size_t i = 0; i < minlen; i++) {
157 if(s1[i] < s2[i]) {
158 return true;
159 } else if(s1[i] > s2[i]) {
160 return false;
161 }
162 }
163 if(len1 == len2) return false;
164 return (len1 < len2) == endlt;
165 }
166
167 /**
168 * Return true iff the given suffix of s1 is less than the given suffix of s2.
169 */
170 template<typename T1, typename T2>
171 static inline bool sstr_suf_lt(
172 const T1& s1, size_t suf1,
173 const T2& s2, size_t suf2,
174 bool endlt = true)
175 {
176 assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
177 assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
178 size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
179 size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
180 size_t minlen = (len1 < len2 ? len1 : len2);
181 for(size_t i = 0; i < minlen; i++) {
182 if(s1[suf1+i] < s2[suf2+i]) {
183 return true;
184 } else if(s1[suf1+i] > s2[suf2+i]) {
185 return false;
186 }
187 }
188 if(len1 == len2) return false;
189 return (len1 < len2) == endlt;
190 }
191
192 /**
193 * Return true iff the given suffix of s1 is less than the given suffix of s2.
194 * Treat s1 and s2 as though they have lengths len1/len2.
195 */
196 template<typename T1, typename T2>
197 static inline bool sstr_suf_lt(
198 const T1& s1, size_t suf1, size_t len1,
199 const T2& s2, size_t suf2, size_t len2,
200 bool endlt = true)
201 {
202 assert_leq(suf1, len1);
203 assert_leq(suf2, len2);
204 size_t left1 = len1 - suf1;
205 size_t left2 = len2 - suf2;
206 size_t minleft = (left1 < left2 ? left1 : left2);
207 for(size_t i = 0; i < minleft; i++) {
208 if(s1[suf1+i] < s2[suf2+i]) {
209 return true;
210 } else if(s1[suf1+i] > s2[suf2+i]) {
211 return false;
212 }
213 }
214 if(left1 == left2) return false;
215 return (left1 < left2) == endlt;
216 }
217
218 /**
219 * Return true iff the given suffix of s1 is less than the given suffix of s2
220 * up to upto characters.
221 */
222 template<typename T1, typename T2>
223 static inline bool sstr_suf_upto_lt(
224 const T1& s1, size_t suf1,
225 const T2& s2, size_t suf2,
226 size_t upto,
227 bool endlt = true)
228 {
229 assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
230 assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
231 size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
232 size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
233 if(len1 > upto) len1 = upto;
234 if(len2 > upto) len2 = upto;
235 size_t minlen = (len1 < len2 ? len1 : len2);
236 for(size_t i = 0; i < minlen; i++) {
237 if(s1[suf1+i] < s2[suf2+i]) {
238 return true;
239 } else if(s1[suf1+i] > s2[suf2+i]) {
240 return false;
241 }
242 }
243 if(len1 == len2) return false;
244 return (len1 < len2) == endlt;
245 }
246
247 /**
248 * Return true iff the given prefix of s1 is less than the given prefix of s2.
249 */
250 template<typename T1, typename T2>
251 static inline bool sstr_pre_lt(
252 const T1& s1, size_t pre1,
253 const T2& s2, size_t pre2,
254 bool endlt = true)
255 {
256 assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
257 assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
258 size_t len1 = pre1;
259 size_t len2 = pre2;
260 size_t minlen = (len1 < len2 ? len1 : len2);
261 for(size_t i = 0; i < minlen; i++) {
262 if(s1[i] < s2[i]) {
263 return true;
264 } else if(s1[i] > s2[i]) {
265 return false;
266 }
267 }
268 if(len1 == len2) return false;
269 return (len1 < len2) == endlt;
270 }
271
272 /**
273 * Return true iff s1 is less than or equal to s2.
274 */
275 template<typename T1, typename T2>
276 static inline bool sstr_leq(const T1& s1, const T2& s2, bool endlt = true) {
277 size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
278 size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
279 size_t minlen = (len1 < len2 ? len1 : len2);
280 for(size_t i = 0; i < minlen; i++) {
281 if(s1[i] < s2[i]) {
282 return true;
283 } else if(s1[i] > s2[i]) {
284 return false;
285 }
286 }
287 if(len1 == len2) return true;
288 return (len1 < len2) == endlt;
289 }
290
291 /**
292 * Return true iff the given suffix of s1 is less than or equal to the given
293 * suffix of s2.
294 */
295 template<typename T1, typename T2>
296 static inline bool sstr_suf_leq(
297 const T1& s1, size_t suf1,
298 const T2& s2, size_t suf2,
299 bool endlt = true)
300 {
301 assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
302 assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
303 size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
304 size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
305 size_t minlen = (len1 < len2 ? len1 : len2);
306 for(size_t i = 0; i < minlen; i++) {
307 if(s1[suf1+i] < s2[suf2+i]) {
308 return true;
309 } else if(s1[suf1+i] > s2[suf2+i]) {
310 return false;
311 }
312 }
313 if(len1 == len2) return true;
314 return (len1 < len2) == endlt;
315 }
316
317 /**
318 * Return true iff the given prefix of s1 is less than or equal to the given
319 * prefix of s2.
320 */
321 template<typename T1, typename T2>
322 static inline bool sstr_pre_leq(
323 const T1& s1, size_t pre1,
324 const T2& s2, size_t pre2,
325 bool endlt = true)
326 {
327 assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
328 assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
329 size_t len1 = pre1;
330 size_t len2 = pre2;
331 size_t minlen = (len1 < len2 ? len1 : len2);
332 for(size_t i = 0; i < minlen; i++) {
333 if(s1[i] < s2[i]) {
334 return true;
335 } else if(s1[i] > s2[i]) {
336 return false;
337 }
338 }
339 if(len1 == len2) return true;
340 return (len1 < len2) == endlt;
341 }
342
343 /**
344 * Return true iff s1 is greater than s2.
345 */
346 template<typename T1, typename T2>
347 static inline bool sstr_gt(const T1& s1, const T2& s2, bool endlt = true) {
348 size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
349 size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
350 size_t minlen = (len1 < len2 ? len1 : len2);
351 for(size_t i = 0; i < minlen; i++) {
352 if(s1[i] > s2[i]) {
353 return true;
354 } else if(s1[i] < s2[i]) {
355 return false;
356 }
357 }
358 if(len1 == len2) return false;
359 return (len1 > len2) == endlt;
360 }
361
362 /**
363 * Return true iff the given suffix of s1 is greater than the given suffix of
364 * s2.
365 */
366 template<typename T1, typename T2>
367 static inline bool sstr_suf_gt(
368 const T1& s1, size_t suf1,
369 const T2& s2, size_t suf2,
370 bool endlt = true)
371 {
372 assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
373 assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
374 size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
375 size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
376 size_t minlen = (len1 < len2 ? len1 : len2);
377 for(size_t i = 0; i < minlen; i++) {
378 if(s1[suf1+i] > s2[suf2+i]) {
379 return true;
380 } else if(s1[suf1+i] < s2[suf2+i]) {
381 return false;
382 }
383 }
384 if(len1 == len2) return false;
385 return (len1 > len2) == endlt;
386 }
387
388 /**
389 * Return true iff the given prefix of s1 is greater than the given prefix of
390 * s2.
391 */
392 template<typename T1, typename T2>
393 static inline bool sstr_pre_gt(
394 const T1& s1, size_t pre1,
395 const T2& s2, size_t pre2,
396 bool endlt = true)
397 {
398 assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
399 assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
400 size_t len1 = pre1;
401 size_t len2 = pre2;
402 size_t minlen = (len1 < len2 ? len1 : len2);
403 for(size_t i = 0; i < minlen; i++) {
404 if(s1[i] > s2[i]) {
405 return true;
406 } else if(s1[i] < s2[i]) {
407 return false;
408 }
409 }
410 if(len1 == len2) return false;
411 return (len1 > len2) == endlt;
412 }
413
414 /**
415 * Return true iff s1 is greater than or equal to s2.
416 */
417 template<typename T1, typename T2>
418 static inline bool sstr_geq(const T1& s1, const T2& s2, bool endlt = true) {
419 size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
420 size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
421 size_t minlen = (len1 < len2 ? len1 : len2);
422 for(size_t i = 0; i < minlen; i++) {
423 if(s1[i] > s2[i]) {
424 return true;
425 } else if(s1[i] < s2[i]) {
426 return false;
427 }
428 }
429 if(len1 == len2) return true;
430 return (len1 > len2) == endlt;
431 }
432
433 /**
434 * Return true iff the given suffix of s1 is greater than or equal to the given
435 * suffix of s2.
436 */
437 template<typename T1, typename T2>
438 static inline bool sstr_suf_geq(
439 const T1& s1, size_t suf1,
440 const T2& s2, size_t suf2,
441 bool endlt = true)
442 {
443 assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
444 assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
445 size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
446 size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
447 size_t minlen = (len1 < len2 ? len1 : len2);
448 for(size_t i = 0; i < minlen; i++) {
449 if(s1[suf1+i] > s2[suf2+i]) {
450 return true;
451 } else if(s1[suf1+i] < s2[suf2+i]) {
452 return false;
453 }
454 }
455 if(len1 == len2) return true;
456 return (len1 > len2) == endlt;
457 }
458
459 /**
460 * Return true iff the given prefix of s1 is greater than or equal to the given
461 * prefix of s2.
462 */
463 template<typename T1, typename T2>
464 static inline bool sstr_pre_geq(
465 const T1& s1, size_t pre1,
466 const T2& s2, size_t pre2,
467 bool endlt = true)
468 {
469 assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
470 assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
471 size_t len1 = pre1;
472 size_t len2 = pre2;
473 size_t minlen = (len1 < len2 ? len1 : len2);
474 for(size_t i = 0; i < minlen; i++) {
475 if(s1[i] > s2[i]) {
476 return true;
477 } else if(s1[i] < s2[i]) {
478 return false;
479 }
480 }
481 if(len1 == len2) return true;
482 return (len1 > len2) == endlt;
483 }
484
485 template<typename T>
sstr_to_cstr(const T & s)486 static inline const char * sstr_to_cstr(const T& s) {
487 return s.toZBuf();
488 }
489
490 template<>
491 inline const char * sstr_to_cstr<std::basic_string<char> >(
492 const std::basic_string<char>& s)
493 {
494 return s.c_str();
495 }
496
497 /**
498 * Simple string class with backing memory whose size is managed by the user
499 * using the constructor and install() member function. No behind-the-scenes
500 * reallocation or copying takes place.
501 */
502 template<typename T>
503 class SString {
504 public:
505
SString()506 explicit SString() :
507 cs_(NULL),
508 printcs_(NULL),
509 len_(0)
510 { }
511
SString(size_t sz)512 explicit SString(size_t sz) :
513 cs_(NULL),
514 printcs_(NULL),
515 len_(0)
516 {
517 resize(sz);
518 }
519
520 /**
521 * Create an SStringExpandable from another SStringExpandable.
522 */
SString(const SString<T> & o)523 SString(const SString<T>& o) :
524 cs_(NULL),
525 printcs_(NULL),
526 len_(0)
527 {
528 *this = o;
529 }
530
531 /**
532 * Create an SStringExpandable from a std::basic_string of the
533 * appropriate type.
534 */
SString(const std::basic_string<T> & str)535 explicit SString(const std::basic_string<T>& str) :
536 cs_(NULL),
537 printcs_(NULL),
538 len_(0)
539 {
540 install(str.c_str(), str.length());
541 }
542
543 /**
544 * Create an SStringExpandable from an array and size.
545 */
SString(const T * b,size_t sz)546 explicit SString(const T* b, size_t sz) :
547 cs_(NULL),
548 printcs_(NULL),
549 len_(0)
550 {
551 install(b, sz);
552 }
553
554 /**
555 * Create an SStringExpandable from a zero-terminated array.
556 */
SString(const T * b)557 explicit SString(const T* b) :
558 cs_(NULL),
559 printcs_(NULL),
560 len_(0)
561 {
562 install(b, strlen(b));
563 }
564
565 /**
566 * Destroy the expandable string object.
567 */
~SString()568 virtual ~SString() {
569 if(cs_ != NULL) {
570 delete[] cs_;
571 cs_ = NULL;
572 }
573 if(printcs_ != NULL) {
574 delete[] printcs_;
575 printcs_ = NULL;
576 }
577 len_ = 0;
578 }
579
580 /**
581 * Assignment to other SString.
582 */
583 SString<T>& operator=(const SString<T>& o) {
584 install(o.cs_, o.len_);
585 return *this;
586 }
587
588 /**
589 * Assignment to other SString.
590 */
591 SString<T>& operator=(const std::basic_string<T>& o) {
592 install(o);
593 return *this;
594 }
595
596 /**
597 * Resizes the string without preserving its contents.
598 */
resize(size_t sz)599 void resize(size_t sz) {
600 if(cs_ != NULL) {
601 delete cs_;
602 cs_ = NULL;
603 }
604 if(printcs_ != NULL) {
605 delete printcs_;
606 printcs_ = NULL;
607 }
608 if(sz != 0) {
609 cs_ = new T[sz+1];
610 }
611 len_ = sz;
612 }
613
614 /**
615 * Return ith character from the left of either the forward or the
616 * reverse version of the read.
617 */
618 T windowGet(
619 size_t i,
620 bool fw,
621 size_t depth = 0,
622 size_t len = 0) const
623 {
624 if(len == 0) len = len_;
625 assert_lt(i, len);
626 assert_leq(len, len_ - depth);
627 return fw ? cs_[depth+i] : cs_[depth+len-i-1];
628 }
629
630 /**
631 * Return ith character from the left of either the forward or the
632 * reverse-complement version of the read.
633 */
634 void windowGet(
635 T& ret,
636 bool fw,
637 size_t depth = 0,
638 size_t len = 0) const
639 {
640 if(len == 0) len = len_;
641 assert_leq(len, len_ - depth);
642 ret.resize(len);
643 for(size_t i = 0; i < len; i++) {
644 ret.set(fw ? cs_[depth+i] : cs_[depth+len-i-1], i);
645 }
646 }
647
648 /**
649 * Set character at index 'idx' to 'c'.
650 */
set(int c,size_t idx)651 inline void set(int c, size_t idx) {
652 assert_lt(idx, len_);
653 cs_[idx] = c;
654 }
655
656 /**
657 * Retrieve constant version of element i.
658 */
659 inline const T& operator[](size_t i) const {
660 assert_lt(i, len_);
661 return cs_[i];
662 }
663
664 /**
665 * Retrieve mutable version of element i.
666 */
667 inline T& operator[](size_t i) {
668 assert_lt(i, len_);
669 return cs_[i];
670 }
671
672 /**
673 * Retrieve constant version of element i.
674 */
get(size_t i)675 inline const T& get(size_t i) const {
676 assert_lt(i, len_);
677 return cs_[i];
678 }
679
680 /**
681 * Copy 'sz' bytes from buffer 'b' into this string. memcpy is used, not
682 * operator=.
683 */
install(const T * b,size_t sz)684 virtual void install(const T* b, size_t sz) {
685 if(sz == 0) return;
686 resize(sz);
687 memcpy(cs_, b, sz * sizeof(T));
688 }
689
690 /**
691 * Copy 'sz' bytes from buffer 'b' into this string. memcpy is used, not
692 * operator=.
693 */
install(const std::basic_string<T> & b)694 virtual void install(const std::basic_string<T>& b) {
695 size_t sz = b.length();
696 if(sz == 0) return;
697 resize(sz);
698 memcpy(cs_, b.c_str(), sz * sizeof(T));
699 }
700
701 /**
702 * Copy all bytes from zero-terminated buffer 'b' into this string.
703 */
install(const T * b)704 void install(const T* b) {
705 install(b, strlen(b));
706 }
707
708 /**
709 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
710 * in the process.
711 */
installReverse(const char * b,size_t sz)712 void installReverse(const char* b, size_t sz) {
713 if(sz == 0) return;
714 resize(sz);
715 for(size_t i = 0; i < sz; i++) {
716 cs_[i] = b[sz-i-1];
717 }
718 len_ = sz;
719 }
720
721 /**
722 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
723 * in the process.
724 */
installReverse(const SString<T> & b)725 void installReverse(const SString<T>& b) {
726 installReverse(b.cs_, b.len_);
727 }
728
729 /**
730 * Return true iff the two strings are equal.
731 */
732 bool operator==(const SString<T>& o) {
733 return sstr_eq(*this, o);
734 }
735
736 /**
737 * Return true iff the two strings are not equal.
738 */
739 bool operator!=(const SString<T>& o) {
740 return sstr_neq(*this, o);
741 }
742
743 /**
744 * Return true iff this string is less than given string.
745 */
746 bool operator<(const SString<T>& o) {
747 return sstr_lt(*this, o);
748 }
749
750 /**
751 * Return true iff this string is greater than given string.
752 */
753 bool operator>(const SString<T>& o) {
754 return sstr_gt(*this, o);
755 }
756
757 /**
758 * Return true iff this string is less than or equal to given string.
759 */
760 bool operator<=(const SString<T>& o) {
761 return sstr_leq(*this, o);
762 }
763
764 /**
765 * Return true iff this string is greater than or equal to given string.
766 */
767 bool operator>=(const SString<T>& o) {
768 return sstr_geq(*this, o);
769 }
770
771 /**
772 * Reverse the buffer in place.
773 */
reverse()774 void reverse() {
775 for(size_t i = 0; i < (len_ >> 1); i++) {
776 T tmp = get(i);
777 set(get(len_-i-1), i);
778 set(tmp, len_-i-1);
779 }
780 }
781
782 /**
783 * Reverse a substring of the buffer in place.
784 */
reverseWindow(size_t off,size_t len)785 void reverseWindow(size_t off, size_t len) {
786 assert_leq(off, len_);
787 assert_leq(off + len, len_);
788 size_t mid = len >> 1;
789 for(size_t i = 0; i < mid; i++) {
790 T tmp = get(off+i);
791 set(get(off+len-i-1), off+i);
792 set(tmp, off+len-i-1);
793 }
794 }
795
796 /**
797 * Set the first len elements of the buffer to el.
798 */
fill(size_t len,const T & el)799 void fill(size_t len, const T& el) {
800 assert_leq(len, len_);
801 for(size_t i = 0; i < len; i++) {
802 set(el, i);
803 }
804 }
805
806 /**
807 * Set all elements of the buffer to el.
808 */
fill(const T & el)809 void fill(const T& el) {
810 fill(len_, el);
811 }
812
813 /**
814 * Return the length of the string.
815 */
length()816 inline size_t length() const { return len_; }
817
818 /**
819 * Clear the buffer.
820 */
clear()821 void clear() { len_ = 0; }
822
823 /**
824 * Return true iff the buffer is empty.
825 */
empty()826 inline bool empty() const { return len_ == 0; }
827
828 /**
829 * Put a terminator in the 'len_'th element and then return a
830 * pointer to the buffer. Useful for printing.
831 */
toZBufXForm(const char * xform)832 const char* toZBufXForm(const char *xform) const {
833 ASSERT_ONLY(size_t xformElts = strlen(xform));
834 // Lazily allocate space for print buffer
835 if(printcs_ == NULL) {
836 const_cast<char*&>(printcs_) = new char[len_+1];
837 }
838 char* printcs = const_cast<char*>(printcs_);
839 assert(printcs != NULL);
840 for(size_t i = 0; i < len_; i++) {
841 assert_lt(cs_[i], (int)xformElts);
842 printcs[i] = xform[cs_[i]];
843 }
844 printcs[len_] = 0;
845 return printcs_;
846 }
847
848 /**
849 * Put a terminator in the 'len_'th element and then return a
850 * pointer to the buffer. Useful for printing.
851 */
toZBuf()852 virtual const T* toZBuf() const {
853 const_cast<T*>(cs_)[len_] = 0;
854 return cs_;
855 }
856
857 /**
858 * Return a const version of the raw buffer.
859 */
buf()860 const T* buf() const { return cs_; }
861
862 /**
863 * Return a writeable version of the raw buffer.
864 */
wbuf()865 T* wbuf() { return cs_; }
866
867 protected:
868
869 T *cs_; // +1 so that we have the option of dropping in a terminating "\0"
870 char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
871 size_t len_; // # elements
872 };
873
874 /**
875 * Simple string class with backing memory whose size is managed by the user
876 * using the constructor and install() member function. No behind-the-scenes
877 * reallocation or copying takes place.
878 */
879 class S2bDnaString {
880
881 public:
882
S2bDnaString()883 explicit S2bDnaString() :
884 cs_(NULL),
885 printcs_(NULL),
886 len_(0)
887 { }
888
S2bDnaString(size_t sz)889 explicit S2bDnaString(size_t sz) :
890 cs_(NULL),
891 printcs_(NULL),
892 len_(0)
893 {
894 resize(sz);
895 }
896
897 /**
898 * Copy another object of the same class.
899 */
S2bDnaString(const S2bDnaString & o)900 S2bDnaString(const S2bDnaString& o) :
901 cs_(NULL),
902 printcs_(NULL),
903 len_(0)
904 {
905 *this = o;
906 }
907
908 /**
909 * Create an SStringExpandable from a std::basic_string of the
910 * appropriate type.
911 */
912 explicit S2bDnaString(
913 const std::basic_string<char>& str,
914 bool chars = false,
915 bool colors = false) :
cs_(NULL)916 cs_(NULL),
917 printcs_(NULL),
918 len_(0)
919 {
920 if(chars) {
921 if(colors) {
922 installColors(str.c_str(), str.length());
923 } else {
924 installChars(str.c_str(), str.length());
925 }
926 } else {
927 install(str.c_str(), str.length());
928 }
929 }
930
931 /**
932 * Create an SStringExpandable from an array and size.
933 */
934 explicit S2bDnaString(
935 const char* b,
936 size_t sz,
937 bool chars = false,
938 bool colors = false) :
cs_(NULL)939 cs_(NULL),
940 printcs_(NULL),
941 len_(0)
942 {
943 if(chars) {
944 if(colors) {
945 installColors(b, sz);
946 } else {
947 installChars(b, sz);
948 }
949 } else {
950 install(b, sz);
951 }
952 }
953
954 /**
955 * Create an SStringFixed from a zero-terminated string.
956 */
957 explicit S2bDnaString(
958 const char* b,
959 bool chars = false,
960 bool colors = false) :
cs_(NULL)961 cs_(NULL),
962 printcs_(NULL),
963 len_(0)
964 {
965 if(chars) {
966 if(colors) {
967 installColors(b, strlen(b));
968 } else {
969 installChars(b, strlen(b));
970 }
971 } else {
972 install(b, strlen(b));
973 }
974 }
975
976 /**
977 * Destroy the expandable string object.
978 */
~S2bDnaString()979 virtual ~S2bDnaString() {
980 if(cs_ != NULL) {
981 delete[] cs_;
982 cs_ = NULL;
983 }
984 if(printcs_ != NULL) {
985 delete[] printcs_;
986 printcs_ = NULL;
987 }
988 len_ = 0;
989 }
990
991 /**
992 * Assignment to other SString.
993 */
994 template<typename T>
995 S2bDnaString& operator=(const T& o) {
996 install(o.c_str(), o.length());
997 return *this;
998 }
999
1000 /**
1001 * Assignment from a std::basic_string
1002 */
1003 template<typename T>
1004 S2bDnaString& operator=(const std::basic_string<char>& o) {
1005 install(o);
1006 return *this;
1007 }
1008
1009 /**
1010 * Resizes the string without preserving its contents.
1011 */
resize(size_t sz)1012 void resize(size_t sz) {
1013 if(cs_ != NULL) {
1014 delete cs_;
1015 cs_ = NULL;
1016 }
1017 if(printcs_ != NULL) {
1018 delete printcs_;
1019 printcs_ = NULL;
1020 }
1021 len_ = sz;
1022 if(sz != 0) {
1023 cs_ = new uint32_t[nwords()];
1024 }
1025 }
1026
1027 /**
1028 * Return DNA character corresponding to element 'idx'.
1029 */
toChar(size_t idx)1030 char toChar(size_t idx) const {
1031 int c = (int)get(idx);
1032 assert_range(0, 3, c);
1033 return "ACGT"[c];
1034 }
1035
1036 /**
1037 * Return color character corresponding to element 'idx'.
1038 */
toColor(size_t idx)1039 char toColor(size_t idx) const {
1040 int c = (int)get(idx);
1041 assert_range(0, 3, c);
1042 return "0123"[c];
1043 }
1044
1045 /**
1046 * Return ith character from the left of either the forward or the
1047 * reverse version of the read.
1048 */
1049 char windowGet(
1050 size_t i,
1051 bool fw,
1052 size_t depth = 0,
1053 size_t len = 0) const
1054 {
1055 if(len == 0) len = len_;
1056 assert_lt(i, len);
1057 assert_leq(len, len_ - depth);
1058 return fw ? get(depth+i) : get(depth+len-i-1);
1059 }
1060
1061 /**
1062 * Return ith character from the left of either the forward or the
1063 * reverse-complement version of the read.
1064 */
1065 template<typename T>
1066 void windowGet(
1067 T& ret,
1068 bool fw,
1069 size_t depth = 0,
1070 size_t len = 0) const
1071 {
1072 if(len == 0) len = len_;
1073 assert_leq(len, len_ - depth);
1074 ret.resize(len);
1075 for(size_t i = 0; i < len; i++) {
1076 ret.set((fw ? get(depth+i) : get(depth+len-i-1)), i);
1077 }
1078 }
1079
1080 /**
1081 * Return length in 32-bit words.
1082 */
nwords()1083 size_t nwords() const {
1084 return (len_ + 15) >> 4;
1085 }
1086
1087 /**
1088 * Set character at index 'idx' to 'c'.
1089 */
set(int c,size_t idx)1090 void set(int c, size_t idx) {
1091 assert_lt(idx, len_);
1092 assert_range(0, 3, c);
1093 size_t word = idx >> 4;
1094 size_t bpoff = (idx & 15) << 1;
1095 cs_[word] = cs_[word] & ~(uint32_t)(3 << bpoff);
1096 cs_[word] = cs_[word] | (uint32_t)(c << bpoff);
1097 }
1098
1099 /**
1100 * Set character at index 'idx' to DNA char 'c'.
1101 */
setChar(int c,size_t idx)1102 void setChar(int c, size_t idx) {
1103 assert_in(toupper(c), "ACGT");
1104 int bp = asc2dna[c];
1105 set(bp, idx);
1106 }
1107
1108 /**
1109 * Set character at index 'idx' to color char 'c'.
1110 */
setColor(int c,size_t idx)1111 void setColor(int c, size_t idx) {
1112 assert_in(toupper(c), "0123");
1113 int co = asc2col[c];
1114 set(co, idx);
1115 }
1116
1117 /**
1118 * Set the ith 32-bit word to given word.
1119 */
setWord(uint32_t w,size_t i)1120 void setWord(uint32_t w, size_t i) {
1121 assert_lt(i, nwords());
1122 cs_[i] = w;
1123 }
1124
1125 /**
1126 * Retrieve constant version of element i.
1127 */
1128 char operator[](size_t i) const {
1129 assert_lt(i, len_);
1130 return get(i);
1131 }
1132
1133 /**
1134 * Retrieve constant version of element i.
1135 */
get(size_t i)1136 char get(size_t i) const {
1137 assert_lt(i, len_);
1138 size_t word = i >> 4;
1139 size_t bpoff = (i & 15) << 1;
1140 return (char)((cs_[word] >> bpoff) & 3);
1141 }
1142
1143 /**
1144 * Copy packed words from string 'b' into this packed string.
1145 */
install(const uint32_t * b,size_t sz)1146 void install(const uint32_t* b, size_t sz) {
1147 if(sz == 0) return;
1148 resize(sz);
1149 memcpy(cs_, b, sizeof(uint32_t)*nwords());
1150 }
1151
1152 /**
1153 * Copy 'sz' DNA characters encoded as integers from buffer 'b' into this
1154 * packed string.
1155 */
install(const char * b,size_t sz)1156 void install(const char* b, size_t sz) {
1157 if(sz == 0) return;
1158 resize(sz);
1159 size_t wordi = 0;
1160 for(size_t i = 0; i < sz; i += 16) {
1161 uint32_t word = 0;
1162 for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
1163 uint32_t bp = (int)b[i+j];
1164 uint32_t shift = (uint32_t)j << 1;
1165 assert_range(0, 3, (int)bp);
1166 word |= (bp << shift);
1167 }
1168 cs_[wordi++] = word;
1169 }
1170 }
1171
1172 /**
1173 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1174 */
installChars(const char * b,size_t sz)1175 void installChars(const char* b, size_t sz) {
1176 if(sz == 0) return;
1177 resize(sz);
1178 size_t wordi = 0;
1179 for(size_t i = 0; i < sz; i += 16) {
1180 uint32_t word = 0;
1181 for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
1182 char c = b[i+j];
1183 assert_in(toupper(c), "ACGT");
1184 int bp = asc2dna[(int)c];
1185 assert_range(0, 3, (int)bp);
1186 uint32_t shift = (uint32_t)j << 1;
1187 word |= (bp << shift);
1188 }
1189 cs_[wordi++] = word;
1190 }
1191 }
1192
1193 /**
1194 * Copy 'sz' color characters from buffer 'b' into this packed string.
1195 */
installColors(const char * b,size_t sz)1196 void installColors(const char* b, size_t sz) {
1197 if(sz == 0) return;
1198 resize(sz);
1199 size_t wordi = 0;
1200 for(size_t i = 0; i < sz; i += 16) {
1201 uint32_t word = 0;
1202 for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
1203 char c = b[i+j];
1204 assert_in(c, "0123");
1205 int bp = asc2col[(int)c];
1206 assert_range(0, 3, (int)bp);
1207 uint32_t shift = (uint32_t)j << 1;
1208 word |= (bp << shift);
1209 }
1210 cs_[wordi++] = word;
1211 }
1212 }
1213
1214 /**
1215 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1216 */
install(const char * b)1217 void install(const char* b) {
1218 install(b, strlen(b));
1219 }
1220
1221 /**
1222 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1223 */
installChars(const char * b)1224 void installChars(const char* b) {
1225 installChars(b, strlen(b));
1226 }
1227
1228 /**
1229 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1230 */
installColors(const char * b)1231 void installColors(const char* b) {
1232 installColors(b, strlen(b));
1233 }
1234
1235 /**
1236 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1237 */
install(const std::basic_string<char> & b)1238 void install(const std::basic_string<char>& b) {
1239 install(b.c_str(), b.length());
1240 }
1241
1242 /**
1243 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1244 */
installChars(const std::basic_string<char> & b)1245 void installChars(const std::basic_string<char>& b) {
1246 installChars(b.c_str(), b.length());
1247 }
1248
1249 /**
1250 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1251 */
installColors(const std::basic_string<char> & b)1252 void installColors(const std::basic_string<char>& b) {
1253 installColors(b.c_str(), b.length());
1254 }
1255
1256 /**
1257 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
1258 * in the process.
1259 */
installReverse(const char * b,size_t sz)1260 void installReverse(const char* b, size_t sz) {
1261 resize(sz);
1262 if(sz == 0) return;
1263 size_t wordi = 0;
1264 size_t bpi = 0;
1265 cs_[0] = 0;
1266 for(size_t i =sz; i > 0; i--) {
1267 assert_range(0, 3, (int)b[i-1]);
1268 cs_[wordi] |= ((int)b[i-1] << (bpi<<1));
1269 if(bpi == 15) {
1270 wordi++;
1271 cs_[wordi] = 0;
1272 bpi = 0;
1273 } else bpi++;
1274 }
1275 }
1276
1277 /**
1278 * Copy all chars from buffer of DNA characters 'b' into this string,
1279 * reversing them in the process.
1280 */
installReverse(const char * b)1281 void installReverse(const char* b) {
1282 installReverse(b, strlen(b));
1283 }
1284
1285 /**
1286 * Copy 'sz' bytes from buffer of DNA characters 'b' into this string,
1287 * reversing them in the process.
1288 */
installReverseChars(const char * b,size_t sz)1289 void installReverseChars(const char* b, size_t sz) {
1290 resize(sz);
1291 if(sz == 0) return;
1292 size_t wordi = 0;
1293 size_t bpi = 0;
1294 cs_[0] = 0;
1295 for(size_t i =sz; i > 0; i--) {
1296 char c = b[i-1];
1297 assert_in(toupper(c), "ACGT");
1298 int bp = asc2dna[(int)c];
1299 assert_range(0, 3, bp);
1300 cs_[wordi] |= (bp << (bpi<<1));
1301 if(bpi == 15) {
1302 wordi++;
1303 cs_[wordi] = 0;
1304 bpi = 0;
1305 } else bpi++;
1306 }
1307 }
1308
1309 /**
1310 * Copy all chars from buffer of DNA characters 'b' into this string,
1311 * reversing them in the process.
1312 */
installReverseChars(const char * b)1313 void installReverseChars(const char* b) {
1314 installReverseChars(b, strlen(b));
1315 }
1316
1317 /**
1318 * Copy 'sz' bytes from buffer of color characters 'b' into this string,
1319 * reversing them in the process.
1320 */
installReverseColors(const char * b,size_t sz)1321 void installReverseColors(const char* b, size_t sz) {
1322 resize(sz);
1323 if(sz == 0) return;
1324 size_t wordi = 0;
1325 size_t bpi = 0;
1326 cs_[0] = 0;
1327 for(size_t i =sz; i > 0; i--) {
1328 char c = b[i-1];
1329 assert_in(c, "0123");
1330 int bp = asc2col[(int)c];
1331 assert_range(0, 3, bp);
1332 cs_[wordi] |= (bp << (bpi<<1));
1333 if(bpi == 15) {
1334 wordi++;
1335 cs_[wordi] = 0;
1336 bpi = 0;
1337 } else bpi++;
1338 }
1339 }
1340
1341 /**
1342 * Copy all chars from buffer of color characters 'b' into this string,
1343 * reversing them in the process.
1344 */
installReverseColors(const char * b)1345 void installReverseColors(const char* b) {
1346 installReverseColors(b, strlen(b));
1347 }
1348
1349 /**
1350 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
1351 * in the process.
1352 */
installReverse(const S2bDnaString & b)1353 void installReverse(const S2bDnaString& b) {
1354 resize(b.len_);
1355 if(b.len_ == 0) return;
1356 size_t wordi = 0;
1357 size_t bpi = 0;
1358 size_t wordb = b.nwords()-1;
1359 size_t bpb = (b.len_-1) & 15;
1360 cs_[0] = 0;
1361 for(size_t i = b.len_; i > 0; i--) {
1362 int bbp = (int)((b[wordb] >> (bpb << 1)) & 3);
1363 assert_range(0, 3, bbp);
1364 cs_[wordi] |= (bbp << (bpi << 1));
1365 if(bpi == 15) {
1366 wordi++;
1367 cs_[wordi] = 0;
1368 bpi = 0;
1369 } else bpi++;
1370 if(bpb == 0) {
1371 wordb--;
1372 bpi = 15;
1373 } else bpi--;
1374 }
1375 }
1376
1377 /**
1378 * Return true iff the two strings are equal.
1379 */
1380 bool operator==(const S2bDnaString& o) {
1381 return sstr_eq(*this, o);
1382 }
1383
1384 /**
1385 * Return true iff the two strings are not equal.
1386 */
1387 bool operator!=(const S2bDnaString& o) {
1388 return sstr_neq(*this, o);
1389 }
1390
1391 /**
1392 * Return true iff this string is less than given string.
1393 */
1394 bool operator<(const S2bDnaString& o) {
1395 return sstr_lt(*this, o);
1396 }
1397
1398 /**
1399 * Return true iff this string is greater than given string.
1400 */
1401 bool operator>(const S2bDnaString& o) {
1402 return sstr_gt(*this, o);
1403 }
1404
1405 /**
1406 * Return true iff this string is less than or equal to given string.
1407 */
1408 bool operator<=(const S2bDnaString& o) {
1409 return sstr_leq(*this, o);
1410 }
1411
1412 /**
1413 * Return true iff this string is greater than or equal to given string.
1414 */
1415 bool operator>=(const S2bDnaString& o) {
1416 return sstr_geq(*this, o);
1417 }
1418
1419 /**
1420 * Reverse the 2-bit encoded DNA string in-place.
1421 */
reverse()1422 void reverse() {
1423 if(len_ <= 1) return;
1424 size_t wordf = nwords()-1;
1425 size_t bpf = (len_-1) & 15;
1426 size_t wordi = 0;
1427 size_t bpi = 0;
1428 while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
1429 int f = (cs_[wordf] >> (bpf << 1)) & 3;
1430 int i = (cs_[wordi] >> (bpi << 1)) & 3;
1431 cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
1432 cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
1433 cs_[wordf] |= (uint32_t)(i << (bpf << 1));
1434 cs_[wordi] |= (uint32_t)(f << (bpi << 1));
1435 if(bpf == 0) {
1436 bpf = 15;
1437 wordf--;
1438 } else bpf--;
1439 if(bpi == 15) {
1440 bpi = 0;
1441 wordi++;
1442 } else bpi++;
1443 }
1444 }
1445
1446 /**
1447 * Reverse a substring of the buffer in place.
1448 */
reverseWindow(size_t off,size_t len)1449 void reverseWindow(size_t off, size_t len) {
1450 assert_leq(off, len_);
1451 assert_leq(off+len, len_);
1452 if(len <= 1) return;
1453 size_t wordf = (off+len-1) >> 4;
1454 size_t bpf = (off+len-1) & 15;
1455 size_t wordi = (off ) >> 4;
1456 size_t bpi = (off ) & 15;
1457 while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
1458 int f = (cs_[wordf] >> (bpf << 1)) & 3;
1459 int i = (cs_[wordi] >> (bpi << 1)) & 3;
1460 cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
1461 cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
1462 cs_[wordf] |= (uint32_t)(i << (bpf << 1));
1463 cs_[wordi] |= (uint32_t)(f << (bpi << 1));
1464 if(bpf == 0) {
1465 bpf = 15;
1466 wordf--;
1467 } else bpf--;
1468 if(bpi == 15) {
1469 bpi = 0;
1470 wordi++;
1471 } else bpi++;
1472 }
1473 }
1474
1475
1476 /**
1477 * Set the first len elements of the buffer to el.
1478 */
fill(size_t len,char el)1479 void fill(size_t len, char el) {
1480 assert_leq(len, len_);
1481 assert_range(0, 3, (int)el);
1482 size_t word = 0;
1483 if(len > 32) {
1484 // Copy el throughout block
1485 uint32_t bl = (uint32_t)el;
1486 bl |= (bl << 2);
1487 bl |= (bl << 4);
1488 bl |= (bl << 8);
1489 bl |= (bl << 16);
1490 // Fill with blocks
1491 size_t blen = len >> 4;
1492 for(; word < blen; word++) {
1493 cs_[word] = bl;
1494 }
1495 len = len & 15;
1496 }
1497 size_t bp = 0;
1498 for(size_t i = 0; i < len; i++) {
1499 cs_[word] &= ~(uint32_t)(3 << (bp << 1));
1500 cs_[word] |= (uint32_t)(el << (bp << 1));
1501 if(bp == 15) {
1502 word++;
1503 bp = 0;
1504 } else bp++;
1505 }
1506 }
1507
1508 /**
1509 * Set all elements of the buffer to el.
1510 */
fill(char el)1511 void fill(char el) {
1512 fill(len_, el);
1513 }
1514
1515 /**
1516 * Return the ith character in the window defined by fw, color, depth and
1517 * len.
1518 */
1519 char windowGetDna(
1520 size_t i,
1521 bool fw,
1522 size_t depth = 0,
1523 size_t len = 0) const
1524 {
1525 if(len == 0) len = len_;
1526 assert_lt(i, len);
1527 assert_leq(len, len_ - depth);
1528 if(fw) {
1529 return get(depth+i);
1530 } else {
1531 return compDna(get(depth+len-i-1));
1532 }
1533 }
1534
1535 /**
1536 * Fill the given DNA buffer with the substring specified by fw,
1537 * color, depth and len.
1538 */
1539 template<typename T>
1540 void windowGetDna(
1541 T& buf,
1542 bool fw,
1543 size_t depth = 0,
1544 size_t len = 0) const
1545 {
1546 if(len == 0) len = len_;
1547 assert_leq(len, len_ - depth);
1548 buf.resize(len);
1549 for(size_t i = 0; i < len; i++) {
1550 buf.set(
1551 (fw ?
1552 get(depth+i) :
1553 compDna(get(depth+len-i-1))), i);
1554 }
1555 }
1556
1557 /**
1558 * Return the length of the string.
1559 */
length()1560 inline size_t length() const { return len_; }
1561
1562 /**
1563 * Clear the buffer.
1564 */
clear()1565 void clear() { len_ = 0; }
1566
1567 /**
1568 * Return true iff the buffer is empty.
1569 */
empty()1570 inline bool empty() const { return len_ == 0; }
1571
1572 /**
1573 * Return a const version of the raw buffer.
1574 */
buf()1575 const uint32_t* buf() const { return cs_; }
1576
1577 /**
1578 * Return a writeable version of the raw buffer.
1579 */
wbuf()1580 uint32_t* wbuf() { return cs_; }
1581
1582 /**
1583 * Note: the size of the string once it's stored in the print buffer is 4
1584 * times as large as the string as stored in compact 2-bit-per-char words.
1585 */
toZBuf()1586 const char* toZBuf() const {
1587 if(printcs_ == NULL) {
1588 const_cast<char*&>(printcs_) = new char[len_+1];
1589 }
1590 char *printcs = const_cast<char*>(printcs_);
1591 size_t word = 0, bp = 0;
1592 for(size_t i = 0; i < len_; i++) {
1593 int c = (cs_[word] >> (bp << 1)) & 3;
1594 printcs[i] = "ACGT"[c];
1595 if(bp == 15) {
1596 word++;
1597 bp = 0;
1598 } else bp++;
1599 }
1600 printcs[len_] = '\0';
1601 return printcs_;
1602 }
1603
1604 protected:
1605
1606 uint32_t *cs_; // 2-bit packed words
1607 char *printcs_;
1608 size_t len_; // # elements
1609 };
1610
1611 /**
1612 * Simple string class with backing memory that automatically expands as needed.
1613 */
1614 template<typename T, int S = 1024, int M = 2, int I = 0>
1615 class SStringExpandable {
1616
1617 public:
1618
SStringExpandable()1619 explicit SStringExpandable() :
1620 cs_(NULL),
1621 printcs_(NULL),
1622 len_(0),
1623 sz_(0)
1624 {
1625 if(I > 0) {
1626 expandNoCopy(I);
1627 }
1628 }
1629
SStringExpandable(size_t sz)1630 explicit SStringExpandable(size_t sz) :
1631 cs_(NULL),
1632 printcs_(NULL),
1633 len_(0),
1634 sz_(0)
1635 {
1636 expandNoCopy(sz);
1637 }
1638
1639 /**
1640 * Create an SStringExpandable from another SStringExpandable.
1641 */
SStringExpandable(const SStringExpandable<T,S> & o)1642 SStringExpandable(const SStringExpandable<T, S>& o) :
1643 cs_(NULL),
1644 printcs_(NULL),
1645 len_(0),
1646 sz_(0)
1647 {
1648 *this = o;
1649 }
1650
1651 /**
1652 * Create an SStringExpandable from a std::basic_string of the
1653 * appropriate type.
1654 */
SStringExpandable(const std::basic_string<T> & str)1655 explicit SStringExpandable(const std::basic_string<T>& str) :
1656 cs_(NULL),
1657 printcs_(NULL),
1658 len_(0),
1659 sz_(0)
1660 {
1661 install(str.c_str(), str.length());
1662 }
1663
1664 /**
1665 * Create an SStringExpandable from an array and size.
1666 */
SStringExpandable(const T * b,size_t sz)1667 explicit SStringExpandable(const T* b, size_t sz) :
1668 cs_(NULL),
1669 printcs_(NULL),
1670 len_(0),
1671 sz_(0)
1672 {
1673 install(b, sz);
1674 }
1675
1676 /**
1677 * Create an SStringExpandable from a zero-terminated array.
1678 */
SStringExpandable(const T * b)1679 explicit SStringExpandable(const T* b) :
1680 cs_(NULL),
1681 printcs_(NULL),
1682 len_(0),
1683 sz_(0)
1684 {
1685 install(b, strlen(b));
1686 }
1687
1688 /**
1689 * Destroy the expandable string object.
1690 */
~SStringExpandable()1691 virtual ~SStringExpandable() {
1692 if(cs_ != NULL) {
1693 delete[] cs_;
1694 cs_ = NULL;
1695 }
1696 if(printcs_ != NULL) {
1697 delete[] printcs_;
1698 printcs_ = NULL;
1699 }
1700 sz_ = len_ = 0;
1701 }
1702
1703 /**
1704 * Return ith character from the left of either the forward or the
1705 * reverse-complement version of the read.
1706 */
1707 T windowGet(
1708 size_t i,
1709 bool fw,
1710 size_t depth = 0,
1711 size_t len = 0) const
1712 {
1713 if(len == 0) len = len_;
1714 assert_lt(i, len);
1715 assert_leq(len, len_ - depth);
1716 return fw ? cs_[depth+i] : cs_[depth+len-i-1];
1717 }
1718
1719 /**
1720 * Return ith character from the left of either the forward or the
1721 * reverse-complement version of the read.
1722 */
1723 void windowGet(
1724 T& ret,
1725 bool fw,
1726 size_t depth = 0,
1727 size_t len = 0) const
1728 {
1729 if(len == 0) len = len_;
1730 assert_leq(len, len_ - depth);
1731 for(size_t i = 0; i < len; i++) {
1732 ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
1733 }
1734 }
1735
1736 /**
1737 * Assignment to other SStringExpandable.
1738 */
1739 SStringExpandable<T,S,M,I>& operator=(const SStringExpandable<T,S,M,I>& o) {
1740 install(o.cs_, o.len_);
1741 return *this;
1742 }
1743
1744 /**
1745 * Assignment from a std::basic_string
1746 */
1747 SStringExpandable<T,S>& operator=(const std::basic_string<T>& o) {
1748 install(o.c_str(), o.length());
1749 return *this;
1750 }
1751
1752 /**
1753 * Insert char c before position 'idx'; slide subsequent chars down.
1754 */
insert(const T & c,size_t idx)1755 void insert(const T& c, size_t idx) {
1756 assert_lt(idx, len_);
1757 if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
1758 len_++;
1759 // Move everyone down by 1
1760 // len_ is the *new* length
1761 for(size_t i = len_; i > idx+1; i--) {
1762 cs_[i-1] = cs_[i-2];
1763 }
1764 cs_[idx] = c;
1765 }
1766
1767 /**
1768 * Set character at index 'idx' to 'c'.
1769 */
set(int c,size_t idx)1770 void set(int c, size_t idx) {
1771 assert_lt(idx, len_);
1772 cs_[idx] = c;
1773 }
1774
1775 /**
1776 * Append char c.
1777 */
append(const T & c)1778 void append(const T& c) {
1779 if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
1780 cs_[len_++] = c;
1781 }
1782
1783 /**
1784 * Delete char at position 'idx'; slide subsequent chars up.
1785 */
remove(size_t idx)1786 void remove(size_t idx) {
1787 assert_lt(idx, len_);
1788 assert_gt(len_, 0);
1789 for(size_t i = idx; i < len_-1; i++) {
1790 cs_[i] = cs_[i+1];
1791 }
1792 len_--;
1793 }
1794
1795 /**
1796 * Retrieve constant version of element i.
1797 */
1798 const T& operator[](size_t i) const {
1799 assert_lt(i, len_);
1800 return cs_[i];
1801 }
1802
1803 /**
1804 * Retrieve mutable version of element i.
1805 */
1806 T& operator[](size_t i) {
1807 assert_lt(i, len_);
1808 return cs_[i];
1809 }
1810
1811 /**
1812 * Retrieve constant version of element i.
1813 */
get(size_t i)1814 const T& get(size_t i) const {
1815 assert_lt(i, len_);
1816 return cs_[i];
1817 }
1818
1819 /**
1820 * Copy 'sz' bytes from buffer 'b' into this string.
1821 */
install(const T * b,size_t sz)1822 virtual void install(const T* b, size_t sz) {
1823 if(sz_ < sz) expandNoCopy((sz + S) * M);
1824 memcpy(cs_, b, sz * sizeof(T));
1825 len_ = sz;
1826 }
1827
1828
1829 /**
1830 * Copy all bytes from zero-terminated buffer 'b' into this string.
1831 */
install(const T * b)1832 void install(const T* b) { install(b, strlen(b)); }
1833
1834 /**
1835 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
1836 * in the process.
1837 */
installReverse(const char * b,size_t sz)1838 void installReverse(const char* b, size_t sz) {
1839 if(sz_ < sz) expandNoCopy((sz + S) * M);
1840 for(size_t i = 0; i < sz; i++) {
1841 cs_[i] = b[sz-i-1];
1842 }
1843 len_ = sz;
1844 }
1845
1846 /**
1847 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
1848 * in the process.
1849 */
installReverse(const SStringExpandable<T,S> & b)1850 void installReverse(const SStringExpandable<T, S>& b) {
1851 if(sz_ < b.len_) expandNoCopy((b.len_ + S) * M);
1852 for(size_t i = 0; i < b.len_; i++) {
1853 cs_[i] = b.cs_[b.len_ - i - 1];
1854 }
1855 len_ = b.len_;
1856 }
1857
1858 /**
1859 * Return true iff the two strings are equal.
1860 */
1861 bool operator==(const SStringExpandable<T, S>& o) {
1862 return sstr_eq(*this, o);
1863 }
1864
1865 /**
1866 * Return true iff the two strings are not equal.
1867 */
1868 bool operator!=(const SStringExpandable<T, S>& o) {
1869 return sstr_neq(*this, o);
1870 }
1871
1872 /**
1873 * Return true iff this string is less than given string.
1874 */
1875 bool operator<(const SStringExpandable<T, S>& o) {
1876 return sstr_lt(*this, o);
1877 }
1878
1879 /**
1880 * Return true iff this string is greater than given string.
1881 */
1882 bool operator>(const SStringExpandable<T, S>& o) {
1883 return sstr_gt(*this, o);
1884 }
1885
1886 /**
1887 * Return true iff this string is less than or equal to given string.
1888 */
1889 bool operator<=(const SStringExpandable<T, S>& o) {
1890 return sstr_leq(*this, o);
1891 }
1892
1893 /**
1894 * Return true iff this string is greater than or equal to given string.
1895 */
1896 bool operator>=(const SStringExpandable<T, S>& o) {
1897 return sstr_geq(*this, o);
1898 }
1899
1900 /**
1901 * Reverse the buffer in place.
1902 */
reverse()1903 void reverse() {
1904 for(size_t i = 0; i < (len_ >> 1); i++) {
1905 T tmp = get(i);
1906 set(get(len_-i-1), i);
1907 set(tmp, len_-i-1);
1908 }
1909 }
1910
1911 /**
1912 * Reverse a substring of the buffer in place.
1913 */
reverseWindow(size_t off,size_t len)1914 void reverseWindow(size_t off, size_t len) {
1915 assert_leq(off, len_);
1916 assert_leq(off + len, len_);
1917 size_t mid = len >> 1;
1918 for(size_t i = 0; i < mid; i++) {
1919 T tmp = get(off+i);
1920 set(get(off+len-i-1), off+i);
1921 set(tmp, off+len-i-1);
1922 }
1923 }
1924
1925 /**
1926 * Simply resize the buffer. If the buffer is resized to be
1927 * longer, the newly-added elements will contain garbage and should
1928 * be initialized immediately.
1929 */
resize(size_t len)1930 void resize(size_t len) {
1931 if(sz_ < len) expandCopy((len + S) * M);
1932 len_ = len;
1933 }
1934
1935 /**
1936 * Simply resize the buffer. If the buffer is resized to be
1937 * longer, new elements will be initialized with 'el'.
1938 */
resize(size_t len,const T & el)1939 void resize(size_t len, const T& el) {
1940 if(sz_ < len) expandCopy((len + S) * M);
1941 if(len > len_) {
1942 for(size_t i = len_; i < len; i++) {
1943 cs_[i] = el;
1944 }
1945 }
1946 len_ = len;
1947 }
1948
1949 /**
1950 * Set the first len elements of the buffer to el.
1951 */
fill(size_t len,const T & el)1952 void fill(size_t len, const T& el) {
1953 assert_leq(len, len_);
1954 for(size_t i = 0; i < len; i++) {
1955 cs_[i] = el;
1956 }
1957 }
1958
1959 /**
1960 * Set all elements of the buffer to el.
1961 */
fill(const T & el)1962 void fill(const T& el) {
1963 fill(len_, el);
1964 }
1965
1966 /**
1967 * Trim len characters from the beginning of the string.
1968 */
trimBegin(size_t len)1969 void trimBegin(size_t len) {
1970 assert_leq(len, len_);
1971 if(len == len_) {
1972 len_ = 0; return;
1973 }
1974 for(size_t i = 0; i < len_-len; i++) {
1975 cs_[i] = cs_[i+len];
1976 }
1977 len_ -= len;
1978 }
1979
1980 /**
1981 * Trim len characters from the end of the string.
1982 */
trimEnd(size_t len)1983 size_t trimEnd(size_t len) {
1984 if(len >= len_) {
1985 size_t ret = len_;
1986 len_ = 0;
1987 return ret;
1988 }
1989 len_ -= len;
1990 return len;
1991 }
1992
1993 /**
1994 * Copy 'sz' bytes from buffer 'b' into this string.
1995 */
append(const T * b,size_t sz)1996 void append(const T* b, size_t sz) {
1997 if(sz_ < len_ + sz) expandCopy((len_ + sz + S) * M);
1998 memcpy(cs_ + len_, b, sz * sizeof(T));
1999 len_ += sz;
2000 }
2001
2002 /**
2003 * Copy bytes from zero-terminated buffer 'b' into this string.
2004 */
append(const T * b)2005 void append(const T* b) {
2006 append(b, strlen(b));
2007 }
2008
2009 /**
2010 * Return the length of the string.
2011 */
length()2012 size_t length() const { return len_; }
2013
2014 /**
2015 * Clear the buffer.
2016 */
clear()2017 void clear() { len_ = 0; }
2018
2019 /**
2020 * Return true iff the buffer is empty.
2021 */
empty()2022 bool empty() const { return len_ == 0; }
2023
2024 /**
2025 * Put a terminator in the 'len_'th element and then return a
2026 * pointer to the buffer. Useful for printing.
2027 */
toZBufXForm(const char * xform)2028 const char* toZBufXForm(const char *xform) const {
2029 ASSERT_ONLY(size_t xformElts = strlen(xform));
2030 if(empty()) {
2031 const_cast<char&>(zero_) = 0;
2032 return &zero_;
2033 }
2034 char* printcs = const_cast<char*>(printcs_);
2035 // Lazily allocate space for print buffer
2036 for(size_t i = 0; i < len_; i++) {
2037 assert_lt(cs_[i], (int)xformElts);
2038 printcs[i] = xform[(int)cs_[i]];
2039 }
2040 printcs[len_] = 0;
2041 return printcs_;
2042 }
2043
2044 /**
2045 * Put a terminator in the 'len_'th element and then return a
2046 * pointer to the buffer. Useful for printing.
2047 */
toZBuf()2048 virtual const T* toZBuf() const {
2049 if(empty()) {
2050 const_cast<T&>(zeroT_) = 0;
2051 return &zeroT_;
2052 }
2053 assert_leq(len_, sz_);
2054 const_cast<T*>(cs_)[len_] = 0;
2055 return cs_;
2056 }
2057
2058 /**
2059 * Return true iff this DNA string matches the given nucleotide
2060 * character string.
2061 */
eq(const char * str)2062 bool eq(const char *str) const {
2063 const char *self = toZBuf();
2064 return strcmp(str, self) == 0;
2065 }
2066
2067 /**
2068 * Return a const version of the raw buffer.
2069 */
buf()2070 const T* buf() const { return cs_; }
2071
2072 /**
2073 * Return a writeable version of the raw buffer.
2074 */
wbuf()2075 T* wbuf() { return cs_; }
2076
2077 protected:
2078 /**
2079 * Allocate new, bigger buffer and copy old contents into it. If
2080 * requested size can be accommodated by current buffer, do nothing.
2081 */
expandCopy(size_t sz)2082 void expandCopy(size_t sz) {
2083 if(sz_ >= sz) return; // done!
2084 T *tmp = new T[sz + 1];
2085 char *ptmp = new char[sz + 1];
2086 if(cs_ != NULL) {
2087 memcpy(tmp, cs_, sizeof(T)*len_);
2088 delete[] cs_;
2089 }
2090 if(printcs_ != NULL) {
2091 memcpy(ptmp, printcs_, sizeof(char)*len_);
2092 delete[] printcs_;
2093 }
2094 cs_ = tmp;
2095 printcs_ = ptmp;
2096 sz_ = sz;
2097 }
2098
2099 /**
2100 * Allocate new, bigger buffer. If requested size can be
2101 * accommodated by current buffer, do nothing.
2102 */
expandNoCopy(size_t sz)2103 void expandNoCopy(size_t sz) {
2104 if(sz_ >= sz) return; // done!
2105 if(cs_ != NULL) delete[] cs_;
2106 if(printcs_ != NULL) delete[] printcs_;
2107 cs_ = new T[sz + 1];
2108 printcs_ = new char[sz + 1];
2109 sz_ = sz;
2110 }
2111
2112 T *cs_; // +1 so that we have the option of dropping in a terminating "\0"
2113 char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
2114 char zero_; // 0 terminator for empty string
2115 T zeroT_; // 0 terminator for empty string
2116 size_t len_; // # filled-in elements
2117 size_t sz_; // size capacity of cs_
2118 };
2119
2120 /**
2121 * Simple string class with in-object storage.
2122 *
2123 * All copies induced by, e.g., operator=, the copy constructor,
2124 * install() and append(), are shallow (using memcpy/sizeof). If deep
2125 * copies are needed, use a different class.
2126 *
2127 * Reading from an uninitialized element results in an assert as long
2128 * as NDEBUG is not defined. If NDEBUG is defined, the result is
2129 * undefined.
2130 */
2131 template<typename T, int S>
2132 class SStringFixed {
2133 public:
SStringFixed()2134 explicit SStringFixed() : len_(0) { }
2135
2136 /**
2137 * Create an SStringFixed from another SStringFixed.
2138 */
SStringFixed(const SStringFixed<T,S> & o)2139 SStringFixed(const SStringFixed<T, S>& o) {
2140 *this = o;
2141 }
2142
2143 /**
2144 * Create an SStringFixed from another SStringFixed.
2145 */
SStringFixed(const std::basic_string<T> & str)2146 explicit SStringFixed(const std::basic_string<T>& str) {
2147 install(str.c_str(), str.length());
2148 }
2149
2150 /**
2151 * Create an SStringFixed from an array and size.
2152 */
SStringFixed(const T * b,size_t sz)2153 explicit SStringFixed(const T* b, size_t sz) {
2154 install(b, sz);
2155 }
2156
2157 /**
2158 * Create an SStringFixed from a zero-terminated string.
2159 */
SStringFixed(const T * b)2160 explicit SStringFixed(const T* b) {
2161 install(b, strlen(b));
2162 }
2163
~SStringFixed()2164 virtual ~SStringFixed() { } // C++ needs this
2165
2166 /**
2167 * Retrieve constant version of element i.
2168 */
2169 inline const T& operator[](size_t i) const {
2170 return get(i);
2171 }
2172
2173 /**
2174 * Retrieve mutable version of element i.
2175 */
2176 inline T& operator[](size_t i) {
2177 return get(i);
2178 }
2179
2180 /**
2181 * Retrieve constant version of element i.
2182 */
get(size_t i)2183 inline const T& get(size_t i) const {
2184 assert_lt(i, len_);
2185 return cs_[i];
2186 }
2187
2188 /**
2189 * Retrieve mutable version of element i.
2190 */
get(size_t i)2191 inline T& get(size_t i) {
2192 assert_lt(i, len_);
2193 return cs_[i];
2194 }
2195
2196 /**
2197 * Return ith character from the left of either the forward or the
2198 * reverse-complement version of the read.
2199 */
2200 T windowGet(
2201 size_t i,
2202 bool fw,
2203 size_t depth = 0,
2204 size_t len = 0) const
2205 {
2206 if(len == 0) len = len_;
2207 assert_lt(i, len);
2208 assert_leq(len, len_ - depth);
2209 return fw ? cs_[depth+i] : cs_[depth+len-i-1];
2210 }
2211
2212 /**
2213 * Return ith character from the left of either the forward or the
2214 * reverse-complement version of the read.
2215 */
2216 void windowGet(
2217 T& ret,
2218 bool fw,
2219 size_t depth = 0,
2220 size_t len = 0) const
2221 {
2222 if(len == 0) len = len_;
2223 assert_leq(len, len_ - depth);
2224 for(size_t i = 0; i < len; i++) {
2225 ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
2226 }
2227 }
2228
2229 /**
2230 * Assignment to other SStringFixed.
2231 */
2232 SStringFixed<T,S>& operator=(const SStringFixed<T,S>& o) {
2233 install(o.cs_, o.len_);
2234 return *this;
2235 }
2236
2237 /**
2238 * Assignment from a std::basic_string
2239 */
2240 SStringFixed<T,S>& operator=(const std::basic_string<T>& o) {
2241 install(o);
2242 return *this;
2243 }
2244
2245 /**
2246 * Insert char c before position 'idx'; slide subsequent chars down.
2247 */
insert(const T & c,size_t idx)2248 void insert(const T& c, size_t idx) {
2249 assert_lt(len_, S);
2250 assert_lt(idx, len_);
2251 // Move everyone down by 1
2252 for(int i = len_; i > idx; i--) {
2253 cs_[i] = cs_[i-1];
2254 }
2255 cs_[idx] = c;
2256 len_++;
2257 }
2258
2259 /**
2260 * Set character at index 'idx' to 'c'.
2261 */
set(int c,size_t idx)2262 void set(int c, size_t idx) {
2263 assert_lt(idx, len_);
2264 cs_[idx] = c;
2265 }
2266
2267 /**
2268 * Append char c.
2269 */
append(const T & c)2270 void append(const T& c) {
2271 assert_lt(len_, S);
2272 cs_[len_++] = c;
2273 }
2274
2275 /**
2276 * Delete char at position 'idx'; slide subsequent chars up.
2277 */
remove(size_t idx)2278 void remove(size_t idx) {
2279 assert_lt(idx, len_);
2280 assert_gt(len_, 0);
2281 for(size_t i = idx; i < len_-1; i++) {
2282 cs_[i] = cs_[i+1];
2283 }
2284 len_--;
2285 }
2286
2287 /**
2288 * Copy 'sz' bytes from buffer 'b' into this string.
2289 */
install(const T * b,size_t sz)2290 virtual void install(const T* b, size_t sz) {
2291 assert_leq(sz, S);
2292 memcpy(cs_, b, sz * sizeof(T));
2293 len_ = sz;
2294 }
2295
2296 /**
2297 * Copy all bytes from zero-terminated buffer 'b' into this string.
2298 */
install(const T * b)2299 void install(const T* b) { install(b, strlen(b)); }
2300
2301 /**
2302 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
2303 * in the process.
2304 */
installReverse(const char * b,size_t sz)2305 void installReverse(const char* b, size_t sz) {
2306 assert_leq(sz, S);
2307 for(size_t i = 0; i < sz; i++) {
2308 cs_[i] = b[sz-i-1];
2309 }
2310 len_ = sz;
2311 }
2312
2313 /**
2314 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
2315 * in the process.
2316 */
installReverse(const SStringFixed<T,S> & b)2317 void installReverse(const SStringFixed<T, S>& b) {
2318 assert_leq(b.len_, S);
2319 for(size_t i = 0; i < b.len_; i++) {
2320 cs_[i] = b.cs_[b.len_ - i - 1];
2321 }
2322 len_ = b.len_;
2323 }
2324
2325 /**
2326 * Return true iff the two strings are equal.
2327 */
2328 bool operator==(const SStringFixed<T, S>& o) {
2329 return sstr_eq(*this, o);
2330 }
2331
2332 /**
2333 * Return true iff the two strings are not equal.
2334 */
2335 bool operator!=(const SStringFixed<T, S>& o) {
2336 return sstr_neq(*this, o);
2337 }
2338
2339 /**
2340 * Return true iff this string is less than given string.
2341 */
2342 bool operator<(const SStringFixed<T, S>& o) {
2343 return sstr_lt(*this, o);
2344 }
2345
2346 /**
2347 * Return true iff this string is greater than given string.
2348 */
2349 bool operator>(const SStringFixed<T, S>& o) {
2350 return sstr_gt(*this, o);
2351 }
2352
2353 /**
2354 * Return true iff this string is less than or equal to given string.
2355 */
2356 bool operator<=(const SStringFixed<T, S>& o) {
2357 return sstr_leq(*this, o);
2358 }
2359
2360 /**
2361 * Return true iff this string is greater than or equal to given string.
2362 */
2363 bool operator>=(const SStringFixed<T, S>& o) {
2364 return sstr_geq(*this, o);
2365 }
2366
2367 /**
2368 * Reverse the buffer in place.
2369 */
reverse()2370 void reverse() {
2371 for(size_t i = 0; i < (len_ >> 1); i++) {
2372 T tmp = get(i);
2373 set(get(len_-i-1), i);
2374 set(tmp, len_-i-1);
2375 }
2376 }
2377
2378 /**
2379 * Reverse a substring of the buffer in place.
2380 */
reverseWindow(size_t off,size_t len)2381 void reverseWindow(size_t off, size_t len) {
2382 assert_leq(off, len_);
2383 assert_leq(off + len, len_);
2384 size_t mid = len >> 1;
2385 for(size_t i = 0; i < mid; i++) {
2386 T tmp = get(off+i);
2387 set(get(off+len-i-1), off+i);
2388 set(tmp, off+len-i-1);
2389 }
2390 }
2391
2392 /**
2393 * Simply resize the buffer. If the buffer is resized to be
2394 * longer, the newly-added elements will contain garbage and should
2395 * be initialized immediately.
2396 */
resize(size_t len)2397 void resize(size_t len) {
2398 assert_lt(len, S);
2399 len_ = len;
2400 }
2401
2402 /**
2403 * Simply resize the buffer. If the buffer is resized to be
2404 * longer, new elements will be initialized with 'el'.
2405 */
resize(size_t len,const T & el)2406 void resize(size_t len, const T& el) {
2407 assert_lt(len, S);
2408 if(len > len_) {
2409 for(size_t i = len_; i < len; i++) {
2410 cs_[i] = el;
2411 }
2412 }
2413 len_ = len;
2414 }
2415
2416 /**
2417 * Set the first len elements of the buffer to el.
2418 */
fill(size_t len,const T & el)2419 void fill(size_t len, const T& el) {
2420 assert_leq(len, len_);
2421 for(size_t i = 0; i < len; i++) {
2422 cs_[i] = el;
2423 }
2424 }
2425
2426 /**
2427 * Set all elements of the buffer to el.
2428 */
fill(const T & el)2429 void fill(const T& el) {
2430 fill(len_, el);
2431 }
2432
2433 /**
2434 * Trim len characters from the beginning of the string.
2435 */
trimBegin(size_t len)2436 void trimBegin(size_t len) {
2437 assert_leq(len, len_);
2438 if(len == len_) {
2439 len_ = 0; return;
2440 }
2441 for(size_t i = 0; i < len_-len; i++) {
2442 cs_[i] = cs_[i+len];
2443 }
2444 len_ -= len;
2445 }
2446
2447 /**
2448 * Trim len characters from the end of the string.
2449 */
trimEnd(size_t len)2450 void trimEnd(size_t len) {
2451 if(len >= len_) len_ = 0;
2452 else len_ -= len;
2453 }
2454
2455 /**
2456 * Copy 'sz' bytes from buffer 'b' into this string.
2457 */
append(const T * b,size_t sz)2458 void append(const T* b, size_t sz) {
2459 assert_leq(sz + len_, S);
2460 memcpy(cs_ + len_, b, sz * sizeof(T));
2461 len_ += sz;
2462 }
2463
2464 /**
2465 * Copy bytes from zero-terminated buffer 'b' into this string.
2466 */
append(const T * b)2467 void append(const T* b) {
2468 append(b, strlen(b));
2469 }
2470
2471 /**
2472 * Return the length of the string.
2473 */
length()2474 size_t length() const { return len_; }
2475
2476 /**
2477 * Clear the buffer.
2478 */
clear()2479 void clear() { len_ = 0; }
2480
2481 /**
2482 * Return true iff the buffer is empty.
2483 */
empty()2484 bool empty() const { return len_ == 0; }
2485
2486 /**
2487 * Put a terminator in the 'len_'th element and then return a
2488 * pointer to the buffer. Useful for printing.
2489 */
toZBuf()2490 virtual const T* toZBuf() const {
2491 const_cast<T*>(cs_)[len_] = 0;
2492 return cs_;
2493 }
2494
2495 /**
2496 * Return true iff this DNA string matches the given nucleotide
2497 * character string.
2498 */
eq(const char * str)2499 bool eq(const char *str) const {
2500 const char *self = toZBuf();
2501 return strcmp(str, self) == 0;
2502 }
2503
2504 /**
2505 * Put a terminator in the 'len_'th element and then return a
2506 * pointer to the buffer. Useful for printing.
2507 */
toZBufXForm(const char * xform)2508 const char* toZBufXForm(const char *xform) const {
2509 ASSERT_ONLY(size_t xformElts = strlen(xform));
2510 char* printcs = const_cast<char*>(printcs_);
2511 for(size_t i = 0; i < len_; i++) {
2512 assert_lt(cs_[i], (int)xformElts);
2513 printcs[i] = xform[cs_[i]];
2514 }
2515 printcs[len_] = 0;
2516 return printcs_;
2517 }
2518
2519 /**
2520 * Return a const version of the raw buffer.
2521 */
buf()2522 const T* buf() const { return cs_; }
2523
2524 /**
2525 * Return a writeable version of the raw buffer.
2526 */
wbuf()2527 T* wbuf() { return cs_; }
2528
2529 protected:
2530 T cs_[S+1]; // +1 so that we have the option of dropping in a terminating "\0"
2531 char printcs_[S+1]; // +1 so that we have the option of dropping in a terminating "\0"
2532 size_t len_;
2533 };
2534
2535 //
2536 // Stream put operators
2537 //
2538
2539 template <typename T, int S, int M>
2540 std::ostream& operator<< (std::ostream& os, const SStringExpandable<T, S, M>& str) {
2541 os << str.toZBuf();
2542 return os;
2543 }
2544
2545 template <typename T, int S>
2546 std::ostream& operator<< (std::ostream& os, const SStringFixed<T, S>& str) {
2547 os << str.toZBuf();
2548 return os;
2549 }
2550
2551 extern uint8_t asc2dna[];
2552 extern uint8_t asc2col[];
2553
2554 /**
2555 * Encapsulates a fixed-length DNA string with characters encoded as
2556 * chars. Only capable of encoding A, C, G, T and N. The length is
2557 * specified via the template parameter S.
2558 */
2559 template<int S>
2560 class SDnaStringFixed : public SStringFixed<char, S> {
2561 public:
2562
SDnaStringFixed()2563 explicit SDnaStringFixed() : SStringFixed<char, S>() { }
2564
2565 /**
2566 * Create an SStringFixed from another SStringFixed.
2567 */
SDnaStringFixed(const SDnaStringFixed<S> & o)2568 SDnaStringFixed(const SDnaStringFixed<S>& o) :
2569 SStringFixed<char, S>(o) { }
2570
2571 /**
2572 * Create an SStringFixed from a C++ basic_string.
2573 */
SDnaStringFixed(const std::basic_string<char> & str)2574 explicit SDnaStringFixed(const std::basic_string<char>& str) :
2575 SStringFixed<char, S>(str) { }
2576
2577 /**
2578 * Create an SStringFixed from an array and size.
2579 */
SDnaStringFixed(const char * b,size_t sz)2580 explicit SDnaStringFixed(const char* b, size_t sz) :
2581 SStringFixed<char, S>(b, sz) { }
2582
2583 /**
2584 * Create an SStringFixed from a zero-terminated string.
2585 */
2586 explicit SDnaStringFixed(
2587 const char* b,
2588 bool chars = false,
2589 bool colors = false) :
2590 SStringFixed<char, S>()
2591 {
2592 if(chars) {
2593 if(colors) {
2594 installColors(b, strlen(b));
2595 } else {
2596 installChars(b, strlen(b));
2597 }
2598 } else {
2599 install(b, strlen(b));
2600 }
2601 }
2602
~SDnaStringFixed()2603 virtual ~SDnaStringFixed() { } // C++ needs this
2604
2605 /**
2606 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
2607 * complementing them in the process, assuming an encoding where
2608 * 0=A, 1=C, 2=G, 3=T, 4=N.
2609 */
installReverseComp(const char * b,size_t sz)2610 void installReverseComp(const char* b, size_t sz) {
2611 assert_leq(sz, S);
2612 for(size_t i = 0; i < sz; i++) {
2613 this->cs_[i] = (b[sz-i-1] == 4 ? 4 : b[sz-i-1] ^ 3);
2614 }
2615 this->len_ = sz;
2616 }
2617
2618 /**
2619 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
2620 * complementing them in the process, assuming an encoding where
2621 * 0=A, 1=C, 2=G, 3=T, 4=N.
2622 */
installReverseComp(const SDnaStringFixed<S> & b)2623 void installReverseComp(const SDnaStringFixed<S>& b) {
2624 assert_leq(b.len_, S);
2625 for(size_t i = 0; i < b.len_; i++) {
2626 this->cs_[i] = (b.cs_[b.len_-i-1] == 4 ? 4 : b.cs_[b.len_-i-1] ^ 3);
2627 }
2628 this->len_ = b.len_;
2629 }
2630
2631 /**
2632 * Either reverse or reverse-complement (depending on "color") this
2633 * DNA buffer in-place.
2634 */
reverseComp()2635 void reverseComp() {
2636 for(size_t i = 0; i < (this->len_ >> 1); i++) {
2637 char tmp1 = (this->cs_[i] == 4 ? 4 : this->cs_[i] ^ 3);
2638 char tmp2 = (this->cs_[this->len_-i-1] == 4 ? 4 : this->cs_[this->len_-i-1] ^ 3);
2639 this->cs_[i] = tmp2;
2640 this->cs_[this->len_-i-1] = tmp1;
2641 }
2642 // Do middle element iff there are an odd number
2643 if((this->len_ & 1) != 0) {
2644 char tmp = this->cs_[this->len_ >> 1];
2645 tmp = (tmp == 4 ? 4 : tmp ^ 3);
2646 this->cs_[this->len_ >> 1] = tmp;
2647 }
2648 }
2649
2650 /**
2651 * Copy 'sz' bytes from buffer 'b' into this string.
2652 */
install(const char * b,size_t sz)2653 virtual void install(const char* b, size_t sz) {
2654 assert_leq(sz, S);
2655 memcpy(this->cs_, b, sz);
2656 #ifndef NDEBUG
2657 for(size_t i = 0; i < sz; i++) {
2658 assert_leq(this->cs_[i], 4);
2659 assert_geq(this->cs_[i], 0);
2660 }
2661 #endif
2662 this->len_ = sz;
2663 }
2664
2665 /**
2666 * Copy buffer 'b' of ASCII DNA characters into normal DNA
2667 * characters.
2668 */
installChars(const char * b,size_t sz)2669 virtual void installChars(const char* b, size_t sz) {
2670 assert_leq(sz, S);
2671 for(size_t i = 0; i < sz; i++) {
2672 assert_in(toupper(b[i]), "ACGTN-");
2673 this->cs_[i] = asc2dna[(int)b[i]];
2674 assert_geq(this->cs_[i], 0);
2675 assert_leq(this->cs_[i], 4);
2676 }
2677 this->len_ = sz;
2678 }
2679
2680 /**
2681 * Copy buffer 'b' of ASCII color characters into normal DNA
2682 * characters.
2683 */
installColors(const char * b,size_t sz)2684 virtual void installColors(const char* b, size_t sz) {
2685 assert_leq(sz, S);
2686 for(size_t i = 0; i < sz; i++) {
2687 assert_in(b[i], "0123.");
2688 this->cs_[i] = asc2col[(int)b[i]];
2689 assert_geq(this->cs_[i], 0);
2690 assert_leq(this->cs_[i], 4);
2691 }
2692 this->len_ = sz;
2693 }
2694
2695 /**
2696 * Copy C++ string of ASCII DNA characters into normal DNA
2697 * characters.
2698 */
installChars(const std::basic_string<char> & str)2699 virtual void installChars(const std::basic_string<char>& str) {
2700 installChars(str.c_str(), str.length());
2701 }
2702
2703 /**
2704 * Copy C++ string of ASCII color characters into normal DNA
2705 * characters.
2706 */
installColors(const std::basic_string<char> & str)2707 virtual void installColors(const std::basic_string<char>& str) {
2708 installColors(str.c_str(), str.length());
2709 }
2710
2711 /**
2712 * Set DNA character at index 'idx' to 'c'.
2713 */
set(int c,size_t idx)2714 void set(int c, size_t idx) {
2715 assert_lt(idx, this->len_);
2716 assert_leq(c, 4);
2717 assert_geq(c, 0);
2718 this->cs_[idx] = c;
2719 }
2720
2721 /**
2722 * Append DNA char c.
2723 */
append(const char & c)2724 void append(const char& c) {
2725 assert_lt(this->len_, S);
2726 assert_leq(c, 4);
2727 assert_geq(c, 0);
2728 this->cs_[this->len_++] = c;
2729 }
2730
2731 /**
2732 * Set DNA character at index 'idx' to 'c'.
2733 */
setChar(char c,size_t idx)2734 void setChar(char c, size_t idx) {
2735 assert_lt(idx, this->len_);
2736 assert_in(toupper(c), "ACGTN");
2737 this->cs_[idx] = asc2dna[(int)c];
2738 }
2739
2740 /**
2741 * Append DNA character.
2742 */
appendChar(char c)2743 void appendChar(char c) {
2744 assert_lt(this->len_, S);
2745 assert_in(toupper(c), "ACGTN");
2746 this->cs_[this->len_++] = asc2dna[(int)c];
2747 }
2748
2749 /**
2750 * Return DNA character corresponding to element 'idx'.
2751 */
toChar(size_t idx)2752 char toChar(size_t idx) const {
2753 assert_geq((int)this->cs_[idx], 0);
2754 assert_leq((int)this->cs_[idx], 4);
2755 return "ACGTN"[(int)this->cs_[idx]];
2756 }
2757
2758 /**
2759 * Retrieve constant version of element i.
2760 */
2761 const char& operator[](size_t i) const {
2762 return this->get(i);
2763 }
2764
2765 /**
2766 * Retrieve constant version of element i.
2767 */
get(size_t i)2768 const char& get(size_t i) const {
2769 assert_lt(i, this->len_);
2770 assert_leq(this->cs_[i], 4);
2771 assert_geq(this->cs_[i], 0);
2772 return this->cs_[i];
2773 }
2774
2775 /**
2776 * Return the ith character in the window defined by fw, color,
2777 * depth and len.
2778 */
2779 char windowGetDna(
2780 size_t i,
2781 bool fw,
2782 size_t depth = 0,
2783 size_t len = 0) const
2784 {
2785 if(len == 0) len = this->len_;
2786 assert_lt(i, len);
2787 assert_leq(len, this->len_ - depth);
2788 if(fw) return this->cs_[depth+i];
2789 else return compDna(this->cs_[depth+len-i-1]);
2790 }
2791
2792 /**
2793 * Fill the given DNA buffer with the substring specified by fw,
2794 * color, depth and len.
2795 */
2796 void windowGetDna(
2797 SDnaStringFixed<S>& buf,
2798 bool fw,
2799 size_t depth = 0,
2800 size_t len = 0) const
2801 {
2802 if(len == 0) len = this->len_;
2803 assert_leq(len, this->len_ - depth);
2804 for(size_t i = 0; i < len; i++) {
2805 buf.append(fw ? this->cs_[depth+i] :
2806 compDna(this->cs_[depth+len-i-1]));
2807 }
2808 }
2809
2810 /**
2811 * Put a terminator in the 'len_'th element and then return a
2812 * pointer to the buffer. Useful for printing.
2813 */
toZBuf()2814 virtual const char* toZBuf() const { return this->toZBufXForm("ACGTN"); }
2815 };
2816
2817 /**
2818 * Encapsulates a fixed-length DNA string with characters encoded as
2819 * chars. Only capable of encoding A, C, G, T and N. The length is
2820 * specified via the template parameter S.
2821 */
2822
2823 template<int S = 1024, int M = 2>
2824 class SDnaStringExpandable : public SStringExpandable<char, S, M> {
2825 public:
2826
SDnaStringExpandable()2827 explicit SDnaStringExpandable() : SStringExpandable<char, S, M>() { }
2828
2829 /**
2830 * Create an SStringFixed from another SStringFixed.
2831 */
SDnaStringExpandable(const SDnaStringExpandable<S,M> & o)2832 SDnaStringExpandable(const SDnaStringExpandable<S, M>& o) :
2833 SStringExpandable<char, S, M>(o) { }
2834
2835 /**
2836 * Create an SStringFixed from a C++ basic_string.
2837 */
2838 explicit SDnaStringExpandable(
2839 const std::basic_string<char>& str,
2840 bool chars = false,
2841 bool colors = false) :
2842 SStringExpandable<char, S, M>()
2843 {
2844 if(chars) {
2845 if(colors) {
2846 installColors(str);
2847 } else {
2848 installChars(str);
2849 }
2850 } else {
2851 install(str);
2852 }
2853 }
2854
2855 /**
2856 * Create an SStringFixed from an array and size.
2857 */
2858 explicit SDnaStringExpandable(
2859 const char* b,
2860 size_t sz,
2861 bool chars = false,
2862 bool colors = false) :
2863 SStringExpandable<char, S, M>()
2864 {
2865 if(chars) {
2866 if(colors) {
2867 installColors(b, sz);
2868 } else {
2869 installChars(b, sz);
2870 }
2871 } else {
2872 install(b, sz);
2873 }
2874 }
2875
2876 /**
2877 * Create an SStringFixed from a zero-terminated string.
2878 */
2879 explicit SDnaStringExpandable(
2880 const char* b,
2881 bool chars = false,
2882 bool colors = false) :
2883 SStringExpandable<char, S, M>()
2884 {
2885 install(b, chars, colors);
2886 }
2887
~SDnaStringExpandable()2888 virtual ~SDnaStringExpandable() { } // C++ needs this
2889
2890 /**
2891 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
2892 * complementing them in the process, assuming an encoding where
2893 * 0=A, 1=C, 2=G, 3=T, 4=N.
2894 */
installReverseComp(const char * b,size_t sz)2895 void installReverseComp(const char* b, size_t sz) {
2896 if(this->sz_ < sz) this->expandCopy((sz + S) * M);
2897 for(size_t i = 0; i < sz; i++) {
2898 this->cs_[i] = (b[sz-i-1] == 4 ? 4 : b[sz-i-1] ^ 3);
2899 }
2900 this->len_ = sz;
2901 }
2902
2903 /**
2904 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
2905 * complementing them in the process, assuming an encoding where
2906 * 0=A, 1=C, 2=G, 3=T, 4=N.
2907 */
installReverseComp(const SDnaStringExpandable<S,M> & b)2908 void installReverseComp(const SDnaStringExpandable<S, M>& b) {
2909 if(this->sz_ < b.len_) this->expandCopy((b.len_ + S) * M);
2910 for(size_t i = 0; i < b.len_; i++) {
2911 this->cs_[i] = (b.cs_[b.len_-i-1] == 4 ? 4 : b.cs_[b.len_-i-1] ^ 3);
2912 }
2913 this->len_ = b.len_;
2914 }
2915
2916 /**
2917 * Either reverse or reverse-complement (depending on "color") this
2918 * DNA buffer in-place.
2919 */
reverseComp()2920 void reverseComp() {
2921 for(size_t i = 0; i < (this->len_ >> 1); i++) {
2922 char tmp1 = (this->cs_[i] == 4 ? 4 : this->cs_[i] ^ 3);
2923 char tmp2 = (this->cs_[this->len_-i-1] == 4 ? 4 : this->cs_[this->len_-i-1] ^ 3);
2924 this->cs_[i] = tmp2;
2925 this->cs_[this->len_-i-1] = tmp1;
2926 }
2927 // Do middle element iff there are an odd number
2928 if((this->len_ & 1) != 0) {
2929 char tmp = this->cs_[this->len_ >> 1];
2930 tmp = (tmp == 4 ? 4 : tmp ^ 3);
2931 this->cs_[this->len_ >> 1] = tmp;
2932 }
2933 }
2934
2935 /**
2936 * Copy 'sz' bytes from buffer 'b' into this string.
2937 */
2938 virtual void install(
2939 const char* b,
2940 bool chars = false,
2941 bool colors = false)
2942 {
2943 if(chars) {
2944 if(colors) {
2945 installColors(b, strlen(b));
2946 } else {
2947 installChars(b, strlen(b));
2948 }
2949 } else {
2950 install(b, strlen(b));
2951 }
2952 }
2953
2954 /**
2955 * Copy 'sz' bytes from buffer 'b' into this string.
2956 */
install(const char * b,size_t sz)2957 virtual void install(const char* b, size_t sz) {
2958 if(this->sz_ < sz) this->expandCopy((sz + S) * M);
2959 memcpy(this->cs_, b, sz);
2960 #ifndef NDEBUG
2961 for(size_t i = 0; i < sz; i++) {
2962 assert_range(0, 4, (int)this->cs_[i]);
2963 }
2964 #endif
2965 this->len_ = sz;
2966 }
2967
2968 /**
2969 * Copy buffer 'b' of ASCII DNA characters into normal DNA
2970 * characters.
2971 */
installChars(const char * b,size_t sz)2972 virtual void installChars(const char* b, size_t sz) {
2973 if(this->sz_ < sz) this->expandCopy((sz + S) * M);
2974 for(size_t i = 0; i < sz; i++) {
2975 assert_in(toupper(b[i]), "ACGTN-");
2976 this->cs_[i] = asc2dna[(int)b[i]];
2977 assert_range(0, 4, (int)this->cs_[i]);
2978 }
2979 this->len_ = sz;
2980 }
2981
2982 /**
2983 * Copy buffer 'b' of ASCII color characters into normal DNA
2984 * characters.
2985 */
installColors(const char * b,size_t sz)2986 virtual void installColors(const char* b, size_t sz) {
2987 if(this->sz_ < sz) this->expandCopy((sz + S) * M);
2988 for(size_t i = 0; i < sz; i++) {
2989 assert_in(b[i], "0123.");
2990 this->cs_[i] = asc2col[(int)b[i]];
2991 assert_range(0, 4, (int)this->cs_[i]);
2992 }
2993 this->len_ = sz;
2994 }
2995
2996 /**
2997 * Copy C++ string of ASCII DNA characters into normal DNA
2998 * characters.
2999 */
installChars(const std::basic_string<char> & str)3000 virtual void installChars(const std::basic_string<char>& str) {
3001 installChars(str.c_str(), str.length());
3002 }
3003
3004 /**
3005 * Copy C++ string of ASCII color characters into normal DNA
3006 * characters.
3007 */
installColors(const std::basic_string<char> & str)3008 virtual void installColors(const std::basic_string<char>& str) {
3009 installColors(str.c_str(), str.length());
3010 }
3011
3012 /**
3013 * Set DNA character at index 'idx' to 'c'.
3014 */
set(int c,size_t idx)3015 void set(int c, size_t idx) {
3016 assert_lt(idx, this->len_);
3017 assert_range(0, 4, c);
3018 this->cs_[idx] = c;
3019 }
3020
3021 /**
3022 * Append DNA char c.
3023 */
append(const char & c)3024 void append(const char& c) {
3025 if(this->sz_ < this->len_ + 1) {
3026 this->expandCopy((this->len_ + 1 + S) * M);
3027 }
3028 assert_range(0, 4, (int)c);
3029 this->cs_[this->len_++] = c;
3030 }
3031
3032 /**
3033 * Set DNA character at index 'idx' to 'c'.
3034 */
setChar(char c,size_t idx)3035 void setChar(char c, size_t idx) {
3036 assert_lt(idx, this->len_);
3037 assert_in(toupper(c), "ACGTN");
3038 this->cs_[idx] = asc2dna[(int)c];
3039 }
3040
3041 /**
3042 * Append DNA character.
3043 */
appendChar(char c)3044 void appendChar(char c) {
3045 if(this->sz_ < this->len_ + 1) {
3046 this->expandCopy((this->len_ + 1 + S) * M);
3047 }
3048 assert_in(toupper(c), "ACGTN");
3049 this->cs_[this->len_++] = asc2dna[(int)c];
3050 }
3051
3052 /**
3053 * Return DNA character corresponding to element 'idx'.
3054 */
toChar(size_t idx)3055 char toChar(size_t idx) const {
3056 assert_range(0, 4, (int)this->cs_[idx]);
3057 return "ACGTN"[(int)this->cs_[idx]];
3058 }
3059
3060 /**
3061 * Retrieve constant version of element i.
3062 */
3063 inline const char& operator[](size_t i) const {
3064 return this->get(i);
3065 }
3066
3067 /**
3068 * Retrieve constant version of element i.
3069 */
get(size_t i)3070 inline const char& get(size_t i) const {
3071 assert_lt(i, this->len_);
3072 assert_range(0, 4, (int)this->cs_[i]);
3073 return this->cs_[i];
3074 }
3075
3076 /**
3077 * Return the ith character in the window defined by fw, color,
3078 * depth and len.
3079 */
3080 char windowGetDna(
3081 size_t i,
3082 bool fw,
3083 size_t depth = 0,
3084 size_t len = 0) const
3085 {
3086 if(len == 0) len = this->len_;
3087 assert_lt(i, len);
3088 assert_leq(len, this->len_ - depth);
3089 if(fw) return this->cs_[depth+i];
3090 else return compDna(this->cs_[depth+len-i-1]);
3091 }
3092
3093 /**
3094 * Fill the given DNA buffer with the substring specified by fw,
3095 * color, depth and len.
3096 */
3097 void windowGetDna(
3098 SDnaStringExpandable<S, M>& buf,
3099 bool fw,
3100 size_t depth = 0,
3101 size_t len = 0) const
3102 {
3103 if(len == 0) len = this->len_;
3104 assert_leq(len, this->len_ - depth);
3105 for(size_t i = 0; i < len; i++) {
3106 buf.append(fw ? this->cs_[depth+i] :
3107 compDna(this->cs_[depth+len-i-1]));
3108 }
3109 }
3110
3111 /**
3112 * Put a terminator in the 'len_'th element and then return a
3113 * pointer to the buffer. Useful for printing.
3114 */
toZBuf()3115 virtual const char* toZBuf() const { return this->toZBufXForm("ACGTN"); }
3116 };
3117
3118 /**
3119 * Encapsulates an expandable DNA string with characters encoded as
3120 * char-sized masks. Encodes A, C, G, T, and all IUPAC, as well as the
3121 * empty mask indicating "matches nothing."
3122 */
3123 template<int S = 16, int M = 2>
3124 class SDnaMaskString : public SStringExpandable<char, S, M> {
3125 public:
3126
SDnaMaskString()3127 explicit SDnaMaskString() : SStringExpandable<char, S, M>() { }
3128
3129 /**
3130 * Create an SStringFixed from another SStringFixed.
3131 */
SDnaMaskString(const SDnaMaskString<S,M> & o)3132 SDnaMaskString(const SDnaMaskString<S, M>& o) :
3133 SStringExpandable<char, S, M>(o) { }
3134
3135 /**
3136 * Create an SStringFixed from a C++ basic_string.
3137 */
SDnaMaskString(const std::basic_string<char> & str)3138 explicit SDnaMaskString(const std::basic_string<char>& str) :
3139 SStringExpandable<char, S, M>(str) { }
3140
3141 /**
3142 * Create an SStringFixed from an array and size.
3143 */
SDnaMaskString(const char * b,size_t sz)3144 explicit SDnaMaskString(const char* b, size_t sz) :
3145 SStringExpandable<char, S, M>(b, sz) { }
3146
3147 /**
3148 * Create an SStringFixed from a zero-terminated string.
3149 */
3150 explicit SDnaMaskString(const char* b, bool chars = false) :
3151 SStringExpandable<char, S, M>()
3152 {
3153 if(chars) {
3154 installChars(b, strlen(b));
3155 } else {
3156 install(b, strlen(b));
3157 }
3158 }
3159
~SDnaMaskString()3160 virtual ~SDnaMaskString() { } // C++ needs this
3161
3162 /**
3163 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
3164 * complementing them in the process, assuming an encoding where
3165 * 0=A, 1=C, 2=G, 3=T, 4=N.
3166 */
installReverseComp(const char * b,size_t sz)3167 void installReverseComp(const char* b, size_t sz) {
3168 while(this->sz_ < sz) {
3169 this->expandNoCopy((sz + S) * M);
3170 }
3171 for(size_t i = 0; i < sz; i++) {
3172 this->cs_[i] = maskcomp[(int)b[sz-i-1]];
3173 }
3174 this->len_ = sz;
3175 }
3176
3177 /**
3178 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
3179 * complementing them in the process, assuming an encoding where
3180 * 0=A, 1=C, 2=G, 3=T, 4=N.
3181 */
installReverseComp(const SDnaMaskString<S,M> & b)3182 void installReverseComp(const SDnaMaskString<S, M>& b) {
3183 while(this->sz_ < b.len_) {
3184 this->expandNoCopy((b.len_ + S) * M);
3185 }
3186 for(size_t i = 0; i < b.len_; i++) {
3187 this->cs_[i] = maskcomp[(int)b.cs_[b.len_-i-1]];
3188 }
3189 this->len_ = b.len_;
3190 }
3191
3192 /**
3193 * Either reverse or reverse-complement (depending on "color") this
3194 * DNA buffer in-place.
3195 */
reverseComp()3196 void reverseComp() {
3197 for(size_t i = 0; i < (this->len_ >> 1); i++) {
3198 char tmp1 = maskcomp[(int)this->cs_[i]];
3199 char tmp2 = maskcomp[(int)this->cs_[this->len_-i-1]];
3200 this->cs_[i] = tmp2;
3201 this->cs_[this->len_-i-1] = tmp1;
3202 }
3203 // Do middle element iff there are an odd number
3204 if((this->len_ & 1) != 0) {
3205 char tmp = this->cs_[this->len_ >> 1];
3206 tmp = maskcomp[(int)tmp];
3207 this->cs_[this->len_ >> 1] = tmp;
3208 }
3209 }
3210
3211 /**
3212 * Copy 'sz' bytes from buffer 'b' into this string.
3213 */
install(const char * b,size_t sz)3214 virtual void install(const char* b, size_t sz) {
3215 while(this->sz_ < sz) {
3216 this->expandNoCopy((sz + S) * M);
3217 }
3218 memcpy(this->cs_, b, sz);
3219 #ifndef NDEBUG
3220 for(size_t i = 0; i < sz; i++) {
3221 assert_range((int)this->cs_[i], 0, 15);
3222 }
3223 #endif
3224 this->len_ = sz;
3225 }
3226
3227 /**
3228 * Copy buffer 'b' of ASCII DNA characters into DNA masks.
3229 */
installChars(const char * b,size_t sz)3230 virtual void installChars(const char* b, size_t sz) {
3231 while(this->sz_ < sz) {
3232 this->expandNoCopy((sz + S) * M);
3233 }
3234 for(size_t i = 0; i < sz; i++) {
3235 assert_in(b[i], iupacs);
3236 this->cs_[i] = asc2dnamask[(int)b[i]];
3237 assert_range((int)this->cs_[i], 0, 15);
3238 }
3239 this->len_ = sz;
3240 }
3241
3242 /**
3243 * Copy C++ string of ASCII DNA characters into normal DNA
3244 * characters.
3245 */
installChars(const std::basic_string<char> & str)3246 virtual void installChars(const std::basic_string<char>& str) {
3247 installChars(str.c_str(), str.length());
3248 }
3249
3250 /**
3251 * Set DNA character at index 'idx' to 'c'.
3252 */
set(int c,size_t idx)3253 void set(int c, size_t idx) {
3254 assert_lt(idx, this->len_);
3255 assert_range(c, 0, 15);
3256 this->cs_[idx] = c;
3257 }
3258
3259 /**
3260 * Append DNA char c.
3261 */
append(const char & c)3262 void append(const char& c) {
3263 while(this->sz_ < this->len_+1) {
3264 this->expandNoCopy((this->len_ + 1 + S) * M);
3265 }
3266 assert_range((int)c, 0, 15);
3267 this->cs_[this->len_++] = c;
3268 }
3269
3270 /**
3271 * Set DNA character at index 'idx' to 'c'.
3272 */
setChar(char c,size_t idx)3273 void setChar(char c, size_t idx) {
3274 assert_lt(idx, this->len_);
3275 assert_in(toupper(c), iupacs);
3276 this->cs_[idx] = asc2dnamask[(int)c];
3277 }
3278
3279 /**
3280 * Append DNA character.
3281 */
appendChar(char c)3282 void appendChar(char c) {
3283 while(this->sz_ < this->len_+1) {
3284 expandNoCopy((this->len_ + 1 + S) * M);
3285 }
3286 assert_in(toupper(c), iupacs);
3287 this->cs_[this->len_++] = asc2dnamask[(int)c];
3288 }
3289
3290 /**
3291 * Return DNA character corresponding to element 'idx'.
3292 */
toChar(size_t idx)3293 char toChar(size_t idx) const {
3294 assert_range((int)this->cs_[idx], 0, 15);
3295 return mask2iupac[(int)this->cs_[idx]];
3296 }
3297
3298 /**
3299 * Retrieve constant version of element i.
3300 */
3301 const char& operator[](size_t i) const {
3302 return this->get(i);
3303 }
3304
3305 /**
3306 * Retrieve mutable version of element i.
3307 */
3308 char& operator[](size_t i) {
3309 return this->get(i);
3310 }
3311
3312 /**
3313 * Retrieve constant version of element i.
3314 */
get(size_t i)3315 const char& get(size_t i) const {
3316 assert_lt(i, this->len_);
3317 assert_range((int)this->cs_[i], 0, 15);
3318 return this->cs_[i];
3319 }
3320
3321 /**
3322 * Retrieve mutable version of element i.
3323 */
get(size_t i)3324 char& get(size_t i) {
3325 assert_lt(i, this->len_);
3326 assert_range((int)this->cs_[i], 0, 15);
3327 return this->cs_[i];
3328 }
3329
3330 /**
3331 * Return the ith character in the window defined by fw, color,
3332 * depth and len.
3333 */
3334 char windowGetDna(
3335 size_t i,
3336 bool fw,
3337 size_t depth = 0,
3338 size_t len = 0) const
3339 {
3340 if(len == 0) len = this->len_;
3341 assert_lt(i, len);
3342 assert_leq(len, this->len_ - depth);
3343 if(fw) return this->cs_[depth+i];
3344 else return maskcomp[this->cs_[depth+len-i-1]];
3345 }
3346
3347 /**
3348 * Fill the given DNA buffer with the substring specified by fw,
3349 * color, depth and len.
3350 */
3351 void windowGetDna(
3352 SDnaStringFixed<S>& buf,
3353 bool fw,
3354 size_t depth = 0,
3355 size_t len = 0) const
3356 {
3357 if(len == 0) len = this->len_;
3358 assert_leq(len, this->len_ - depth);
3359 for(size_t i = 0; i < len; i++) {
3360 buf.append(fw ? this->cs_[depth+i] :
3361 maskcomp[this->cs_[depth+len-i-1]]);
3362 }
3363 }
3364
3365 /**
3366 * Sample a random substring of the given length from this DNA
3367 * string and install the result in 'dst'.
3368 */
3369 template<typename T>
3370 void randSubstr(
3371 RandomSource& rnd, // pseudo-random generator
3372 T& dst, // put sampled substring here
3373 size_t len, // length of substring to extract
3374 bool watson = true, // true -> possibly extract from Watson strand
3375 bool crick = true) // true -> possibly extract from Crick strand
3376 {
3377 assert(watson || crick);
3378 assert_geq(this->len_, len);
3379 size_t poss = this->len_ - len + 1;
3380 assert_gt(poss, 0);
3381 uint32_t rndoff = (uint32_t)(rnd.nextU32() % poss);
3382 bool fw;
3383 if (watson && !crick) fw = true;
3384 else if(!watson && crick) fw = false;
3385 else {
3386 fw = rnd.nextBool();
3387 }
3388 if(fw) {
3389 // Install Watson substring
3390 for(size_t i = 0; i < len; i++) {
3391 dst[i] = this->cs_[i + rndoff];
3392 }
3393 } else {
3394 // Install Crick substring
3395 for(size_t i = 0; i < len; i++) {
3396 dst[i] = maskcomp[(int)this->cs_[i + rndoff + (len - i - 1)]];
3397 }
3398 }
3399 }
3400
3401 /**
3402 * Put a terminator in the 'len_'th element and then return a
3403 * pointer to the buffer. Useful for printing.
3404 */
toZBuf()3405 virtual const char* toZBuf() const { return this->toZBufXForm(iupacs); }
3406 };
3407
3408 typedef SStringExpandable<char, 1024, 2> BTString;
3409 typedef SDnaStringExpandable<1024, 2> BTDnaString;
3410 typedef SDnaMaskString<32, 2> BTDnaMask;
3411
3412 #endif /* SSTRING_H_ */
3413