1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #ifndef SSTRING_H_
21 #define SSTRING_H_
22 
23 #include <string.h>
24 #include <iostream>
25 #include "assert_helpers.h"
26 #include "alphabet.h"
27 #include "random_source.h"
28 
29 /**
30  * Four kinds of strings defined here:
31  *
32  * SString:
33  *   A fixed-length string using heap memory with size set at construction time
34  *   or when install() member is called.
35  *
36  * S2bDnaString:
37  *   Like SString, but stores a list uint32_t words where each word is divided
38  *   into 16 2-bit slots interpreted as holding one A/C/G/T nucleotide each.
39  *
40  * TODO: S3bDnaString allowing N.  S4bDnaString allowing nucleotide masks.
41  *
42  * SStringExpandable:
43  *   A string using heap memory where the size of the backing store is
44  *   automatically resized as needed.  Supports operations like append, insert,
45  *   erase, etc.
46  *
47  * SStringFixed:
48  *   A fixed-length string using stack memory where size is set at compile
49  *   time.
50  *
51  * All string classes have some extra facilities that make it easy to print the
52  * string, including when the string uses an encoded alphabet.  See toZBuf()
53  * and toZBufXForm().
54  *
55  * Global lt, eq, and gt template functions are supplied.  They are capable of
56  * doing lexicographical comparisons between any of the three categories of
57  * strings defined here.
58  */
59 
60 template<typename T>
61 class Class_sstr_len {
62 public:
sstr_len(const T & s)63 	static inline size_t sstr_len(const T& s) {
64 		return s.length();
65 	}
66 };
67 
68 template<unsigned N>
69 class Class_sstr_len<const char[N]> {
70 public:
sstr_len(const char s[N])71 	static inline size_t sstr_len(const char s[N]) {
72 		return strlen(s);
73 	}
74 };
75 
76 template<>
77 class Class_sstr_len<const char *> {
78 public:
sstr_len(const char * s)79 	static inline size_t sstr_len(const char *s) {
80 		return strlen(s);
81 	}
82 };
83 
84 template<>
85 class Class_sstr_len<const unsigned char *> {
86 public:
sstr_len(const unsigned char * s)87 	static inline size_t sstr_len(const unsigned char *s) {
88 		return strlen((const char *)s);
89 	}
90 };
91 
92 template<typename T1, typename T2>
sstr_eq(const T1 & s1,const T2 & s2)93 static inline bool sstr_eq(const T1& s1, const T2& s2) {
94 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
95 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
96 	if(len1 != len2) return false;
97 	for(size_t i = 0; i < len1; i++) {
98 		if(s1[i] != s2[i]) return false;
99 	}
100 	return true;
101 }
102 
103 template<typename T1, typename T2>
sstr_neq(const T1 & s1,const T2 & s2)104 static inline bool sstr_neq(const T1& s1, const T2& s2) {
105 	return !sstr_eq(s1, s2);
106 }
107 
108 /**
109  * Return true iff the given suffix of s1 is equal to the given suffix of s2 up
110  * to upto characters.
111  */
112 template<typename T1, typename T2>
113 static inline bool sstr_suf_upto_eq(
114 	const T1& s1, size_t suf1,
115 	const T2& s2, size_t suf2,
116 	size_t upto,
117 	bool endlt = true)
118 {
119 	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
120 	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
121 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
122 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
123 	if(len1 > upto) len1 = upto;
124 	if(len2 > upto) len2 = upto;
125 	if(len1 != len2) return false;
126 	for(size_t i = 0; i < len1; i++) {
127 		if(s1[suf1+i] != s2[suf2+i]) {
128 			return false;
129 		}
130 	}
131 	return true;
132 }
133 
134 /**
135  * Return true iff the given suffix of s1 is equal to the given suffix of s2 up
136  * to upto characters.
137  */
138 template<typename T1, typename T2>
139 static inline bool sstr_suf_upto_neq(
140 	const T1& s1, size_t suf1,
141 	const T2& s2, size_t suf2,
142 	size_t upto,
143 	bool endlt = true)
144 {
145 	return !sstr_suf_upto_eq(s1, suf1, s2, suf2, upto, endlt);
146 }
147 
148 /**
149  * Return true iff s1 is less than s2.
150  */
151 template<typename T1, typename T2>
152 static inline bool sstr_lt(const T1& s1, const T2& s2, bool endlt = true) {
153 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
154 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
155 	size_t minlen = (len1 < len2 ? len1 : len2);
156 	for(size_t i = 0; i < minlen; i++) {
157 		if(s1[i] < s2[i]) {
158 			return true;
159 		} else if(s1[i] > s2[i]) {
160 			return false;
161 		}
162 	}
163 	if(len1 == len2) return false;
164 	return (len1 < len2) == endlt;
165 }
166 
167 /**
168  * Return true iff the given suffix of s1 is less than the given suffix of s2.
169  */
170 template<typename T1, typename T2>
171 static inline bool sstr_suf_lt(
172 	const T1& s1, size_t suf1,
173 	const T2& s2, size_t suf2,
174 	bool endlt = true)
175 {
176 	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
177 	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
178 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
179 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
180 	size_t minlen = (len1 < len2 ? len1 : len2);
181 	for(size_t i = 0; i < minlen; i++) {
182 		if(s1[suf1+i] < s2[suf2+i]) {
183 			return true;
184 		} else if(s1[suf1+i] > s2[suf2+i]) {
185 			return false;
186 		}
187 	}
188 	if(len1 == len2) return false;
189 	return (len1 < len2) == endlt;
190 }
191 
192 /**
193  * Return true iff the given suffix of s1 is less than the given suffix of s2.
194  * Treat s1 and s2 as though they have lengths len1/len2.
195  */
196 template<typename T1, typename T2>
197 static inline bool sstr_suf_lt(
198 	const T1& s1, size_t suf1, size_t len1,
199 	const T2& s2, size_t suf2, size_t len2,
200 	bool endlt = true)
201 {
202 	assert_leq(suf1, len1);
203 	assert_leq(suf2, len2);
204 	size_t left1 = len1 - suf1;
205 	size_t left2 = len2 - suf2;
206 	size_t minleft = (left1 < left2 ? left1 : left2);
207 	for(size_t i = 0; i < minleft; i++) {
208 		if(s1[suf1+i] < s2[suf2+i]) {
209 			return true;
210 		} else if(s1[suf1+i] > s2[suf2+i]) {
211 			return false;
212 		}
213 	}
214 	if(left1 == left2) return false;
215 	return (left1 < left2) == endlt;
216 }
217 
218 /**
219  * Return true iff the given suffix of s1 is less than the given suffix of s2
220  * up to upto characters.
221  */
222 template<typename T1, typename T2>
223 static inline bool sstr_suf_upto_lt(
224 	const T1& s1, size_t suf1,
225 	const T2& s2, size_t suf2,
226 	size_t upto,
227 	bool endlt = true)
228 {
229 	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
230 	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
231 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
232 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
233 	if(len1 > upto) len1 = upto;
234 	if(len2 > upto) len2 = upto;
235 	size_t minlen = (len1 < len2 ? len1 : len2);
236 	for(size_t i = 0; i < minlen; i++) {
237 		if(s1[suf1+i] < s2[suf2+i]) {
238 			return true;
239 		} else if(s1[suf1+i] > s2[suf2+i]) {
240 			return false;
241 		}
242 	}
243 	if(len1 == len2) return false;
244 	return (len1 < len2) == endlt;
245 }
246 
247 /**
248  * Return true iff the given prefix of s1 is less than the given prefix of s2.
249  */
250 template<typename T1, typename T2>
251 static inline bool sstr_pre_lt(
252 	const T1& s1, size_t pre1,
253 	const T2& s2, size_t pre2,
254 	bool endlt = true)
255 {
256 	assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
257 	assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
258 	size_t len1 = pre1;
259 	size_t len2 = pre2;
260 	size_t minlen = (len1 < len2 ? len1 : len2);
261 	for(size_t i = 0; i < minlen; i++) {
262 		if(s1[i] < s2[i]) {
263 			return true;
264 		} else if(s1[i] > s2[i]) {
265 			return false;
266 		}
267 	}
268 	if(len1 == len2) return false;
269 	return (len1 < len2) == endlt;
270 }
271 
272 /**
273  * Return true iff s1 is less than or equal to s2.
274  */
275 template<typename T1, typename T2>
276 static inline bool sstr_leq(const T1& s1, const T2& s2, bool endlt = true) {
277 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
278 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
279 	size_t minlen = (len1 < len2 ? len1 : len2);
280 	for(size_t i = 0; i < minlen; i++) {
281 		if(s1[i] < s2[i]) {
282 			return true;
283 		} else if(s1[i] > s2[i]) {
284 			return false;
285 		}
286 	}
287 	if(len1 == len2) return true;
288 	return (len1 < len2) == endlt;
289 }
290 
291 /**
292  * Return true iff the given suffix of s1 is less than or equal to the given
293  * suffix of s2.
294  */
295 template<typename T1, typename T2>
296 static inline bool sstr_suf_leq(
297 	const T1& s1, size_t suf1,
298 	const T2& s2, size_t suf2,
299 	bool endlt = true)
300 {
301 	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
302 	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
303 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
304 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
305 	size_t minlen = (len1 < len2 ? len1 : len2);
306 	for(size_t i = 0; i < minlen; i++) {
307 		if(s1[suf1+i] < s2[suf2+i]) {
308 			return true;
309 		} else if(s1[suf1+i] > s2[suf2+i]) {
310 			return false;
311 		}
312 	}
313 	if(len1 == len2) return true;
314 	return (len1 < len2) == endlt;
315 }
316 
317 /**
318  * Return true iff the given prefix of s1 is less than or equal to the given
319  * prefix of s2.
320  */
321 template<typename T1, typename T2>
322 static inline bool sstr_pre_leq(
323 	const T1& s1, size_t pre1,
324 	const T2& s2, size_t pre2,
325 	bool endlt = true)
326 {
327 	assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
328 	assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
329 	size_t len1 = pre1;
330 	size_t len2 = pre2;
331 	size_t minlen = (len1 < len2 ? len1 : len2);
332 	for(size_t i = 0; i < minlen; i++) {
333 		if(s1[i] < s2[i]) {
334 			return true;
335 		} else if(s1[i] > s2[i]) {
336 			return false;
337 		}
338 	}
339 	if(len1 == len2) return true;
340 	return (len1 < len2) == endlt;
341 }
342 
343 /**
344  * Return true iff s1 is greater than s2.
345  */
346 template<typename T1, typename T2>
347 static inline bool sstr_gt(const T1& s1, const T2& s2, bool endlt = true) {
348 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
349 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
350 	size_t minlen = (len1 < len2 ? len1 : len2);
351 	for(size_t i = 0; i < minlen; i++) {
352 		if(s1[i] > s2[i]) {
353 			return true;
354 		} else if(s1[i] < s2[i]) {
355 			return false;
356 		}
357 	}
358 	if(len1 == len2) return false;
359 	return (len1 > len2) == endlt;
360 }
361 
362 /**
363  * Return true iff the given suffix of s1 is greater than the given suffix of
364  * s2.
365  */
366 template<typename T1, typename T2>
367 static inline bool sstr_suf_gt(
368 	const T1& s1, size_t suf1,
369 	const T2& s2, size_t suf2,
370 	bool endlt = true)
371 {
372 	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
373 	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
374 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
375 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
376 	size_t minlen = (len1 < len2 ? len1 : len2);
377 	for(size_t i = 0; i < minlen; i++) {
378 		if(s1[suf1+i] > s2[suf2+i]) {
379 			return true;
380 		} else if(s1[suf1+i] < s2[suf2+i]) {
381 			return false;
382 		}
383 	}
384 	if(len1 == len2) return false;
385 	return (len1 > len2) == endlt;
386 }
387 
388 /**
389  * Return true iff the given prefix of s1 is greater than the given prefix of
390  * s2.
391  */
392 template<typename T1, typename T2>
393 static inline bool sstr_pre_gt(
394 	const T1& s1, size_t pre1,
395 	const T2& s2, size_t pre2,
396 	bool endlt = true)
397 {
398 	assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
399 	assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
400 	size_t len1 = pre1;
401 	size_t len2 = pre2;
402 	size_t minlen = (len1 < len2 ? len1 : len2);
403 	for(size_t i = 0; i < minlen; i++) {
404 		if(s1[i] > s2[i]) {
405 			return true;
406 		} else if(s1[i] < s2[i]) {
407 			return false;
408 		}
409 	}
410 	if(len1 == len2) return false;
411 	return (len1 > len2) == endlt;
412 }
413 
414 /**
415  * Return true iff s1 is greater than or equal to s2.
416  */
417 template<typename T1, typename T2>
418 static inline bool sstr_geq(const T1& s1, const T2& s2, bool endlt = true) {
419 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1);
420 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2);
421 	size_t minlen = (len1 < len2 ? len1 : len2);
422 	for(size_t i = 0; i < minlen; i++) {
423 		if(s1[i] > s2[i]) {
424 			return true;
425 		} else if(s1[i] < s2[i]) {
426 			return false;
427 		}
428 	}
429 	if(len1 == len2) return true;
430 	return (len1 > len2) == endlt;
431 }
432 
433 /**
434  * Return true iff the given suffix of s1 is greater than or equal to the given
435  * suffix of s2.
436  */
437 template<typename T1, typename T2>
438 static inline bool sstr_suf_geq(
439 	const T1& s1, size_t suf1,
440 	const T2& s2, size_t suf2,
441 	bool endlt = true)
442 {
443 	assert_leq(suf1, Class_sstr_len<T1>::sstr_len(s1));
444 	assert_leq(suf2, Class_sstr_len<T2>::sstr_len(s2));
445 	size_t len1 = Class_sstr_len<T1>::sstr_len(s1) - suf1;
446 	size_t len2 = Class_sstr_len<T2>::sstr_len(s2) - suf2;
447 	size_t minlen = (len1 < len2 ? len1 : len2);
448 	for(size_t i = 0; i < minlen; i++) {
449 		if(s1[suf1+i] > s2[suf2+i]) {
450 			return true;
451 		} else if(s1[suf1+i] < s2[suf2+i]) {
452 			return false;
453 		}
454 	}
455 	if(len1 == len2) return true;
456 	return (len1 > len2) == endlt;
457 }
458 
459 /**
460  * Return true iff the given prefix of s1 is greater than or equal to the given
461  * prefix of s2.
462  */
463 template<typename T1, typename T2>
464 static inline bool sstr_pre_geq(
465 	const T1& s1, size_t pre1,
466 	const T2& s2, size_t pre2,
467 	bool endlt = true)
468 {
469 	assert_leq(pre1, Class_sstr_len<T1>::sstr_len(s1));
470 	assert_leq(pre2, Class_sstr_len<T2>::sstr_len(s2));
471 	size_t len1 = pre1;
472 	size_t len2 = pre2;
473 	size_t minlen = (len1 < len2 ? len1 : len2);
474 	for(size_t i = 0; i < minlen; i++) {
475 		if(s1[i] > s2[i]) {
476 			return true;
477 		} else if(s1[i] < s2[i]) {
478 			return false;
479 		}
480 	}
481 	if(len1 == len2) return true;
482 	return (len1 > len2) == endlt;
483 }
484 
485 template<typename T>
sstr_to_cstr(const T & s)486 static inline const char * sstr_to_cstr(const T& s) {
487 	return s.toZBuf();
488 }
489 
490 template<>
491 inline const char * sstr_to_cstr<std::basic_string<char> >(
492 	const std::basic_string<char>& s)
493 {
494 	return s.c_str();
495 }
496 
497 /**
498  * Simple string class with backing memory whose size is managed by the user
499  * using the constructor and install() member function.  No behind-the-scenes
500  * reallocation or copying takes place.
501  */
502 template<typename T>
503 class SString {
504 public:
505 
SString()506 	explicit SString() :
507 		cs_(NULL),
508 		printcs_(NULL),
509 		len_(0)
510 	{ }
511 
SString(size_t sz)512 	explicit SString(size_t sz) :
513 		cs_(NULL),
514 		printcs_(NULL),
515 		len_(0)
516 	{
517 		resize(sz);
518 	}
519 
520 	/**
521 	 * Create an SStringExpandable from another SStringExpandable.
522 	 */
SString(const SString<T> & o)523 	SString(const SString<T>& o) :
524 		cs_(NULL),
525 		printcs_(NULL),
526 		len_(0)
527 	{
528 		*this = o;
529 	}
530 
531 	/**
532 	 * Create an SStringExpandable from a std::basic_string of the
533 	 * appropriate type.
534 	 */
SString(const std::basic_string<T> & str)535 	explicit SString(const std::basic_string<T>& str) :
536 		cs_(NULL),
537 		printcs_(NULL),
538 		len_(0)
539 	{
540 		install(str.c_str(), str.length());
541 	}
542 
543 	/**
544 	 * Create an SStringExpandable from an array and size.
545 	 */
SString(const T * b,size_t sz)546 	explicit SString(const T* b, size_t sz) :
547 		cs_(NULL),
548 		printcs_(NULL),
549 		len_(0)
550 	{
551 		install(b, sz);
552 	}
553 
554 	/**
555 	 * Create an SStringExpandable from a zero-terminated array.
556 	 */
SString(const T * b)557 	explicit SString(const T* b) :
558 		cs_(NULL),
559 		printcs_(NULL),
560 		len_(0)
561 	{
562 		install(b, strlen(b));
563 	}
564 
565 	/**
566 	 * Destroy the expandable string object.
567 	 */
~SString()568 	virtual ~SString() {
569 		if(cs_ != NULL) {
570 			delete[] cs_;
571 			cs_ = NULL;
572 		}
573 		if(printcs_ != NULL) {
574 			delete[] printcs_;
575 			printcs_ = NULL;
576 		}
577 		len_ = 0;
578 	}
579 
580 	/**
581 	 * Assignment to other SString.
582 	 */
583 	SString<T>& operator=(const SString<T>& o) {
584 		install(o.cs_, o.len_);
585 		return *this;
586 	}
587 
588 	/**
589 	 * Assignment to other SString.
590 	 */
591 	SString<T>& operator=(const std::basic_string<T>& o) {
592 		install(o);
593 		return *this;
594 	}
595 
596 	/**
597 	 * Resizes the string without preserving its contents.
598 	 */
resize(size_t sz)599 	void resize(size_t sz) {
600 		if(cs_ != NULL) {
601 			delete cs_;
602 			cs_ = NULL;
603 		}
604 		if(printcs_ != NULL) {
605 			delete printcs_;
606 			printcs_ = NULL;
607 		}
608 		if(sz != 0) {
609 			cs_ = new T[sz+1];
610 		}
611 		len_ = sz;
612 	}
613 
614 	/**
615 	 * Return ith character from the left of either the forward or the
616 	 * reverse version of the read.
617 	 */
618 	T windowGet(
619 		size_t i,
620 		bool   fw,
621 		size_t depth = 0,
622 		size_t len = 0) const
623 	{
624 		if(len == 0) len = len_;
625 		assert_lt(i, len);
626 		assert_leq(len, len_ - depth);
627 		return fw ? cs_[depth+i] : cs_[depth+len-i-1];
628 	}
629 
630 	/**
631 	 * Return ith character from the left of either the forward or the
632 	 * reverse-complement version of the read.
633 	 */
634 	void windowGet(
635 		T& ret,
636 		bool   fw,
637 		size_t depth = 0,
638 		size_t len = 0) const
639 	{
640 		if(len == 0) len = len_;
641 		assert_leq(len, len_ - depth);
642 		ret.resize(len);
643 		for(size_t i = 0; i < len; i++) {
644 			ret.set(fw ? cs_[depth+i] : cs_[depth+len-i-1], i);
645 		}
646 	}
647 
648 	/**
649 	 * Set character at index 'idx' to 'c'.
650 	 */
set(int c,size_t idx)651 	inline void set(int c, size_t idx) {
652 		assert_lt(idx, len_);
653 		cs_[idx] = c;
654 	}
655 
656 	/**
657 	 * Retrieve constant version of element i.
658 	 */
659 	inline const T& operator[](size_t i) const {
660 		assert_lt(i, len_);
661 		return cs_[i];
662 	}
663 
664 	/**
665 	 * Retrieve mutable version of element i.
666 	 */
667 	inline T& operator[](size_t i) {
668 		assert_lt(i, len_);
669 		return cs_[i];
670 	}
671 
672 	/**
673 	 * Retrieve constant version of element i.
674 	 */
get(size_t i)675 	inline const T& get(size_t i) const {
676 		assert_lt(i, len_);
677 		return cs_[i];
678 	}
679 
680 	/**
681 	 * Copy 'sz' bytes from buffer 'b' into this string.  memcpy is used, not
682 	 * operator=.
683 	 */
install(const T * b,size_t sz)684 	virtual void install(const T* b, size_t sz) {
685 		if(sz == 0) return;
686 		resize(sz);
687 		memcpy(cs_, b, sz * sizeof(T));
688 	}
689 
690 	/**
691 	 * Copy 'sz' bytes from buffer 'b' into this string.  memcpy is used, not
692 	 * operator=.
693 	 */
install(const std::basic_string<T> & b)694 	virtual void install(const std::basic_string<T>& b) {
695 		size_t sz = b.length();
696 		if(sz == 0) return;
697 		resize(sz);
698 		memcpy(cs_, b.c_str(), sz * sizeof(T));
699 	}
700 
701 	/**
702 	 * Copy all bytes from zero-terminated buffer 'b' into this string.
703 	 */
install(const T * b)704 	void install(const T* b) {
705 		install(b, strlen(b));
706 	}
707 
708 	/**
709 	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
710 	 * in the process.
711 	 */
installReverse(const char * b,size_t sz)712 	void installReverse(const char* b, size_t sz) {
713 		if(sz == 0) return;
714 		resize(sz);
715 		for(size_t i = 0; i < sz; i++) {
716 			cs_[i] = b[sz-i-1];
717 		}
718 		len_ = sz;
719 	}
720 
721 	/**
722 	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
723 	 * in the process.
724 	 */
installReverse(const SString<T> & b)725 	void installReverse(const SString<T>& b) {
726 		installReverse(b.cs_, b.len_);
727 	}
728 
729 	/**
730 	 * Return true iff the two strings are equal.
731 	 */
732 	bool operator==(const SString<T>& o) {
733 		return sstr_eq(*this, o);
734 	}
735 
736 	/**
737 	 * Return true iff the two strings are not equal.
738 	 */
739 	bool operator!=(const SString<T>& o) {
740 		return sstr_neq(*this, o);
741 	}
742 
743 	/**
744 	 * Return true iff this string is less than given string.
745 	 */
746 	bool operator<(const SString<T>& o) {
747 		return sstr_lt(*this, o);
748 	}
749 
750 	/**
751 	 * Return true iff this string is greater than given string.
752 	 */
753 	bool operator>(const SString<T>& o) {
754 		return sstr_gt(*this, o);
755 	}
756 
757 	/**
758 	 * Return true iff this string is less than or equal to given string.
759 	 */
760 	bool operator<=(const SString<T>& o) {
761 		return sstr_leq(*this, o);
762 	}
763 
764 	/**
765 	 * Return true iff this string is greater than or equal to given string.
766 	 */
767 	bool operator>=(const SString<T>& o) {
768 		return sstr_geq(*this, o);
769 	}
770 
771 	/**
772 	 * Reverse the buffer in place.
773 	 */
reverse()774 	void reverse() {
775 		for(size_t i = 0; i < (len_ >> 1); i++) {
776 			T tmp = get(i);
777 			set(get(len_-i-1), i);
778 			set(tmp, len_-i-1);
779 		}
780 	}
781 
782 	/**
783 	 * Reverse a substring of the buffer in place.
784 	 */
reverseWindow(size_t off,size_t len)785 	void reverseWindow(size_t off, size_t len) {
786 		assert_leq(off, len_);
787 		assert_leq(off + len, len_);
788 		size_t mid = len >> 1;
789 		for(size_t i = 0; i < mid; i++) {
790 			T tmp = get(off+i);
791 			set(get(off+len-i-1), off+i);
792 			set(tmp, off+len-i-1);
793 		}
794 	}
795 
796 	/**
797 	 * Set the first len elements of the buffer to el.
798 	 */
fill(size_t len,const T & el)799 	void fill(size_t len, const T& el) {
800 		assert_leq(len, len_);
801 		for(size_t i = 0; i < len; i++) {
802 			set(el, i);
803 		}
804 	}
805 
806 	/**
807 	 * Set all elements of the buffer to el.
808 	 */
fill(const T & el)809 	void fill(const T& el) {
810 		fill(len_, el);
811 	}
812 
813 	/**
814 	 * Return the length of the string.
815 	 */
length()816 	inline size_t length() const { return len_; }
817 
818 	/**
819 	 * Clear the buffer.
820 	 */
clear()821 	void clear() { len_ = 0; }
822 
823 	/**
824 	 * Return true iff the buffer is empty.
825 	 */
empty()826 	inline bool empty() const { return len_ == 0; }
827 
828 	/**
829 	 * Put a terminator in the 'len_'th element and then return a
830 	 * pointer to the buffer.  Useful for printing.
831 	 */
toZBufXForm(const char * xform)832 	const char* toZBufXForm(const char *xform) const {
833 		ASSERT_ONLY(size_t xformElts = strlen(xform));
834 		// Lazily allocate space for print buffer
835 		if(printcs_ == NULL) {
836 			const_cast<char*&>(printcs_) = new char[len_+1];
837 		}
838 		char* printcs = const_cast<char*>(printcs_);
839 		assert(printcs != NULL);
840 		for(size_t i = 0; i < len_; i++) {
841 			assert_lt(cs_[i], (int)xformElts);
842 			printcs[i] = xform[cs_[i]];
843 		}
844 		printcs[len_] = 0;
845 		return printcs_;
846 	}
847 
848 	/**
849 	 * Put a terminator in the 'len_'th element and then return a
850 	 * pointer to the buffer.  Useful for printing.
851 	 */
toZBuf()852 	virtual const T* toZBuf() const {
853 		const_cast<T*>(cs_)[len_] = 0;
854 		return cs_;
855 	}
856 
857 	/**
858 	 * Return a const version of the raw buffer.
859 	 */
buf()860 	const T* buf() const { return cs_; }
861 
862 	/**
863 	 * Return a writeable version of the raw buffer.
864 	 */
wbuf()865 	T* wbuf() { return cs_; }
866 
867 protected:
868 
869 	T *cs_;      // +1 so that we have the option of dropping in a terminating "\0"
870 	char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
871 	size_t len_; // # elements
872 };
873 
874 /**
875  * Simple string class with backing memory whose size is managed by the user
876  * using the constructor and install() member function.  No behind-the-scenes
877  * reallocation or copying takes place.
878  */
879 class S2bDnaString {
880 
881 public:
882 
S2bDnaString()883 	explicit S2bDnaString() :
884 		cs_(NULL),
885 		printcs_(NULL),
886 		len_(0)
887 	{ }
888 
S2bDnaString(size_t sz)889 	explicit S2bDnaString(size_t sz) :
890 		cs_(NULL),
891 		printcs_(NULL),
892 		len_(0)
893 	{
894 		resize(sz);
895 	}
896 
897 	/**
898 	 * Copy another object of the same class.
899 	 */
S2bDnaString(const S2bDnaString & o)900 	S2bDnaString(const S2bDnaString& o) :
901 		cs_(NULL),
902 		printcs_(NULL),
903 		len_(0)
904 	{
905 		*this = o;
906 	}
907 
908 	/**
909 	 * Create an SStringExpandable from a std::basic_string of the
910 	 * appropriate type.
911 	 */
912 	explicit S2bDnaString(
913 		const std::basic_string<char>& str,
914 		bool chars = false,
915 		bool colors = false) :
cs_(NULL)916 		cs_(NULL),
917 		printcs_(NULL),
918 		len_(0)
919 	{
920 		if(chars) {
921 			if(colors) {
922 				installColors(str.c_str(), str.length());
923 			} else {
924 				installChars(str.c_str(), str.length());
925 			}
926 		} else {
927 			install(str.c_str(), str.length());
928 		}
929 	}
930 
931 	/**
932 	 * Create an SStringExpandable from an array and size.
933 	 */
934 	explicit S2bDnaString(
935 		const char* b,
936 		size_t sz,
937 		bool chars = false,
938 		bool colors = false) :
cs_(NULL)939 		cs_(NULL),
940 		printcs_(NULL),
941 		len_(0)
942 	{
943 		if(chars) {
944 			if(colors) {
945 				installColors(b, sz);
946 			} else {
947 				installChars(b, sz);
948 			}
949 		} else {
950 			install(b, sz);
951 		}
952 	}
953 
954 	/**
955 	 * Create an SStringFixed from a zero-terminated string.
956 	 */
957 	explicit S2bDnaString(
958 		const char* b,
959 		bool chars = false,
960 		bool colors = false) :
cs_(NULL)961 		cs_(NULL),
962 		printcs_(NULL),
963 		len_(0)
964 	{
965 		if(chars) {
966 			if(colors) {
967 				installColors(b, strlen(b));
968 			} else {
969 				installChars(b, strlen(b));
970 			}
971 		} else {
972 			install(b, strlen(b));
973 		}
974 	}
975 
976 	/**
977 	 * Destroy the expandable string object.
978 	 */
~S2bDnaString()979 	virtual ~S2bDnaString() {
980 		if(cs_ != NULL) {
981 			delete[] cs_;
982 			cs_ = NULL;
983 		}
984 		if(printcs_ != NULL) {
985 			delete[] printcs_;
986 			printcs_ = NULL;
987 		}
988 		len_ = 0;
989 	}
990 
991 	/**
992 	 * Assignment to other SString.
993 	 */
994 	template<typename T>
995 	S2bDnaString& operator=(const T& o) {
996 		install(o.c_str(), o.length());
997 		return *this;
998 	}
999 
1000 	/**
1001 	 * Assignment from a std::basic_string
1002 	 */
1003 	template<typename T>
1004 	S2bDnaString& operator=(const std::basic_string<char>& o) {
1005 		install(o);
1006 		return *this;
1007 	}
1008 
1009 	/**
1010 	 * Resizes the string without preserving its contents.
1011 	 */
resize(size_t sz)1012 	void resize(size_t sz) {
1013 		if(cs_ != NULL) {
1014 			delete cs_;
1015 			cs_ = NULL;
1016 		}
1017 		if(printcs_ != NULL) {
1018 			delete printcs_;
1019 			printcs_ = NULL;
1020 		}
1021 		len_ = sz;
1022 		if(sz != 0) {
1023 			cs_ = new uint32_t[nwords()];
1024 		}
1025 	}
1026 
1027 	/**
1028 	 * Return DNA character corresponding to element 'idx'.
1029 	 */
toChar(size_t idx)1030 	char toChar(size_t idx) const {
1031 		int c = (int)get(idx);
1032 		assert_range(0, 3, c);
1033 		return "ACGT"[c];
1034 	}
1035 
1036 	/**
1037 	 * Return color character corresponding to element 'idx'.
1038 	 */
toColor(size_t idx)1039 	char toColor(size_t idx) const {
1040 		int c = (int)get(idx);
1041 		assert_range(0, 3, c);
1042 		return "0123"[c];
1043 	}
1044 
1045 	/**
1046 	 * Return ith character from the left of either the forward or the
1047 	 * reverse version of the read.
1048 	 */
1049 	char windowGet(
1050 		size_t i,
1051 		bool   fw,
1052 		size_t depth = 0,
1053 		size_t len = 0) const
1054 	{
1055 		if(len == 0) len = len_;
1056 		assert_lt(i, len);
1057 		assert_leq(len, len_ - depth);
1058 		return fw ? get(depth+i) : get(depth+len-i-1);
1059 	}
1060 
1061 	/**
1062 	 * Return ith character from the left of either the forward or the
1063 	 * reverse-complement version of the read.
1064 	 */
1065 	template<typename T>
1066 	void windowGet(
1067 		T& ret,
1068 		bool   fw,
1069 		size_t depth = 0,
1070 		size_t len = 0) const
1071 	{
1072 		if(len == 0) len = len_;
1073 		assert_leq(len, len_ - depth);
1074 		ret.resize(len);
1075 		for(size_t i = 0; i < len; i++) {
1076 			ret.set((fw ? get(depth+i) : get(depth+len-i-1)), i);
1077 		}
1078 	}
1079 
1080 	/**
1081 	 * Return length in 32-bit words.
1082 	 */
nwords()1083 	size_t nwords() const {
1084 		return (len_ + 15) >> 4;
1085 	}
1086 
1087 	/**
1088 	 * Set character at index 'idx' to 'c'.
1089 	 */
set(int c,size_t idx)1090 	void set(int c, size_t idx) {
1091 		assert_lt(idx, len_);
1092 		assert_range(0, 3, c);
1093 		size_t word = idx >> 4;
1094 		size_t bpoff = (idx & 15) << 1;
1095 		cs_[word] = cs_[word] & ~(uint32_t)(3 << bpoff);
1096 		cs_[word] = cs_[word] |  (uint32_t)(c << bpoff);
1097 	}
1098 
1099 	/**
1100 	 * Set character at index 'idx' to DNA char 'c'.
1101 	 */
setChar(int c,size_t idx)1102 	void setChar(int c, size_t idx) {
1103 		assert_in(toupper(c), "ACGT");
1104 		int bp = asc2dna[c];
1105 		set(bp, idx);
1106 	}
1107 
1108 	/**
1109 	 * Set character at index 'idx' to color char 'c'.
1110 	 */
setColor(int c,size_t idx)1111 	void setColor(int c, size_t idx) {
1112 		assert_in(toupper(c), "0123");
1113 		int co = asc2col[c];
1114 		set(co, idx);
1115 	}
1116 
1117 	/**
1118 	 * Set the ith 32-bit word to given word.
1119 	 */
setWord(uint32_t w,size_t i)1120 	void setWord(uint32_t w, size_t i) {
1121 		assert_lt(i, nwords());
1122 		cs_[i] = w;
1123 	}
1124 
1125 	/**
1126 	 * Retrieve constant version of element i.
1127 	 */
1128 	char operator[](size_t i) const {
1129 		assert_lt(i, len_);
1130 		return get(i);
1131 	}
1132 
1133 	/**
1134 	 * Retrieve constant version of element i.
1135 	 */
get(size_t i)1136 	char get(size_t i) const {
1137 		assert_lt(i, len_);
1138 		size_t word = i >> 4;
1139 		size_t bpoff = (i & 15) << 1;
1140 		return (char)((cs_[word] >> bpoff) & 3);
1141 	}
1142 
1143 	/**
1144 	 * Copy packed words from string 'b' into this packed string.
1145 	 */
install(const uint32_t * b,size_t sz)1146 	void install(const uint32_t* b, size_t sz) {
1147 		if(sz == 0) return;
1148 		resize(sz);
1149 		memcpy(cs_, b, sizeof(uint32_t)*nwords());
1150 	}
1151 
1152 	/**
1153 	 * Copy 'sz' DNA characters encoded as integers from buffer 'b' into this
1154 	 * packed string.
1155 	 */
install(const char * b,size_t sz)1156 	void install(const char* b, size_t sz) {
1157 		if(sz == 0) return;
1158 		resize(sz);
1159 		size_t wordi = 0;
1160 		for(size_t i = 0; i < sz; i += 16) {
1161 			uint32_t word = 0;
1162 			for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
1163 				uint32_t bp = (int)b[i+j];
1164 				uint32_t shift = (uint32_t)j << 1;
1165 				assert_range(0, 3, (int)bp);
1166 				word |= (bp << shift);
1167 			}
1168 			cs_[wordi++] = word;
1169 		}
1170 	}
1171 
1172 	/**
1173 	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1174 	 */
installChars(const char * b,size_t sz)1175 	void installChars(const char* b, size_t sz) {
1176 		if(sz == 0) return;
1177 		resize(sz);
1178 		size_t wordi = 0;
1179 		for(size_t i = 0; i < sz; i += 16) {
1180 			uint32_t word = 0;
1181 			for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
1182 				char c = b[i+j];
1183 				assert_in(toupper(c), "ACGT");
1184 				int bp = asc2dna[(int)c];
1185 				assert_range(0, 3, (int)bp);
1186 				uint32_t shift = (uint32_t)j << 1;
1187 				word |= (bp << shift);
1188 			}
1189 			cs_[wordi++] = word;
1190 		}
1191 	}
1192 
1193 	/**
1194 	 * Copy 'sz' color characters from buffer 'b' into this packed string.
1195 	 */
installColors(const char * b,size_t sz)1196 	void installColors(const char* b, size_t sz) {
1197 		if(sz == 0) return;
1198 		resize(sz);
1199 		size_t wordi = 0;
1200 		for(size_t i = 0; i < sz; i += 16) {
1201 			uint32_t word = 0;
1202 			for(int j = 0; j < 16 && (size_t)(i+j) < sz; j++) {
1203 				char c = b[i+j];
1204 				assert_in(c, "0123");
1205 				int bp = asc2col[(int)c];
1206 				assert_range(0, 3, (int)bp);
1207 				uint32_t shift = (uint32_t)j << 1;
1208 				word |= (bp << shift);
1209 			}
1210 			cs_[wordi++] = word;
1211 		}
1212 	}
1213 
1214 	/**
1215 	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1216 	 */
install(const char * b)1217 	void install(const char* b) {
1218 		install(b, strlen(b));
1219 	}
1220 
1221 	/**
1222 	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1223 	 */
installChars(const char * b)1224 	void installChars(const char* b) {
1225 		installChars(b, strlen(b));
1226 	}
1227 
1228 	/**
1229 	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1230 	 */
installColors(const char * b)1231 	void installColors(const char* b) {
1232 		installColors(b, strlen(b));
1233 	}
1234 
1235 	/**
1236 	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1237 	 */
install(const std::basic_string<char> & b)1238 	void install(const std::basic_string<char>& b) {
1239 		install(b.c_str(), b.length());
1240 	}
1241 
1242 	/**
1243 	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1244 	 */
installChars(const std::basic_string<char> & b)1245 	void installChars(const std::basic_string<char>& b) {
1246 		installChars(b.c_str(), b.length());
1247 	}
1248 
1249 	/**
1250 	 * Copy 'sz' DNA characters from buffer 'b' into this packed string.
1251 	 */
installColors(const std::basic_string<char> & b)1252 	void installColors(const std::basic_string<char>& b) {
1253 		installColors(b.c_str(), b.length());
1254 	}
1255 
1256 	/**
1257 	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
1258 	 * in the process.
1259 	 */
installReverse(const char * b,size_t sz)1260 	void installReverse(const char* b, size_t sz) {
1261 		resize(sz);
1262 		if(sz == 0) return;
1263 		size_t wordi = 0;
1264 		size_t bpi   = 0;
1265 		cs_[0] = 0;
1266 		for(size_t i =sz; i > 0; i--) {
1267 			assert_range(0, 3, (int)b[i-1]);
1268 			cs_[wordi] |= ((int)b[i-1] << (bpi<<1));
1269 			if(bpi == 15) {
1270 				wordi++;
1271 				cs_[wordi] = 0;
1272 				bpi = 0;
1273 			} else bpi++;
1274 		}
1275 	}
1276 
1277 	/**
1278 	 * Copy all chars from buffer of DNA characters 'b' into this string,
1279 	 * reversing them in the process.
1280 	 */
installReverse(const char * b)1281 	void installReverse(const char* b) {
1282 		installReverse(b, strlen(b));
1283 	}
1284 
1285 	/**
1286 	 * Copy 'sz' bytes from buffer of DNA characters 'b' into this string,
1287 	 * reversing them in the process.
1288 	 */
installReverseChars(const char * b,size_t sz)1289 	void installReverseChars(const char* b, size_t sz) {
1290 		resize(sz);
1291 		if(sz == 0) return;
1292 		size_t wordi = 0;
1293 		size_t bpi   = 0;
1294 		cs_[0] = 0;
1295 		for(size_t i =sz; i > 0; i--) {
1296 			char c = b[i-1];
1297 			assert_in(toupper(c), "ACGT");
1298 			int bp = asc2dna[(int)c];
1299 			assert_range(0, 3, bp);
1300 			cs_[wordi] |= (bp << (bpi<<1));
1301 			if(bpi == 15) {
1302 				wordi++;
1303 				cs_[wordi] = 0;
1304 				bpi = 0;
1305 			} else bpi++;
1306 		}
1307 	}
1308 
1309 	/**
1310 	 * Copy all chars from buffer of DNA characters 'b' into this string,
1311 	 * reversing them in the process.
1312 	 */
installReverseChars(const char * b)1313 	void installReverseChars(const char* b) {
1314 		installReverseChars(b, strlen(b));
1315 	}
1316 
1317 	/**
1318 	 * Copy 'sz' bytes from buffer of color characters 'b' into this string,
1319 	 * reversing them in the process.
1320 	 */
installReverseColors(const char * b,size_t sz)1321 	void installReverseColors(const char* b, size_t sz) {
1322 		resize(sz);
1323 		if(sz == 0) return;
1324 		size_t wordi = 0;
1325 		size_t bpi   = 0;
1326 		cs_[0] = 0;
1327 		for(size_t i =sz; i > 0; i--) {
1328 			char c = b[i-1];
1329 			assert_in(c, "0123");
1330 			int bp = asc2col[(int)c];
1331 			assert_range(0, 3, bp);
1332 			cs_[wordi] |= (bp << (bpi<<1));
1333 			if(bpi == 15) {
1334 				wordi++;
1335 				cs_[wordi] = 0;
1336 				bpi = 0;
1337 			} else bpi++;
1338 		}
1339 	}
1340 
1341 	/**
1342 	 * Copy all chars from buffer of color characters 'b' into this string,
1343 	 * reversing them in the process.
1344 	 */
installReverseColors(const char * b)1345 	void installReverseColors(const char* b) {
1346 		installReverseColors(b, strlen(b));
1347 	}
1348 
1349 	/**
1350 	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
1351 	 * in the process.
1352 	 */
installReverse(const S2bDnaString & b)1353 	void installReverse(const S2bDnaString& b) {
1354 		resize(b.len_);
1355 		if(b.len_ == 0) return;
1356 		size_t wordi = 0;
1357 		size_t bpi   = 0;
1358 		size_t wordb = b.nwords()-1;
1359 		size_t bpb   = (b.len_-1) & 15;
1360 		cs_[0] = 0;
1361 		for(size_t i = b.len_; i > 0; i--) {
1362 			int bbp = (int)((b[wordb] >> (bpb << 1)) & 3);
1363 			assert_range(0, 3, bbp);
1364 			cs_[wordi] |= (bbp << (bpi << 1));
1365 			if(bpi == 15) {
1366 				wordi++;
1367 				cs_[wordi] = 0;
1368 				bpi = 0;
1369 			} else bpi++;
1370 			if(bpb == 0) {
1371 				wordb--;
1372 				bpi = 15;
1373 			} else bpi--;
1374 		}
1375 	}
1376 
1377 	/**
1378 	 * Return true iff the two strings are equal.
1379 	 */
1380 	bool operator==(const S2bDnaString& o) {
1381 		return sstr_eq(*this, o);
1382 	}
1383 
1384 	/**
1385 	 * Return true iff the two strings are not equal.
1386 	 */
1387 	bool operator!=(const S2bDnaString& o) {
1388 		return sstr_neq(*this, o);
1389 	}
1390 
1391 	/**
1392 	 * Return true iff this string is less than given string.
1393 	 */
1394 	bool operator<(const S2bDnaString& o) {
1395 		return sstr_lt(*this, o);
1396 	}
1397 
1398 	/**
1399 	 * Return true iff this string is greater than given string.
1400 	 */
1401 	bool operator>(const S2bDnaString& o) {
1402 		return sstr_gt(*this, o);
1403 	}
1404 
1405 	/**
1406 	 * Return true iff this string is less than or equal to given string.
1407 	 */
1408 	bool operator<=(const S2bDnaString& o) {
1409 		return sstr_leq(*this, o);
1410 	}
1411 
1412 	/**
1413 	 * Return true iff this string is greater than or equal to given string.
1414 	 */
1415 	bool operator>=(const S2bDnaString& o) {
1416 		return sstr_geq(*this, o);
1417 	}
1418 
1419 	/**
1420 	 * Reverse the 2-bit encoded DNA string in-place.
1421 	 */
reverse()1422 	void reverse() {
1423 		if(len_ <= 1) return;
1424 		size_t wordf = nwords()-1;
1425 		size_t bpf   = (len_-1) & 15;
1426 		size_t wordi = 0;
1427 		size_t bpi   = 0;
1428 		while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
1429 			int f = (cs_[wordf] >> (bpf << 1)) & 3;
1430 			int i = (cs_[wordi] >> (bpi << 1)) & 3;
1431 			cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
1432 			cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
1433 			cs_[wordf] |=  (uint32_t)(i << (bpf << 1));
1434 			cs_[wordi] |=  (uint32_t)(f << (bpi << 1));
1435 			if(bpf == 0) {
1436 				bpf = 15;
1437 				wordf--;
1438 			} else bpf--;
1439 			if(bpi == 15) {
1440 				bpi = 0;
1441 				wordi++;
1442 			} else bpi++;
1443 		}
1444 	}
1445 
1446 	/**
1447 	 * Reverse a substring of the buffer in place.
1448 	 */
reverseWindow(size_t off,size_t len)1449 	void reverseWindow(size_t off, size_t len) {
1450 		assert_leq(off, len_);
1451 		assert_leq(off+len, len_);
1452 		if(len <= 1) return;
1453 		size_t wordf = (off+len-1) >> 4;
1454 		size_t bpf   = (off+len-1) & 15;
1455 		size_t wordi = (off      ) >> 4;
1456 		size_t bpi   = (off      ) & 15;
1457 		while(wordf > wordi || (wordf == wordi && bpf > bpi)) {
1458 			int f = (cs_[wordf] >> (bpf << 1)) & 3;
1459 			int i = (cs_[wordi] >> (bpi << 1)) & 3;
1460 			cs_[wordf] &= ~(uint32_t)(3 << (bpf << 1));
1461 			cs_[wordi] &= ~(uint32_t)(3 << (bpi << 1));
1462 			cs_[wordf] |=  (uint32_t)(i << (bpf << 1));
1463 			cs_[wordi] |=  (uint32_t)(f << (bpi << 1));
1464 			if(bpf == 0) {
1465 				bpf = 15;
1466 				wordf--;
1467 			} else bpf--;
1468 			if(bpi == 15) {
1469 				bpi = 0;
1470 				wordi++;
1471 			} else bpi++;
1472 		}
1473 	}
1474 
1475 
1476 	/**
1477 	 * Set the first len elements of the buffer to el.
1478 	 */
fill(size_t len,char el)1479 	void fill(size_t len, char el) {
1480 		assert_leq(len, len_);
1481 		assert_range(0, 3, (int)el);
1482 		size_t word = 0;
1483 		if(len > 32) {
1484 			// Copy el throughout block
1485 			uint32_t bl = (uint32_t)el;
1486 			bl |= (bl << 2);
1487 			bl |= (bl << 4);
1488 			bl |= (bl << 8);
1489 			bl |= (bl << 16);
1490 			// Fill with blocks
1491 			size_t blen = len >> 4;
1492 			for(; word < blen; word++) {
1493 				cs_[word] = bl;
1494 			}
1495 			len = len & 15;
1496 		}
1497 		size_t bp = 0;
1498 		for(size_t i = 0; i < len; i++) {
1499 			cs_[word] &= ~(uint32_t)(3  << (bp << 1));
1500 			cs_[word] |=  (uint32_t)(el << (bp << 1));
1501 			if(bp == 15) {
1502 				word++;
1503 				bp = 0;
1504 			} else bp++;
1505 		}
1506 	}
1507 
1508 	/**
1509 	 * Set all elements of the buffer to el.
1510 	 */
fill(char el)1511 	void fill(char el) {
1512 		fill(len_, el);
1513 	}
1514 
1515 	/**
1516 	 * Return the ith character in the window defined by fw, color, depth and
1517 	 * len.
1518 	 */
1519 	char windowGetDna(
1520 		size_t i,
1521 		bool   fw,
1522 		size_t depth = 0,
1523 		size_t len = 0) const
1524 	{
1525 		if(len == 0) len = len_;
1526 		assert_lt(i, len);
1527 		assert_leq(len, len_ - depth);
1528 		if(fw) {
1529 			return get(depth+i);
1530 		} else {
1531 			return compDna(get(depth+len-i-1));
1532 		}
1533 	}
1534 
1535 	/**
1536 	 * Fill the given DNA buffer with the substring specified by fw,
1537 	 * color, depth and len.
1538 	 */
1539 	template<typename T>
1540 	void windowGetDna(
1541 		T&     buf,
1542 		bool   fw,
1543 		size_t depth = 0,
1544 		size_t len = 0) const
1545 	{
1546 		if(len == 0) len = len_;
1547 		assert_leq(len, len_ - depth);
1548 		buf.resize(len);
1549 		for(size_t i = 0; i < len; i++) {
1550 			buf.set(
1551 				(fw ?
1552 					get(depth+i) :
1553 					compDna(get(depth+len-i-1))), i);
1554 		}
1555 	}
1556 
1557 	/**
1558 	 * Return the length of the string.
1559 	 */
length()1560 	inline size_t length() const { return len_; }
1561 
1562 	/**
1563 	 * Clear the buffer.
1564 	 */
clear()1565 	void clear() { len_ = 0; }
1566 
1567 	/**
1568 	 * Return true iff the buffer is empty.
1569 	 */
empty()1570 	inline bool empty() const { return len_ == 0; }
1571 
1572 	/**
1573 	 * Return a const version of the raw buffer.
1574 	 */
buf()1575 	const uint32_t* buf() const { return cs_; }
1576 
1577 	/**
1578 	 * Return a writeable version of the raw buffer.
1579 	 */
wbuf()1580 	uint32_t* wbuf() { return cs_; }
1581 
1582 	/**
1583 	 * Note: the size of the string once it's stored in the print buffer is 4
1584 	 * times as large as the string as stored in compact 2-bit-per-char words.
1585 	 */
toZBuf()1586 	const char* toZBuf() const {
1587 		if(printcs_ == NULL) {
1588 			const_cast<char*&>(printcs_) = new char[len_+1];
1589 		}
1590 		char *printcs = const_cast<char*>(printcs_);
1591 		size_t word = 0, bp = 0;
1592 		for(size_t i = 0; i < len_; i++) {
1593 			int c = (cs_[word] >> (bp << 1)) & 3;
1594 			printcs[i] = "ACGT"[c];
1595 			if(bp == 15) {
1596 				word++;
1597 				bp = 0;
1598 			} else bp++;
1599 		}
1600 		printcs[len_] = '\0';
1601 		return printcs_;
1602 	}
1603 
1604 protected:
1605 
1606 	uint32_t *cs_; // 2-bit packed words
1607 	char *printcs_;
1608 	size_t len_;   // # elements
1609 };
1610 
1611 /**
1612  * Simple string class with backing memory that automatically expands as needed.
1613  */
1614 template<typename T, int S = 1024, int M = 2, int I = 0>
1615 class SStringExpandable {
1616 
1617 public:
1618 
SStringExpandable()1619 	explicit SStringExpandable() :
1620 		cs_(NULL),
1621 		printcs_(NULL),
1622 		len_(0),
1623 		sz_(0)
1624 	{
1625 		if(I > 0) {
1626 			expandNoCopy(I);
1627 		}
1628 	}
1629 
SStringExpandable(size_t sz)1630 	explicit SStringExpandable(size_t sz) :
1631 		cs_(NULL),
1632 		printcs_(NULL),
1633 		len_(0),
1634 		sz_(0)
1635 	{
1636 		expandNoCopy(sz);
1637 	}
1638 
1639 	/**
1640 	 * Create an SStringExpandable from another SStringExpandable.
1641 	 */
SStringExpandable(const SStringExpandable<T,S> & o)1642 	SStringExpandable(const SStringExpandable<T, S>& o) :
1643 		cs_(NULL),
1644 		printcs_(NULL),
1645 		len_(0),
1646 		sz_(0)
1647 	{
1648 		*this = o;
1649 	}
1650 
1651 	/**
1652 	 * Create an SStringExpandable from a std::basic_string of the
1653 	 * appropriate type.
1654 	 */
SStringExpandable(const std::basic_string<T> & str)1655 	explicit SStringExpandable(const std::basic_string<T>& str) :
1656 		cs_(NULL),
1657 		printcs_(NULL),
1658 		len_(0),
1659 		sz_(0)
1660 	{
1661 		install(str.c_str(), str.length());
1662 	}
1663 
1664 	/**
1665 	 * Create an SStringExpandable from an array and size.
1666 	 */
SStringExpandable(const T * b,size_t sz)1667 	explicit SStringExpandable(const T* b, size_t sz) :
1668 		cs_(NULL),
1669 		printcs_(NULL),
1670 		len_(0),
1671 		sz_(0)
1672 	{
1673 		install(b, sz);
1674 	}
1675 
1676 	/**
1677 	 * Create an SStringExpandable from a zero-terminated array.
1678 	 */
SStringExpandable(const T * b)1679 	explicit SStringExpandable(const T* b) :
1680 		cs_(NULL),
1681 		printcs_(NULL),
1682 		len_(0),
1683 		sz_(0)
1684 	{
1685 		install(b, strlen(b));
1686 	}
1687 
1688 	/**
1689 	 * Destroy the expandable string object.
1690 	 */
~SStringExpandable()1691 	virtual ~SStringExpandable() {
1692 		if(cs_ != NULL) {
1693 			delete[] cs_;
1694 			cs_ = NULL;
1695 		}
1696 		if(printcs_ != NULL) {
1697 			delete[] printcs_;
1698 			printcs_ = NULL;
1699 		}
1700 		sz_ = len_ = 0;
1701 	}
1702 
1703 	/**
1704 	 * Return ith character from the left of either the forward or the
1705 	 * reverse-complement version of the read.
1706 	 */
1707 	T windowGet(
1708 		size_t i,
1709 		bool   fw,
1710 		size_t depth = 0,
1711 		size_t len = 0) const
1712 	{
1713 		if(len == 0) len = len_;
1714 		assert_lt(i, len);
1715 		assert_leq(len, len_ - depth);
1716 		return fw ? cs_[depth+i] : cs_[depth+len-i-1];
1717 	}
1718 
1719 	/**
1720 	 * Return ith character from the left of either the forward or the
1721 	 * reverse-complement version of the read.
1722 	 */
1723 	void windowGet(
1724 		T& ret,
1725 		bool   fw,
1726 		size_t depth = 0,
1727 		size_t len = 0) const
1728 	{
1729 		if(len == 0) len = len_;
1730 		assert_leq(len, len_ - depth);
1731 		for(size_t i = 0; i < len; i++) {
1732 			ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
1733 		}
1734 	}
1735 
1736 	/**
1737 	 * Assignment to other SStringExpandable.
1738 	 */
1739 	SStringExpandable<T,S,M,I>& operator=(const SStringExpandable<T,S,M,I>& o) {
1740 		install(o.cs_, o.len_);
1741 		return *this;
1742 	}
1743 
1744 	/**
1745 	 * Assignment from a std::basic_string
1746 	 */
1747 	SStringExpandable<T,S>& operator=(const std::basic_string<T>& o) {
1748 		install(o.c_str(), o.length());
1749 		return *this;
1750 	}
1751 
1752 	/**
1753 	 * Insert char c before position 'idx'; slide subsequent chars down.
1754 	 */
insert(const T & c,size_t idx)1755 	void insert(const T& c, size_t idx) {
1756 		assert_lt(idx, len_);
1757 		if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
1758 		len_++;
1759 		// Move everyone down by 1
1760 		// len_ is the *new* length
1761 		for(size_t i = len_; i > idx+1; i--) {
1762 			cs_[i-1] = cs_[i-2];
1763 		}
1764 		cs_[idx] = c;
1765 	}
1766 
1767 	/**
1768 	 * Set character at index 'idx' to 'c'.
1769 	 */
set(int c,size_t idx)1770 	void set(int c, size_t idx) {
1771 		assert_lt(idx, len_);
1772 		cs_[idx] = c;
1773 	}
1774 
1775 	/**
1776 	 * Append char c.
1777 	 */
append(const T & c)1778 	void append(const T& c) {
1779 		if(sz_ < len_ + 1) expandCopy((len_ + 1 + S) * M);
1780 		cs_[len_++] = c;
1781 	}
1782 
1783 	/**
1784 	 * Delete char at position 'idx'; slide subsequent chars up.
1785 	 */
remove(size_t idx)1786 	void remove(size_t idx) {
1787 		assert_lt(idx, len_);
1788 		assert_gt(len_, 0);
1789 		for(size_t i = idx; i < len_-1; i++) {
1790 			cs_[i] = cs_[i+1];
1791 		}
1792 		len_--;
1793 	}
1794 
1795 	/**
1796 	 * Retrieve constant version of element i.
1797 	 */
1798 	const T& operator[](size_t i) const {
1799 		assert_lt(i, len_);
1800 		return cs_[i];
1801 	}
1802 
1803 	/**
1804 	 * Retrieve mutable version of element i.
1805 	 */
1806 	T& operator[](size_t i) {
1807 		assert_lt(i, len_);
1808 		return cs_[i];
1809 	}
1810 
1811 	/**
1812 	 * Retrieve constant version of element i.
1813 	 */
get(size_t i)1814 	const T& get(size_t i) const {
1815 		assert_lt(i, len_);
1816 		return cs_[i];
1817 	}
1818 
1819 	/**
1820 	 * Copy 'sz' bytes from buffer 'b' into this string.
1821 	 */
install(const T * b,size_t sz)1822 	virtual void install(const T* b, size_t sz) {
1823 		if(sz_ < sz) expandNoCopy((sz + S) * M);
1824 		memcpy(cs_, b, sz * sizeof(T));
1825 		len_ = sz;
1826 	}
1827 
1828 
1829 	/**
1830 	 * Copy all bytes from zero-terminated buffer 'b' into this string.
1831 	 */
install(const T * b)1832 	void install(const T* b) { install(b, strlen(b)); }
1833 
1834 	/**
1835 	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
1836 	 * in the process.
1837 	 */
installReverse(const char * b,size_t sz)1838 	void installReverse(const char* b, size_t sz) {
1839 		if(sz_ < sz) expandNoCopy((sz + S) * M);
1840 		for(size_t i = 0; i < sz; i++) {
1841 			cs_[i] = b[sz-i-1];
1842 		}
1843 		len_ = sz;
1844 	}
1845 
1846 	/**
1847 	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
1848 	 * in the process.
1849 	 */
installReverse(const SStringExpandable<T,S> & b)1850 	void installReverse(const SStringExpandable<T, S>& b) {
1851 		if(sz_ < b.len_) expandNoCopy((b.len_ + S) * M);
1852 		for(size_t i = 0; i < b.len_; i++) {
1853 			cs_[i] = b.cs_[b.len_ - i - 1];
1854 		}
1855 		len_ = b.len_;
1856 	}
1857 
1858 	/**
1859 	 * Return true iff the two strings are equal.
1860 	 */
1861 	bool operator==(const SStringExpandable<T, S>& o) {
1862 		return sstr_eq(*this, o);
1863 	}
1864 
1865 	/**
1866 	 * Return true iff the two strings are not equal.
1867 	 */
1868 	bool operator!=(const SStringExpandable<T, S>& o) {
1869 		return sstr_neq(*this, o);
1870 	}
1871 
1872 	/**
1873 	 * Return true iff this string is less than given string.
1874 	 */
1875 	bool operator<(const SStringExpandable<T, S>& o) {
1876 		return sstr_lt(*this, o);
1877 	}
1878 
1879 	/**
1880 	 * Return true iff this string is greater than given string.
1881 	 */
1882 	bool operator>(const SStringExpandable<T, S>& o) {
1883 		return sstr_gt(*this, o);
1884 	}
1885 
1886 	/**
1887 	 * Return true iff this string is less than or equal to given string.
1888 	 */
1889 	bool operator<=(const SStringExpandable<T, S>& o) {
1890 		return sstr_leq(*this, o);
1891 	}
1892 
1893 	/**
1894 	 * Return true iff this string is greater than or equal to given string.
1895 	 */
1896 	bool operator>=(const SStringExpandable<T, S>& o) {
1897 		return sstr_geq(*this, o);
1898 	}
1899 
1900 	/**
1901 	 * Reverse the buffer in place.
1902 	 */
reverse()1903 	void reverse() {
1904 		for(size_t i = 0; i < (len_ >> 1); i++) {
1905 			T tmp = get(i);
1906 			set(get(len_-i-1), i);
1907 			set(tmp, len_-i-1);
1908 		}
1909 	}
1910 
1911 	/**
1912 	 * Reverse a substring of the buffer in place.
1913 	 */
reverseWindow(size_t off,size_t len)1914 	void reverseWindow(size_t off, size_t len) {
1915 		assert_leq(off, len_);
1916 		assert_leq(off + len, len_);
1917 		size_t mid = len >> 1;
1918 		for(size_t i = 0; i < mid; i++) {
1919 			T tmp = get(off+i);
1920 			set(get(off+len-i-1), off+i);
1921 			set(tmp, off+len-i-1);
1922 		}
1923 	}
1924 
1925 	/**
1926 	 * Simply resize the buffer.  If the buffer is resized to be
1927 	 * longer, the newly-added elements will contain garbage and should
1928 	 * be initialized immediately.
1929 	 */
resize(size_t len)1930 	void resize(size_t len) {
1931 		if(sz_ < len) expandCopy((len + S) * M);
1932 		len_ = len;
1933 	}
1934 
1935 	/**
1936 	 * Simply resize the buffer.  If the buffer is resized to be
1937 	 * longer, new elements will be initialized with 'el'.
1938 	 */
resize(size_t len,const T & el)1939 	void resize(size_t len, const T& el) {
1940 		if(sz_ < len) expandCopy((len + S) * M);
1941 		if(len > len_) {
1942 			for(size_t i = len_; i < len; i++) {
1943 				cs_[i] = el;
1944 			}
1945 		}
1946 		len_ = len;
1947 	}
1948 
1949 	/**
1950 	 * Set the first len elements of the buffer to el.
1951 	 */
fill(size_t len,const T & el)1952 	void fill(size_t len, const T& el) {
1953 		assert_leq(len, len_);
1954 		for(size_t i = 0; i < len; i++) {
1955 			cs_[i] = el;
1956 		}
1957 	}
1958 
1959 	/**
1960 	 * Set all elements of the buffer to el.
1961 	 */
fill(const T & el)1962 	void fill(const T& el) {
1963 		fill(len_, el);
1964 	}
1965 
1966 	/**
1967 	 * Trim len characters from the beginning of the string.
1968 	 */
trimBegin(size_t len)1969 	void trimBegin(size_t len) {
1970 		assert_leq(len, len_);
1971 		if(len == len_) {
1972 			len_ = 0; return;
1973 		}
1974 		for(size_t i = 0; i < len_-len; i++) {
1975 			cs_[i] = cs_[i+len];
1976 		}
1977 		len_ -= len;
1978 	}
1979 
1980 	/**
1981 	 * Trim len characters from the end of the string.
1982 	 */
trimEnd(size_t len)1983 	size_t trimEnd(size_t len) {
1984 		if(len >= len_) {
1985 			size_t ret = len_;
1986 			len_ = 0;
1987 			return ret;
1988 		}
1989 		len_ -= len;
1990 		return len;
1991 	}
1992 
1993 	/**
1994 	 * Copy 'sz' bytes from buffer 'b' into this string.
1995 	 */
append(const T * b,size_t sz)1996 	void append(const T* b, size_t sz) {
1997 		if(sz_ < len_ + sz) expandCopy((len_ + sz + S) * M);
1998 		memcpy(cs_ + len_, b, sz * sizeof(T));
1999 		len_ += sz;
2000 	}
2001 
2002 	/**
2003 	 * Copy bytes from zero-terminated buffer 'b' into this string.
2004 	 */
append(const T * b)2005 	void append(const T* b) {
2006 		append(b, strlen(b));
2007 	}
2008 
2009 	/**
2010 	 * Return the length of the string.
2011 	 */
length()2012 	size_t length() const { return len_; }
2013 
2014 	/**
2015 	 * Clear the buffer.
2016 	 */
clear()2017 	void clear() { len_ = 0; }
2018 
2019 	/**
2020 	 * Return true iff the buffer is empty.
2021 	 */
empty()2022 	bool empty() const { return len_ == 0; }
2023 
2024 	/**
2025 	 * Put a terminator in the 'len_'th element and then return a
2026 	 * pointer to the buffer.  Useful for printing.
2027 	 */
toZBufXForm(const char * xform)2028 	const char* toZBufXForm(const char *xform) const {
2029 		ASSERT_ONLY(size_t xformElts = strlen(xform));
2030 		if(empty()) {
2031 			const_cast<char&>(zero_) = 0;
2032 			return &zero_;
2033 		}
2034 		char* printcs = const_cast<char*>(printcs_);
2035 		// Lazily allocate space for print buffer
2036 		for(size_t i = 0; i < len_; i++) {
2037 			assert_lt(cs_[i], (int)xformElts);
2038 			printcs[i] = xform[(int)cs_[i]];
2039 		}
2040 		printcs[len_] = 0;
2041 		return printcs_;
2042 	}
2043 
2044 	/**
2045 	 * Put a terminator in the 'len_'th element and then return a
2046 	 * pointer to the buffer.  Useful for printing.
2047 	 */
toZBuf()2048 	virtual const T* toZBuf() const {
2049 		if(empty()) {
2050 			const_cast<T&>(zeroT_) = 0;
2051 			return &zeroT_;
2052 		}
2053 		assert_leq(len_, sz_);
2054 		const_cast<T*>(cs_)[len_] = 0;
2055 		return cs_;
2056 	}
2057 
2058 	/**
2059 	 * Return true iff this DNA string matches the given nucleotide
2060 	 * character string.
2061 	 */
eq(const char * str)2062 	bool eq(const char *str) const {
2063 		const char *self = toZBuf();
2064 		return strcmp(str, self) == 0;
2065 	}
2066 
2067 	/**
2068 	 * Return a const version of the raw buffer.
2069 	 */
buf()2070 	const T* buf() const { return cs_; }
2071 
2072 	/**
2073 	 * Return a writeable version of the raw buffer.
2074 	 */
wbuf()2075 	T* wbuf() { return cs_; }
2076 
2077 protected:
2078 	/**
2079 	 * Allocate new, bigger buffer and copy old contents into it.  If
2080 	 * requested size can be accommodated by current buffer, do nothing.
2081 	 */
expandCopy(size_t sz)2082 	void expandCopy(size_t sz) {
2083 		if(sz_ >= sz) return; // done!
2084 		T *tmp  = new T[sz + 1];
2085 		char *ptmp = new char[sz + 1];
2086 		if(cs_ != NULL) {
2087 			memcpy(tmp, cs_, sizeof(T)*len_);
2088 			delete[] cs_;
2089 		}
2090 		if(printcs_ != NULL) {
2091 			memcpy(ptmp, printcs_, sizeof(char)*len_);
2092 			delete[] printcs_;
2093 		}
2094 		cs_ = tmp;
2095 		printcs_ = ptmp;
2096 		sz_ = sz;
2097 	}
2098 
2099 	/**
2100 	 * Allocate new, bigger buffer.  If requested size can be
2101 	 * accommodated by current buffer, do nothing.
2102 	 */
expandNoCopy(size_t sz)2103 	void expandNoCopy(size_t sz) {
2104 		if(sz_ >= sz) return; // done!
2105 		if(cs_      != NULL) delete[] cs_;
2106 		if(printcs_ != NULL) delete[] printcs_;
2107 		cs_ = new T[sz + 1];
2108 		printcs_ = new char[sz + 1];
2109 		sz_ = sz;
2110 	}
2111 
2112 	T *cs_;      // +1 so that we have the option of dropping in a terminating "\0"
2113 	char *printcs_; // +1 so that we have the option of dropping in a terminating "\0"
2114 	char zero_;  // 0 terminator for empty string
2115 	T zeroT_;    // 0 terminator for empty string
2116 	size_t len_; // # filled-in elements
2117 	size_t sz_;  // size capacity of cs_
2118 };
2119 
2120 /**
2121  * Simple string class with in-object storage.
2122  *
2123  * All copies induced by, e.g., operator=, the copy constructor,
2124  * install() and append(), are shallow (using memcpy/sizeof).  If deep
2125  * copies are needed, use a different class.
2126  *
2127  * Reading from an uninitialized element results in an assert as long
2128  * as NDEBUG is not defined.  If NDEBUG is defined, the result is
2129  * undefined.
2130  */
2131 template<typename T, int S>
2132 class SStringFixed {
2133 public:
SStringFixed()2134 	explicit SStringFixed() : len_(0) { }
2135 
2136 	/**
2137 	 * Create an SStringFixed from another SStringFixed.
2138 	 */
SStringFixed(const SStringFixed<T,S> & o)2139 	SStringFixed(const SStringFixed<T, S>& o) {
2140 		*this = o;
2141 	}
2142 
2143 	/**
2144 	 * Create an SStringFixed from another SStringFixed.
2145 	 */
SStringFixed(const std::basic_string<T> & str)2146 	explicit SStringFixed(const std::basic_string<T>& str) {
2147 		install(str.c_str(), str.length());
2148 	}
2149 
2150 	/**
2151 	 * Create an SStringFixed from an array and size.
2152 	 */
SStringFixed(const T * b,size_t sz)2153 	explicit SStringFixed(const T* b, size_t sz) {
2154 		install(b, sz);
2155 	}
2156 
2157 	/**
2158 	 * Create an SStringFixed from a zero-terminated string.
2159 	 */
SStringFixed(const T * b)2160 	explicit SStringFixed(const T* b) {
2161 		install(b, strlen(b));
2162 	}
2163 
~SStringFixed()2164 	virtual ~SStringFixed() { } // C++ needs this
2165 
2166 	/**
2167 	 * Retrieve constant version of element i.
2168 	 */
2169 	inline const T& operator[](size_t i) const {
2170 		return get(i);
2171 	}
2172 
2173 	/**
2174 	 * Retrieve mutable version of element i.
2175 	 */
2176 	inline T& operator[](size_t i) {
2177 		return get(i);
2178 	}
2179 
2180 	/**
2181 	 * Retrieve constant version of element i.
2182 	 */
get(size_t i)2183 	inline const T& get(size_t i) const {
2184 		assert_lt(i, len_);
2185 		return cs_[i];
2186 	}
2187 
2188 	/**
2189 	 * Retrieve mutable version of element i.
2190 	 */
get(size_t i)2191 	inline T& get(size_t i) {
2192 		assert_lt(i, len_);
2193 		return cs_[i];
2194 	}
2195 
2196 	/**
2197 	 * Return ith character from the left of either the forward or the
2198 	 * reverse-complement version of the read.
2199 	 */
2200 	T windowGet(
2201 		size_t i,
2202 		bool   fw,
2203 		size_t depth = 0,
2204 		size_t len = 0) const
2205 	{
2206 		if(len == 0) len = len_;
2207 		assert_lt(i, len);
2208 		assert_leq(len, len_ - depth);
2209 		return fw ? cs_[depth+i] : cs_[depth+len-i-1];
2210 	}
2211 
2212 	/**
2213 	 * Return ith character from the left of either the forward or the
2214 	 * reverse-complement version of the read.
2215 	 */
2216 	void windowGet(
2217 		T& ret,
2218 		bool   fw,
2219 		size_t depth = 0,
2220 		size_t len = 0) const
2221 	{
2222 		if(len == 0) len = len_;
2223 		assert_leq(len, len_ - depth);
2224 		for(size_t i = 0; i < len; i++) {
2225 			ret.append(fw ? cs_[depth+i] : cs_[depth+len-i-1]);
2226 		}
2227 	}
2228 
2229 	/**
2230 	 * Assignment to other SStringFixed.
2231 	 */
2232 	SStringFixed<T,S>& operator=(const SStringFixed<T,S>& o) {
2233 		install(o.cs_, o.len_);
2234 		return *this;
2235 	}
2236 
2237 	/**
2238 	 * Assignment from a std::basic_string
2239 	 */
2240 	SStringFixed<T,S>& operator=(const std::basic_string<T>& o) {
2241 		install(o);
2242 		return *this;
2243 	}
2244 
2245 	/**
2246 	 * Insert char c before position 'idx'; slide subsequent chars down.
2247 	 */
insert(const T & c,size_t idx)2248 	void insert(const T& c, size_t idx) {
2249 		assert_lt(len_, S);
2250 		assert_lt(idx, len_);
2251 		// Move everyone down by 1
2252 		for(int i = len_; i > idx; i--) {
2253 			cs_[i] = cs_[i-1];
2254 		}
2255 		cs_[idx] = c;
2256 		len_++;
2257 	}
2258 
2259 	/**
2260 	 * Set character at index 'idx' to 'c'.
2261 	 */
set(int c,size_t idx)2262 	void set(int c, size_t idx) {
2263 		assert_lt(idx, len_);
2264 		cs_[idx] = c;
2265 	}
2266 
2267 	/**
2268 	 * Append char c.
2269 	 */
append(const T & c)2270 	void append(const T& c) {
2271 		assert_lt(len_, S);
2272 		cs_[len_++] = c;
2273 	}
2274 
2275 	/**
2276 	 * Delete char at position 'idx'; slide subsequent chars up.
2277 	 */
remove(size_t idx)2278 	void remove(size_t idx) {
2279 		assert_lt(idx, len_);
2280 		assert_gt(len_, 0);
2281 		for(size_t i = idx; i < len_-1; i++) {
2282 			cs_[i] = cs_[i+1];
2283 		}
2284 		len_--;
2285 	}
2286 
2287 	/**
2288 	 * Copy 'sz' bytes from buffer 'b' into this string.
2289 	 */
install(const T * b,size_t sz)2290 	virtual void install(const T* b, size_t sz) {
2291 		assert_leq(sz, S);
2292 		memcpy(cs_, b, sz * sizeof(T));
2293 		len_ = sz;
2294 	}
2295 
2296 	/**
2297 	 * Copy all bytes from zero-terminated buffer 'b' into this string.
2298 	 */
install(const T * b)2299 	void install(const T* b) { install(b, strlen(b)); }
2300 
2301 	/**
2302 	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
2303 	 * in the process.
2304 	 */
installReverse(const char * b,size_t sz)2305 	void installReverse(const char* b, size_t sz) {
2306 		assert_leq(sz, S);
2307 		for(size_t i = 0; i < sz; i++) {
2308 			cs_[i] = b[sz-i-1];
2309 		}
2310 		len_ = sz;
2311 	}
2312 
2313 	/**
2314 	 * Copy 'sz' bytes from buffer 'b' into this string, reversing them
2315 	 * in the process.
2316 	 */
installReverse(const SStringFixed<T,S> & b)2317 	void installReverse(const SStringFixed<T, S>& b) {
2318 		assert_leq(b.len_, S);
2319 		for(size_t i = 0; i < b.len_; i++) {
2320 			cs_[i] = b.cs_[b.len_ - i - 1];
2321 		}
2322 		len_ = b.len_;
2323 	}
2324 
2325 	/**
2326 	 * Return true iff the two strings are equal.
2327 	 */
2328 	bool operator==(const SStringFixed<T, S>& o) {
2329 		return sstr_eq(*this, o);
2330 	}
2331 
2332 	/**
2333 	 * Return true iff the two strings are not equal.
2334 	 */
2335 	bool operator!=(const SStringFixed<T, S>& o) {
2336 		return sstr_neq(*this, o);
2337 	}
2338 
2339 	/**
2340 	 * Return true iff this string is less than given string.
2341 	 */
2342 	bool operator<(const SStringFixed<T, S>& o) {
2343 		return sstr_lt(*this, o);
2344 	}
2345 
2346 	/**
2347 	 * Return true iff this string is greater than given string.
2348 	 */
2349 	bool operator>(const SStringFixed<T, S>& o) {
2350 		return sstr_gt(*this, o);
2351 	}
2352 
2353 	/**
2354 	 * Return true iff this string is less than or equal to given string.
2355 	 */
2356 	bool operator<=(const SStringFixed<T, S>& o) {
2357 		return sstr_leq(*this, o);
2358 	}
2359 
2360 	/**
2361 	 * Return true iff this string is greater than or equal to given string.
2362 	 */
2363 	bool operator>=(const SStringFixed<T, S>& o) {
2364 		return sstr_geq(*this, o);
2365 	}
2366 
2367 	/**
2368 	 * Reverse the buffer in place.
2369 	 */
reverse()2370 	void reverse() {
2371 		for(size_t i = 0; i < (len_ >> 1); i++) {
2372 			T tmp = get(i);
2373 			set(get(len_-i-1), i);
2374 			set(tmp, len_-i-1);
2375 		}
2376 	}
2377 
2378 	/**
2379 	 * Reverse a substring of the buffer in place.
2380 	 */
reverseWindow(size_t off,size_t len)2381 	void reverseWindow(size_t off, size_t len) {
2382 		assert_leq(off, len_);
2383 		assert_leq(off + len, len_);
2384 		size_t mid = len >> 1;
2385 		for(size_t i = 0; i < mid; i++) {
2386 			T tmp = get(off+i);
2387 			set(get(off+len-i-1), off+i);
2388 			set(tmp, off+len-i-1);
2389 		}
2390 	}
2391 
2392 	/**
2393 	 * Simply resize the buffer.  If the buffer is resized to be
2394 	 * longer, the newly-added elements will contain garbage and should
2395 	 * be initialized immediately.
2396 	 */
resize(size_t len)2397 	void resize(size_t len) {
2398 		assert_lt(len, S);
2399 		len_ = len;
2400 	}
2401 
2402 	/**
2403 	 * Simply resize the buffer.  If the buffer is resized to be
2404 	 * longer, new elements will be initialized with 'el'.
2405 	 */
resize(size_t len,const T & el)2406 	void resize(size_t len, const T& el) {
2407 		assert_lt(len, S);
2408 		if(len > len_) {
2409 			for(size_t i = len_; i < len; i++) {
2410 				cs_[i] = el;
2411 			}
2412 		}
2413 		len_ = len;
2414 	}
2415 
2416 	/**
2417 	 * Set the first len elements of the buffer to el.
2418 	 */
fill(size_t len,const T & el)2419 	void fill(size_t len, const T& el) {
2420 		assert_leq(len, len_);
2421 		for(size_t i = 0; i < len; i++) {
2422 			cs_[i] = el;
2423 		}
2424 	}
2425 
2426 	/**
2427 	 * Set all elements of the buffer to el.
2428 	 */
fill(const T & el)2429 	void fill(const T& el) {
2430 		fill(len_, el);
2431 	}
2432 
2433 	/**
2434 	 * Trim len characters from the beginning of the string.
2435 	 */
trimBegin(size_t len)2436 	void trimBegin(size_t len) {
2437 		assert_leq(len, len_);
2438 		if(len == len_) {
2439 			len_ = 0; return;
2440 		}
2441 		for(size_t i = 0; i < len_-len; i++) {
2442 			cs_[i] = cs_[i+len];
2443 		}
2444 		len_ -= len;
2445 	}
2446 
2447 	/**
2448 	 * Trim len characters from the end of the string.
2449 	 */
trimEnd(size_t len)2450 	void trimEnd(size_t len) {
2451 		if(len >= len_) len_ = 0;
2452 		else len_ -= len;
2453 	}
2454 
2455 	/**
2456 	 * Copy 'sz' bytes from buffer 'b' into this string.
2457 	 */
append(const T * b,size_t sz)2458 	void append(const T* b, size_t sz) {
2459 		assert_leq(sz + len_, S);
2460 		memcpy(cs_ + len_, b, sz * sizeof(T));
2461 		len_ += sz;
2462 	}
2463 
2464 	/**
2465 	 * Copy bytes from zero-terminated buffer 'b' into this string.
2466 	 */
append(const T * b)2467 	void append(const T* b) {
2468 		append(b, strlen(b));
2469 	}
2470 
2471 	/**
2472 	 * Return the length of the string.
2473 	 */
length()2474 	size_t length() const { return len_; }
2475 
2476 	/**
2477 	 * Clear the buffer.
2478 	 */
clear()2479 	void clear() { len_ = 0; }
2480 
2481 	/**
2482 	 * Return true iff the buffer is empty.
2483 	 */
empty()2484 	bool empty() const { return len_ == 0; }
2485 
2486 	/**
2487 	 * Put a terminator in the 'len_'th element and then return a
2488 	 * pointer to the buffer.  Useful for printing.
2489 	 */
toZBuf()2490 	virtual const T* toZBuf() const {
2491 		const_cast<T*>(cs_)[len_] = 0;
2492 		return cs_;
2493 	}
2494 
2495 	/**
2496 	 * Return true iff this DNA string matches the given nucleotide
2497 	 * character string.
2498 	 */
eq(const char * str)2499 	bool eq(const char *str) const {
2500 		const char *self = toZBuf();
2501 		return strcmp(str, self) == 0;
2502 	}
2503 
2504 	/**
2505 	 * Put a terminator in the 'len_'th element and then return a
2506 	 * pointer to the buffer.  Useful for printing.
2507 	 */
toZBufXForm(const char * xform)2508 	const char* toZBufXForm(const char *xform) const {
2509 		ASSERT_ONLY(size_t xformElts = strlen(xform));
2510 		char* printcs = const_cast<char*>(printcs_);
2511 		for(size_t i = 0; i < len_; i++) {
2512 			assert_lt(cs_[i], (int)xformElts);
2513 			printcs[i] = xform[cs_[i]];
2514 		}
2515 		printcs[len_] = 0;
2516 		return printcs_;
2517 	}
2518 
2519 	/**
2520 	 * Return a const version of the raw buffer.
2521 	 */
buf()2522 	const T* buf() const { return cs_; }
2523 
2524 	/**
2525 	 * Return a writeable version of the raw buffer.
2526 	 */
wbuf()2527 	T* wbuf() { return cs_; }
2528 
2529 protected:
2530 	T cs_[S+1]; // +1 so that we have the option of dropping in a terminating "\0"
2531 	char printcs_[S+1]; // +1 so that we have the option of dropping in a terminating "\0"
2532 	size_t len_;
2533 };
2534 
2535 //
2536 // Stream put operators
2537 //
2538 
2539 template <typename T, int S, int M>
2540 std::ostream& operator<< (std::ostream& os, const SStringExpandable<T, S, M>& str) {
2541 	os << str.toZBuf();
2542 	return os;
2543 }
2544 
2545 template <typename T, int S>
2546 std::ostream& operator<< (std::ostream& os, const SStringFixed<T, S>& str) {
2547 	os << str.toZBuf();
2548 	return os;
2549 }
2550 
2551 extern uint8_t asc2dna[];
2552 extern uint8_t asc2col[];
2553 
2554 /**
2555  * Encapsulates a fixed-length DNA string with characters encoded as
2556  * chars.  Only capable of encoding A, C, G, T and N.  The length is
2557  * specified via the template parameter S.
2558  */
2559 template<int S>
2560 class SDnaStringFixed : public SStringFixed<char, S> {
2561 public:
2562 
SDnaStringFixed()2563 	explicit SDnaStringFixed() : SStringFixed<char, S>() { }
2564 
2565 	/**
2566 	 * Create an SStringFixed from another SStringFixed.
2567 	 */
SDnaStringFixed(const SDnaStringFixed<S> & o)2568 	SDnaStringFixed(const SDnaStringFixed<S>& o) :
2569 		SStringFixed<char, S>(o) { }
2570 
2571 	/**
2572 	 * Create an SStringFixed from a C++ basic_string.
2573 	 */
SDnaStringFixed(const std::basic_string<char> & str)2574 	explicit SDnaStringFixed(const std::basic_string<char>& str) :
2575 		SStringFixed<char, S>(str) { }
2576 
2577 	/**
2578 	 * Create an SStringFixed from an array and size.
2579 	 */
SDnaStringFixed(const char * b,size_t sz)2580 	explicit SDnaStringFixed(const char* b, size_t sz) :
2581 		SStringFixed<char, S>(b, sz) { }
2582 
2583 	/**
2584 	 * Create an SStringFixed from a zero-terminated string.
2585 	 */
2586 	explicit SDnaStringFixed(
2587 		const char* b,
2588 		bool chars = false,
2589 		bool colors = false) :
2590 		SStringFixed<char, S>()
2591 	{
2592 		if(chars) {
2593 			if(colors) {
2594 				installColors(b, strlen(b));
2595 			} else {
2596 				installChars(b, strlen(b));
2597 			}
2598 		} else {
2599 			install(b, strlen(b));
2600 		}
2601 	}
2602 
~SDnaStringFixed()2603 	virtual ~SDnaStringFixed() { } // C++ needs this
2604 
2605 	/**
2606 	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
2607 	 * complementing them in the process, assuming an encoding where
2608 	 * 0=A, 1=C, 2=G, 3=T, 4=N.
2609 	 */
installReverseComp(const char * b,size_t sz)2610 	void installReverseComp(const char* b, size_t sz) {
2611 		assert_leq(sz, S);
2612 		for(size_t i = 0; i < sz; i++) {
2613 			this->cs_[i] = (b[sz-i-1] == 4 ? 4 : b[sz-i-1] ^ 3);
2614 		}
2615 		this->len_ = sz;
2616 	}
2617 
2618 	/**
2619 	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
2620 	 * complementing them in the process, assuming an encoding where
2621 	 * 0=A, 1=C, 2=G, 3=T, 4=N.
2622 	 */
installReverseComp(const SDnaStringFixed<S> & b)2623 	void installReverseComp(const SDnaStringFixed<S>& b) {
2624 		assert_leq(b.len_, S);
2625 		for(size_t i = 0; i < b.len_; i++) {
2626 			this->cs_[i] = (b.cs_[b.len_-i-1] == 4 ? 4 : b.cs_[b.len_-i-1] ^ 3);
2627 		}
2628 		this->len_ = b.len_;
2629 	}
2630 
2631 	/**
2632 	 * Either reverse or reverse-complement (depending on "color") this
2633 	 * DNA buffer in-place.
2634 	 */
reverseComp()2635 	void reverseComp() {
2636 		for(size_t i = 0; i < (this->len_ >> 1); i++) {
2637 			char tmp1 = (this->cs_[i] == 4 ? 4 : this->cs_[i] ^ 3);
2638 			char tmp2 = (this->cs_[this->len_-i-1] == 4 ? 4 : this->cs_[this->len_-i-1] ^ 3);
2639 			this->cs_[i] = tmp2;
2640 			this->cs_[this->len_-i-1] = tmp1;
2641 		}
2642 		// Do middle element iff there are an odd number
2643 		if((this->len_ & 1) != 0) {
2644 			char tmp = this->cs_[this->len_ >> 1];
2645 			tmp = (tmp == 4 ? 4 : tmp ^ 3);
2646 			this->cs_[this->len_ >> 1] = tmp;
2647 		}
2648 	}
2649 
2650 	/**
2651 	 * Copy 'sz' bytes from buffer 'b' into this string.
2652 	 */
install(const char * b,size_t sz)2653 	virtual void install(const char* b, size_t sz) {
2654 		assert_leq(sz, S);
2655 		memcpy(this->cs_, b, sz);
2656 #ifndef NDEBUG
2657 		for(size_t i = 0; i < sz; i++) {
2658 			assert_leq(this->cs_[i], 4);
2659 			assert_geq(this->cs_[i], 0);
2660 		}
2661 #endif
2662 		this->len_ = sz;
2663 	}
2664 
2665 	/**
2666 	 * Copy buffer 'b' of ASCII DNA characters into normal DNA
2667 	 * characters.
2668 	 */
installChars(const char * b,size_t sz)2669 	virtual void installChars(const char* b, size_t sz) {
2670 		assert_leq(sz, S);
2671 		for(size_t i = 0; i < sz; i++) {
2672 			assert_in(toupper(b[i]), "ACGTN-");
2673 			this->cs_[i] = asc2dna[(int)b[i]];
2674 			assert_geq(this->cs_[i], 0);
2675 			assert_leq(this->cs_[i], 4);
2676 		}
2677 		this->len_ = sz;
2678 	}
2679 
2680 	/**
2681 	 * Copy buffer 'b' of ASCII color characters into normal DNA
2682 	 * characters.
2683 	 */
installColors(const char * b,size_t sz)2684 	virtual void installColors(const char* b, size_t sz) {
2685 		assert_leq(sz, S);
2686 		for(size_t i = 0; i < sz; i++) {
2687 			assert_in(b[i], "0123.");
2688 			this->cs_[i] = asc2col[(int)b[i]];
2689 			assert_geq(this->cs_[i], 0);
2690 			assert_leq(this->cs_[i], 4);
2691 		}
2692 		this->len_ = sz;
2693 	}
2694 
2695 	/**
2696 	 * Copy C++ string of ASCII DNA characters into normal DNA
2697 	 * characters.
2698 	 */
installChars(const std::basic_string<char> & str)2699 	virtual void installChars(const std::basic_string<char>& str) {
2700 		installChars(str.c_str(), str.length());
2701 	}
2702 
2703 	/**
2704 	 * Copy C++ string of ASCII color characters into normal DNA
2705 	 * characters.
2706 	 */
installColors(const std::basic_string<char> & str)2707 	virtual void installColors(const std::basic_string<char>& str) {
2708 		installColors(str.c_str(), str.length());
2709 	}
2710 
2711 	/**
2712 	 * Set DNA character at index 'idx' to 'c'.
2713 	 */
set(int c,size_t idx)2714 	void set(int c, size_t idx) {
2715 		assert_lt(idx, this->len_);
2716 		assert_leq(c, 4);
2717 		assert_geq(c, 0);
2718 		this->cs_[idx] = c;
2719 	}
2720 
2721 	/**
2722 	 * Append DNA char c.
2723 	 */
append(const char & c)2724 	void append(const char& c) {
2725 		assert_lt(this->len_, S);
2726 		assert_leq(c, 4);
2727 		assert_geq(c, 0);
2728 		this->cs_[this->len_++] = c;
2729 	}
2730 
2731 	/**
2732 	 * Set DNA character at index 'idx' to 'c'.
2733 	 */
setChar(char c,size_t idx)2734 	void setChar(char c, size_t idx) {
2735 		assert_lt(idx, this->len_);
2736 		assert_in(toupper(c), "ACGTN");
2737 		this->cs_[idx] = asc2dna[(int)c];
2738 	}
2739 
2740 	/**
2741 	 * Append DNA character.
2742 	 */
appendChar(char c)2743 	void appendChar(char c) {
2744 		assert_lt(this->len_, S);
2745 		assert_in(toupper(c), "ACGTN");
2746 		this->cs_[this->len_++] = asc2dna[(int)c];
2747 	}
2748 
2749 	/**
2750 	 * Return DNA character corresponding to element 'idx'.
2751 	 */
toChar(size_t idx)2752 	char toChar(size_t idx) const {
2753 		assert_geq((int)this->cs_[idx], 0);
2754 		assert_leq((int)this->cs_[idx], 4);
2755 		return "ACGTN"[(int)this->cs_[idx]];
2756 	}
2757 
2758 	/**
2759 	 * Retrieve constant version of element i.
2760 	 */
2761 	const char& operator[](size_t i) const {
2762 		return this->get(i);
2763 	}
2764 
2765 	/**
2766 	 * Retrieve constant version of element i.
2767 	 */
get(size_t i)2768 	const char& get(size_t i) const {
2769 		assert_lt(i, this->len_);
2770 		assert_leq(this->cs_[i], 4);
2771 		assert_geq(this->cs_[i], 0);
2772 		return this->cs_[i];
2773 	}
2774 
2775 	/**
2776 	 * Return the ith character in the window defined by fw, color,
2777 	 * depth and len.
2778 	 */
2779 	char windowGetDna(
2780 		size_t i,
2781 		bool   fw,
2782 		size_t depth = 0,
2783 		size_t len = 0) const
2784 	{
2785 		if(len == 0) len = this->len_;
2786 		assert_lt(i, len);
2787 		assert_leq(len, this->len_ - depth);
2788 		if(fw) return this->cs_[depth+i];
2789 		else   return compDna(this->cs_[depth+len-i-1]);
2790 	}
2791 
2792 	/**
2793 	 * Fill the given DNA buffer with the substring specified by fw,
2794 	 * color, depth and len.
2795 	 */
2796 	void windowGetDna(
2797 		SDnaStringFixed<S>& buf,
2798 		bool   fw,
2799 		size_t depth = 0,
2800 		size_t len = 0) const
2801 	{
2802 		if(len == 0) len = this->len_;
2803 		assert_leq(len, this->len_ - depth);
2804 		for(size_t i = 0; i < len; i++) {
2805 			buf.append(fw ? this->cs_[depth+i] :
2806 			                compDna(this->cs_[depth+len-i-1]));
2807 		}
2808 	}
2809 
2810 	/**
2811 	 * Put a terminator in the 'len_'th element and then return a
2812 	 * pointer to the buffer.  Useful for printing.
2813 	 */
toZBuf()2814 	virtual const char* toZBuf() const { return this->toZBufXForm("ACGTN"); }
2815 };
2816 
2817 /**
2818  * Encapsulates a fixed-length DNA string with characters encoded as
2819  * chars.  Only capable of encoding A, C, G, T and N.  The length is
2820  * specified via the template parameter S.
2821  */
2822 
2823 template<int S = 1024, int M = 2>
2824 class SDnaStringExpandable : public SStringExpandable<char, S, M> {
2825 public:
2826 
SDnaStringExpandable()2827 	explicit SDnaStringExpandable() : SStringExpandable<char, S, M>() { }
2828 
2829 	/**
2830 	 * Create an SStringFixed from another SStringFixed.
2831 	 */
SDnaStringExpandable(const SDnaStringExpandable<S,M> & o)2832 	SDnaStringExpandable(const SDnaStringExpandable<S, M>& o) :
2833 		SStringExpandable<char, S, M>(o) { }
2834 
2835 	/**
2836 	 * Create an SStringFixed from a C++ basic_string.
2837 	 */
2838 	explicit SDnaStringExpandable(
2839 		const std::basic_string<char>& str,
2840 		bool chars = false,
2841 		bool colors = false) :
2842 		SStringExpandable<char, S, M>()
2843 	{
2844 		if(chars) {
2845 			if(colors) {
2846 				installColors(str);
2847 			} else {
2848 				installChars(str);
2849 			}
2850 		} else {
2851 			install(str);
2852 		}
2853 	}
2854 
2855 	/**
2856 	 * Create an SStringFixed from an array and size.
2857 	 */
2858 	explicit SDnaStringExpandable(
2859 		const char* b,
2860 		size_t sz,
2861 		bool chars = false,
2862 		bool colors = false) :
2863 		SStringExpandable<char, S, M>()
2864 	{
2865 		if(chars) {
2866 			if(colors) {
2867 				installColors(b, sz);
2868 			} else {
2869 				installChars(b, sz);
2870 			}
2871 		} else {
2872 			install(b, sz);
2873 		}
2874 	}
2875 
2876 	/**
2877 	 * Create an SStringFixed from a zero-terminated string.
2878 	 */
2879 	explicit SDnaStringExpandable(
2880 		const char* b,
2881 		bool chars = false,
2882 		bool colors = false) :
2883 		SStringExpandable<char, S, M>()
2884 	{
2885 		install(b, chars, colors);
2886 	}
2887 
~SDnaStringExpandable()2888 	virtual ~SDnaStringExpandable() { } // C++ needs this
2889 
2890 	/**
2891 	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
2892 	 * complementing them in the process, assuming an encoding where
2893 	 * 0=A, 1=C, 2=G, 3=T, 4=N.
2894 	 */
installReverseComp(const char * b,size_t sz)2895 	void installReverseComp(const char* b, size_t sz) {
2896 		if(this->sz_ < sz) this->expandCopy((sz + S) * M);
2897 		for(size_t i = 0; i < sz; i++) {
2898 			this->cs_[i] = (b[sz-i-1] == 4 ? 4 : b[sz-i-1] ^ 3);
2899 		}
2900 		this->len_ = sz;
2901 	}
2902 
2903 	/**
2904 	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
2905 	 * complementing them in the process, assuming an encoding where
2906 	 * 0=A, 1=C, 2=G, 3=T, 4=N.
2907 	 */
installReverseComp(const SDnaStringExpandable<S,M> & b)2908 	void installReverseComp(const SDnaStringExpandable<S, M>& b) {
2909 		if(this->sz_ < b.len_) this->expandCopy((b.len_ + S) * M);
2910 		for(size_t i = 0; i < b.len_; i++) {
2911 			this->cs_[i] = (b.cs_[b.len_-i-1] == 4 ? 4 : b.cs_[b.len_-i-1] ^ 3);
2912 		}
2913 		this->len_ = b.len_;
2914 	}
2915 
2916 	/**
2917 	 * Either reverse or reverse-complement (depending on "color") this
2918 	 * DNA buffer in-place.
2919 	 */
reverseComp()2920 	void reverseComp() {
2921 		for(size_t i = 0; i < (this->len_ >> 1); i++) {
2922 			char tmp1 = (this->cs_[i] == 4 ? 4 : this->cs_[i] ^ 3);
2923 			char tmp2 = (this->cs_[this->len_-i-1] == 4 ? 4 : this->cs_[this->len_-i-1] ^ 3);
2924 			this->cs_[i] = tmp2;
2925 			this->cs_[this->len_-i-1] = tmp1;
2926 		}
2927 		// Do middle element iff there are an odd number
2928 		if((this->len_ & 1) != 0) {
2929 			char tmp = this->cs_[this->len_ >> 1];
2930 			tmp = (tmp == 4 ? 4 : tmp ^ 3);
2931 			this->cs_[this->len_ >> 1] = tmp;
2932 		}
2933 	}
2934 
2935 	/**
2936 	 * Copy 'sz' bytes from buffer 'b' into this string.
2937 	 */
2938 	virtual void install(
2939 		const char* b,
2940 		bool chars = false,
2941 		bool colors = false)
2942 	{
2943 		if(chars) {
2944 			if(colors) {
2945 				installColors(b, strlen(b));
2946 			} else {
2947 				installChars(b, strlen(b));
2948 			}
2949 		} else {
2950 			install(b, strlen(b));
2951 		}
2952 	}
2953 
2954 	/**
2955 	 * Copy 'sz' bytes from buffer 'b' into this string.
2956 	 */
install(const char * b,size_t sz)2957 	virtual void install(const char* b, size_t sz) {
2958 		if(this->sz_ < sz) this->expandCopy((sz + S) * M);
2959 		memcpy(this->cs_, b, sz);
2960 #ifndef NDEBUG
2961 		for(size_t i = 0; i < sz; i++) {
2962 			assert_range(0, 4, (int)this->cs_[i]);
2963 		}
2964 #endif
2965 		this->len_ = sz;
2966 	}
2967 
2968 	/**
2969 	 * Copy buffer 'b' of ASCII DNA characters into normal DNA
2970 	 * characters.
2971 	 */
installChars(const char * b,size_t sz)2972 	virtual void installChars(const char* b, size_t sz) {
2973 		if(this->sz_ < sz) this->expandCopy((sz + S) * M);
2974 		for(size_t i = 0; i < sz; i++) {
2975 			assert_in(toupper(b[i]), "ACGTN-");
2976 			this->cs_[i] = asc2dna[(int)b[i]];
2977 			assert_range(0, 4, (int)this->cs_[i]);
2978 		}
2979 		this->len_ = sz;
2980 	}
2981 
2982 	/**
2983 	 * Copy buffer 'b' of ASCII color characters into normal DNA
2984 	 * characters.
2985 	 */
installColors(const char * b,size_t sz)2986 	virtual void installColors(const char* b, size_t sz) {
2987 		if(this->sz_ < sz) this->expandCopy((sz + S) * M);
2988 		for(size_t i = 0; i < sz; i++) {
2989 			assert_in(b[i], "0123.");
2990 			this->cs_[i] = asc2col[(int)b[i]];
2991 			assert_range(0, 4, (int)this->cs_[i]);
2992 		}
2993 		this->len_ = sz;
2994 	}
2995 
2996 	/**
2997 	 * Copy C++ string of ASCII DNA characters into normal DNA
2998 	 * characters.
2999 	 */
installChars(const std::basic_string<char> & str)3000 	virtual void installChars(const std::basic_string<char>& str) {
3001 		installChars(str.c_str(), str.length());
3002 	}
3003 
3004 	/**
3005 	 * Copy C++ string of ASCII color characters into normal DNA
3006 	 * characters.
3007 	 */
installColors(const std::basic_string<char> & str)3008 	virtual void installColors(const std::basic_string<char>& str) {
3009 		installColors(str.c_str(), str.length());
3010 	}
3011 
3012 	/**
3013 	 * Set DNA character at index 'idx' to 'c'.
3014 	 */
set(int c,size_t idx)3015 	void set(int c, size_t idx) {
3016 		assert_lt(idx, this->len_);
3017 		assert_range(0, 4, c);
3018 		this->cs_[idx] = c;
3019 	}
3020 
3021 	/**
3022 	 * Append DNA char c.
3023 	 */
append(const char & c)3024 	void append(const char& c) {
3025 		if(this->sz_ < this->len_ + 1) {
3026 			this->expandCopy((this->len_ + 1 + S) * M);
3027 		}
3028 		assert_range(0, 4, (int)c);
3029 		this->cs_[this->len_++] = c;
3030 	}
3031 
3032 	/**
3033 	 * Set DNA character at index 'idx' to 'c'.
3034 	 */
setChar(char c,size_t idx)3035 	void setChar(char c, size_t idx) {
3036 		assert_lt(idx, this->len_);
3037 		assert_in(toupper(c), "ACGTN");
3038 		this->cs_[idx] = asc2dna[(int)c];
3039 	}
3040 
3041 	/**
3042 	 * Append DNA character.
3043 	 */
appendChar(char c)3044 	void appendChar(char c) {
3045 		if(this->sz_ < this->len_ + 1) {
3046 			this->expandCopy((this->len_ + 1 + S) * M);
3047 		}
3048 		assert_in(toupper(c), "ACGTN");
3049 		this->cs_[this->len_++] = asc2dna[(int)c];
3050 	}
3051 
3052 	/**
3053 	 * Return DNA character corresponding to element 'idx'.
3054 	 */
toChar(size_t idx)3055 	char toChar(size_t idx) const {
3056 		assert_range(0, 4, (int)this->cs_[idx]);
3057 		return "ACGTN"[(int)this->cs_[idx]];
3058 	}
3059 
3060 	/**
3061 	 * Retrieve constant version of element i.
3062 	 */
3063 	inline const char& operator[](size_t i) const {
3064 		return this->get(i);
3065 	}
3066 
3067 	/**
3068 	 * Retrieve constant version of element i.
3069 	 */
get(size_t i)3070 	inline const char& get(size_t i) const {
3071 		assert_lt(i, this->len_);
3072 		assert_range(0, 4, (int)this->cs_[i]);
3073 		return this->cs_[i];
3074 	}
3075 
3076 	/**
3077 	 * Return the ith character in the window defined by fw, color,
3078 	 * depth and len.
3079 	 */
3080 	char windowGetDna(
3081 		size_t i,
3082 		bool   fw,
3083 		size_t depth = 0,
3084 		size_t len = 0) const
3085 	{
3086 		if(len == 0) len = this->len_;
3087 		assert_lt(i, len);
3088 		assert_leq(len, this->len_ - depth);
3089 		if(fw) return this->cs_[depth+i];
3090 		else   return compDna(this->cs_[depth+len-i-1]);
3091 	}
3092 
3093 	/**
3094 	 * Fill the given DNA buffer with the substring specified by fw,
3095 	 * color, depth and len.
3096 	 */
3097 	void windowGetDna(
3098 		SDnaStringExpandable<S, M>& buf,
3099 		bool   fw,
3100 		size_t depth = 0,
3101 		size_t len = 0) const
3102 	{
3103 		if(len == 0) len = this->len_;
3104 		assert_leq(len, this->len_ - depth);
3105 		for(size_t i = 0; i < len; i++) {
3106 			buf.append(fw ? this->cs_[depth+i] :
3107 			                compDna(this->cs_[depth+len-i-1]));
3108 		}
3109 	}
3110 
3111 	/**
3112 	 * Put a terminator in the 'len_'th element and then return a
3113 	 * pointer to the buffer.  Useful for printing.
3114 	 */
toZBuf()3115 	virtual const char* toZBuf() const { return this->toZBufXForm("ACGTN"); }
3116 };
3117 
3118 /**
3119  * Encapsulates an expandable DNA string with characters encoded as
3120  * char-sized masks.  Encodes A, C, G, T, and all IUPAC, as well as the
3121  * empty mask indicating "matches nothing."
3122  */
3123 template<int S = 16, int M = 2>
3124 class SDnaMaskString : public SStringExpandable<char, S, M> {
3125 public:
3126 
SDnaMaskString()3127 	explicit SDnaMaskString() : SStringExpandable<char, S, M>() { }
3128 
3129 	/**
3130 	 * Create an SStringFixed from another SStringFixed.
3131 	 */
SDnaMaskString(const SDnaMaskString<S,M> & o)3132 	SDnaMaskString(const SDnaMaskString<S, M>& o) :
3133 		SStringExpandable<char, S, M>(o) { }
3134 
3135 	/**
3136 	 * Create an SStringFixed from a C++ basic_string.
3137 	 */
SDnaMaskString(const std::basic_string<char> & str)3138 	explicit SDnaMaskString(const std::basic_string<char>& str) :
3139 		SStringExpandable<char, S, M>(str) { }
3140 
3141 	/**
3142 	 * Create an SStringFixed from an array and size.
3143 	 */
SDnaMaskString(const char * b,size_t sz)3144 	explicit SDnaMaskString(const char* b, size_t sz) :
3145 		SStringExpandable<char, S, M>(b, sz) { }
3146 
3147 	/**
3148 	 * Create an SStringFixed from a zero-terminated string.
3149 	 */
3150 	explicit SDnaMaskString(const char* b, bool chars = false) :
3151 		SStringExpandable<char, S, M>()
3152 	{
3153 		if(chars) {
3154 			installChars(b, strlen(b));
3155 		} else {
3156 			install(b, strlen(b));
3157 		}
3158 	}
3159 
~SDnaMaskString()3160 	virtual ~SDnaMaskString() { } // C++ needs this
3161 
3162 	/**
3163 	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
3164 	 * complementing them in the process, assuming an encoding where
3165 	 * 0=A, 1=C, 2=G, 3=T, 4=N.
3166 	 */
installReverseComp(const char * b,size_t sz)3167 	void installReverseComp(const char* b, size_t sz) {
3168 		while(this->sz_ < sz) {
3169 			this->expandNoCopy((sz + S) * M);
3170 		}
3171 		for(size_t i = 0; i < sz; i++) {
3172 			this->cs_[i] = maskcomp[(int)b[sz-i-1]];
3173 		}
3174 		this->len_ = sz;
3175 	}
3176 
3177 	/**
3178 	 * Copy 'sz' bytes from buffer 'b' into this string, reverse-
3179 	 * complementing them in the process, assuming an encoding where
3180 	 * 0=A, 1=C, 2=G, 3=T, 4=N.
3181 	 */
installReverseComp(const SDnaMaskString<S,M> & b)3182 	void installReverseComp(const SDnaMaskString<S, M>& b) {
3183 		while(this->sz_ < b.len_) {
3184 			this->expandNoCopy((b.len_ + S) * M);
3185 		}
3186 		for(size_t i = 0; i < b.len_; i++) {
3187 			this->cs_[i] = maskcomp[(int)b.cs_[b.len_-i-1]];
3188 		}
3189 		this->len_ = b.len_;
3190 	}
3191 
3192 	/**
3193 	 * Either reverse or reverse-complement (depending on "color") this
3194 	 * DNA buffer in-place.
3195 	 */
reverseComp()3196 	void reverseComp() {
3197 		for(size_t i = 0; i < (this->len_ >> 1); i++) {
3198 			char tmp1 = maskcomp[(int)this->cs_[i]];
3199 			char tmp2 = maskcomp[(int)this->cs_[this->len_-i-1]];
3200 			this->cs_[i] = tmp2;
3201 			this->cs_[this->len_-i-1] = tmp1;
3202 		}
3203 		// Do middle element iff there are an odd number
3204 		if((this->len_ & 1) != 0) {
3205 			char tmp = this->cs_[this->len_ >> 1];
3206 			tmp = maskcomp[(int)tmp];
3207 			this->cs_[this->len_ >> 1] = tmp;
3208 		}
3209 	}
3210 
3211 	/**
3212 	 * Copy 'sz' bytes from buffer 'b' into this string.
3213 	 */
install(const char * b,size_t sz)3214 	virtual void install(const char* b, size_t sz) {
3215 		while(this->sz_ < sz) {
3216 			this->expandNoCopy((sz + S) * M);
3217 		}
3218 		memcpy(this->cs_, b, sz);
3219 #ifndef NDEBUG
3220 		for(size_t i = 0; i < sz; i++) {
3221 			assert_range((int)this->cs_[i], 0, 15);
3222 		}
3223 #endif
3224 		this->len_ = sz;
3225 	}
3226 
3227 	/**
3228 	 * Copy buffer 'b' of ASCII DNA characters into DNA masks.
3229 	 */
installChars(const char * b,size_t sz)3230 	virtual void installChars(const char* b, size_t sz) {
3231 		while(this->sz_ < sz) {
3232 			this->expandNoCopy((sz + S) * M);
3233 		}
3234 		for(size_t i = 0; i < sz; i++) {
3235 			assert_in(b[i], iupacs);
3236 			this->cs_[i] = asc2dnamask[(int)b[i]];
3237 			assert_range((int)this->cs_[i], 0, 15);
3238 		}
3239 		this->len_ = sz;
3240 	}
3241 
3242 	/**
3243 	 * Copy C++ string of ASCII DNA characters into normal DNA
3244 	 * characters.
3245 	 */
installChars(const std::basic_string<char> & str)3246 	virtual void installChars(const std::basic_string<char>& str) {
3247 		installChars(str.c_str(), str.length());
3248 	}
3249 
3250 	/**
3251 	 * Set DNA character at index 'idx' to 'c'.
3252 	 */
set(int c,size_t idx)3253 	void set(int c, size_t idx) {
3254 		assert_lt(idx, this->len_);
3255 		assert_range(c, 0, 15);
3256 		this->cs_[idx] = c;
3257 	}
3258 
3259 	/**
3260 	 * Append DNA char c.
3261 	 */
append(const char & c)3262 	void append(const char& c) {
3263 		while(this->sz_ < this->len_+1) {
3264 			this->expandNoCopy((this->len_ + 1 + S) * M);
3265 		}
3266 		assert_range((int)c, 0, 15);
3267 		this->cs_[this->len_++] = c;
3268 	}
3269 
3270 	/**
3271 	 * Set DNA character at index 'idx' to 'c'.
3272 	 */
setChar(char c,size_t idx)3273 	void setChar(char c, size_t idx) {
3274 		assert_lt(idx, this->len_);
3275 		assert_in(toupper(c), iupacs);
3276 		this->cs_[idx] = asc2dnamask[(int)c];
3277 	}
3278 
3279 	/**
3280 	 * Append DNA character.
3281 	 */
appendChar(char c)3282 	void appendChar(char c) {
3283 		while(this->sz_ < this->len_+1) {
3284 			expandNoCopy((this->len_ + 1 + S) * M);
3285 		}
3286 		assert_in(toupper(c), iupacs);
3287 		this->cs_[this->len_++] = asc2dnamask[(int)c];
3288 	}
3289 
3290 	/**
3291 	 * Return DNA character corresponding to element 'idx'.
3292 	 */
toChar(size_t idx)3293 	char toChar(size_t idx) const {
3294 		assert_range((int)this->cs_[idx], 0, 15);
3295 		return mask2iupac[(int)this->cs_[idx]];
3296 	}
3297 
3298 	/**
3299 	 * Retrieve constant version of element i.
3300 	 */
3301 	const char& operator[](size_t i) const {
3302 		return this->get(i);
3303 	}
3304 
3305 	/**
3306 	 * Retrieve mutable version of element i.
3307 	 */
3308 	char& operator[](size_t i) {
3309 		return this->get(i);
3310 	}
3311 
3312 	/**
3313 	 * Retrieve constant version of element i.
3314 	 */
get(size_t i)3315 	const char& get(size_t i) const {
3316 		assert_lt(i, this->len_);
3317 		assert_range((int)this->cs_[i], 0, 15);
3318 		return this->cs_[i];
3319 	}
3320 
3321 	/**
3322 	 * Retrieve mutable version of element i.
3323 	 */
get(size_t i)3324 	char& get(size_t i) {
3325 		assert_lt(i, this->len_);
3326 		assert_range((int)this->cs_[i], 0, 15);
3327 		return this->cs_[i];
3328 	}
3329 
3330 	/**
3331 	 * Return the ith character in the window defined by fw, color,
3332 	 * depth and len.
3333 	 */
3334 	char windowGetDna(
3335 		size_t i,
3336 		bool   fw,
3337 		size_t depth = 0,
3338 		size_t len = 0) const
3339 	{
3340 		if(len == 0) len = this->len_;
3341 		assert_lt(i, len);
3342 		assert_leq(len, this->len_ - depth);
3343 		if(fw) return this->cs_[depth+i];
3344 		else   return maskcomp[this->cs_[depth+len-i-1]];
3345 	}
3346 
3347 	/**
3348 	 * Fill the given DNA buffer with the substring specified by fw,
3349 	 * color, depth and len.
3350 	 */
3351 	void windowGetDna(
3352 		SDnaStringFixed<S>& buf,
3353 		bool   fw,
3354 		size_t depth = 0,
3355 		size_t len = 0) const
3356 	{
3357 		if(len == 0) len = this->len_;
3358 		assert_leq(len, this->len_ - depth);
3359 		for(size_t i = 0; i < len; i++) {
3360 			buf.append(fw ? this->cs_[depth+i] :
3361 			                maskcomp[this->cs_[depth+len-i-1]]);
3362 		}
3363 	}
3364 
3365 	/**
3366 	 * Sample a random substring of the given length from this DNA
3367 	 * string and install the result in 'dst'.
3368 	 */
3369 	template<typename T>
3370 	void randSubstr(
3371 		RandomSource& rnd,  // pseudo-random generator
3372 		T& dst,             // put sampled substring here
3373 		size_t len,         // length of substring to extract
3374 		bool watson = true, // true -> possibly extract from Watson strand
3375 		bool crick = true)  // true -> possibly extract from Crick strand
3376 	{
3377 		assert(watson || crick);
3378 		assert_geq(this->len_, len);
3379 		size_t poss = this->len_ - len + 1;
3380 		assert_gt(poss, 0);
3381 		uint32_t rndoff = (uint32_t)(rnd.nextU32() % poss);
3382 		bool fw;
3383 		if     (watson && !crick) fw = true;
3384 		else if(!watson && crick) fw = false;
3385 		else {
3386 			fw = rnd.nextBool();
3387 		}
3388 		if(fw) {
3389 			// Install Watson substring
3390 			for(size_t i = 0; i < len; i++) {
3391 				dst[i] = this->cs_[i + rndoff];
3392 			}
3393 		} else {
3394 			// Install Crick substring
3395 			for(size_t i = 0; i < len; i++) {
3396 				dst[i] = maskcomp[(int)this->cs_[i + rndoff + (len - i - 1)]];
3397 			}
3398 		}
3399 	}
3400 
3401 	/**
3402 	 * Put a terminator in the 'len_'th element and then return a
3403 	 * pointer to the buffer.  Useful for printing.
3404 	 */
toZBuf()3405 	virtual const char* toZBuf() const { return this->toZBufXForm(iupacs); }
3406 };
3407 
3408 typedef SStringExpandable<char, 1024, 2> BTString;
3409 typedef SDnaStringExpandable<1024, 2>    BTDnaString;
3410 typedef SDnaMaskString<32, 2>            BTDnaMask;
3411 
3412 #endif /* SSTRING_H_ */
3413