1 /* The MIT License
2 
3    Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
4 
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12 
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25 
26 #include <config.h>
27 
28 #include <stdarg.h>
29 #include <stdio.h>
30 #include <ctype.h>
31 #include <string.h>
32 #include <stdint.h>
33 #include <math.h>
34 #include "htslib/kstring.h"
35 
kputd(double d,kstring_t * s)36 int kputd(double d, kstring_t *s) {
37 	int len = 0;
38 	char buf[21], *cp = buf+20, *ep;
39 	if (d == 0) {
40 		if (signbit(d)) {
41 			kputsn("-0",2,s);
42 			return 2;
43 		} else {
44 			kputsn("0",1,s);
45 			return 1;
46 		}
47 	}
48 
49 	if (d < 0) {
50 		kputc('-',s);
51 		len = 1;
52 		d=-d;
53 	}
54 	if (!(d >= 0.0001 && d <= 999999)) {
55 		if (ks_resize(s, s->l + 50) < 0)
56 			return EOF;
57 		// We let stdio handle the exponent cases
58 		int s2 = sprintf(s->s + s->l, "%g", d);
59 		len += s2;
60 		s->l += s2;
61 		return len;
62 	}
63 
64 	uint64_t i = d*10000000000LL;
65 	// Correction for rounding - rather ugly
66 
67 	// Optimised for small numbers.
68 	// Better still would be __builtin_clz on hi/lo 32 and get the
69 	// starting point very rapidly.
70 	if (d<.0001)
71 		i+=0;
72 	else if (d<0.001)
73 		i+=5;
74 	else if (d < 0.01)
75 		i+=50;
76 	else if (d < 0.1)
77 		i+=500;
78 	else if (d < 1)
79 		i+=5000;
80 	else if (d < 10)
81 		i+=50000;
82 	else if (d < 100)
83 		i+=500000;
84 	else if (d < 1000)
85 		i+=5000000;
86 	else if (d < 10000)
87 		i+=50000000;
88 	else if (d < 100000)
89 		i+=500000000;
90 	else
91 		i+=5000000000LL;
92 
93 	do {
94 		*--cp = '0' + i%10;
95 		i /= 10;
96 	} while (i >= 1);
97 	buf[20] = 0;
98 	int p = buf+20-cp;
99 	if (p <= 10) { // d < 1
100 		//assert(d/1);
101 		cp[6] = 0; ep = cp+5;// 6 precision
102 		while (p < 10) {
103 			*--cp = '0';
104 			p++;
105 		}
106 		*--cp = '.';
107 		*--cp = '0';
108 	} else {
109 		char *xp = --cp;
110 		while (p > 10) {
111 			xp[0] = xp[1];
112 			p--;
113 			xp++;
114 		}
115 		xp[0] = '.';
116 		cp[7] = 0; ep=cp+6;
117 		if (cp[6] == '.') cp[6] = 0;
118 	}
119 
120 	// Cull trailing zeros
121 	while (*ep == '0' && ep > cp)
122 		ep--;
123 	char *z = ep+1;
124 	while (ep > cp) {
125 		if (*ep == '.') {
126 			if (z[-1] == '.')
127 				z[-1] = 0;
128 			else
129 				z[0] = 0;
130 			break;
131 		}
132 		ep--;
133 	}
134 
135 	int sl = strlen(cp);
136 	len += sl;
137 	kputsn(cp, sl, s);
138 	return len;
139 }
140 
kvsprintf(kstring_t * s,const char * fmt,va_list ap)141 int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
142 {
143 	va_list args;
144 	int l;
145 	va_copy(args, ap);
146 
147 	if (fmt[0] == '%' && fmt[1] == 'g' && fmt[2] == 0) {
148 		double d = va_arg(args, double);
149 		l = kputd(d, s);
150 		va_end(args);
151 		return l;
152 	}
153 
154 	l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
155 	va_end(args);
156 	if (l + 1 > s->m - s->l) {
157 		if (ks_resize(s, s->l + l + 2) < 0)
158 			return -1;
159 		va_copy(args, ap);
160 		l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
161 		va_end(args);
162 	}
163 	s->l += l;
164 	return l;
165 }
166 
ksprintf(kstring_t * s,const char * fmt,...)167 int ksprintf(kstring_t *s, const char *fmt, ...)
168 {
169 	va_list ap;
170 	int l;
171 	va_start(ap, fmt);
172 	l = kvsprintf(s, fmt, ap);
173 	va_end(ap);
174 	return l;
175 }
176 
kstrtok(const char * str,const char * sep_in,ks_tokaux_t * aux)177 char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux)
178 {
179 	const unsigned char *p, *start, *sep = (unsigned char *) sep_in;
180 	if (sep) { // set up the table
181 		if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
182 		aux->finished = 0;
183 		if (sep[0] && sep[1]) {
184 			aux->sep = -1;
185 			aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
186 			for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
187 		} else aux->sep = sep[0];
188 	}
189 	if (aux->finished) return 0;
190 	else if (str) start = (unsigned char *) str, aux->finished = 0;
191 	else start = (unsigned char *) aux->p + 1;
192 	if (aux->sep < 0) {
193 		for (p = start; *p; ++p)
194 			if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
195 	} else {
196 		for (p = start; *p; ++p)
197 			if (*p == aux->sep) break;
198 	}
199 	aux->p = (const char *) p; // end of token
200 	if (*p == 0) aux->finished = 1; // no more tokens
201 	return (char*)start;
202 }
203 
204 // s MUST BE a null terminated string; l = strlen(s)
ksplit_core(char * s,int delimiter,int * _max,int ** _offsets)205 int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
206 {
207 	int i, n, max, last_char, last_start, *offsets, l;
208 	n = 0; max = *_max; offsets = *_offsets;
209 	l = strlen(s);
210 
211 #define __ksplit_aux do {						\
212 		if (_offsets) {						\
213 			s[i] = 0;					\
214 			if (n == max) {					\
215 				int *tmp;				\
216 				max = max? max<<1 : 2;			\
217 				if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) {  \
218 					offsets = tmp;			\
219 				} else	{				\
220 					free(offsets);			\
221 					*_offsets = NULL;		\
222 					return 0;			\
223 				}					\
224 			}						\
225 			offsets[n++] = last_start;			\
226 		} else ++n;						\
227 	} while (0)
228 
229 	for (i = 0, last_char = last_start = 0; i <= l; ++i) {
230 		if (delimiter == 0) {
231 			if (isspace((int)((unsigned char) s[i])) || s[i] == 0) {
232 				if (isgraph(last_char))
233                     __ksplit_aux; // the end of a field
234 			} else {
235 				if (isspace(last_char) || last_char == 0)
236                     last_start = i;
237 			}
238 		} else {
239 			if (s[i] == delimiter || s[i] == 0) {
240 				if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
241 			} else {
242 				if (last_char == delimiter || last_char == 0) last_start = i;
243 			}
244 		}
245 		last_char = (int)((unsigned char)s[i]);
246 	}
247 	*_max = max; *_offsets = offsets;
248 	return n;
249 }
250 
kgetline(kstring_t * s,kgets_func * fgets_fn,void * fp)251 int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp)
252 {
253 	size_t l0 = s->l;
254 
255 	while (s->l == l0 || s->s[s->l-1] != '\n') {
256 		if (s->m - s->l < 200) {
257 			if (ks_resize(s, s->m + 200) < 0)
258 				return EOF;
259 		}
260 		if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break;
261 		s->l += strlen(s->s + s->l);
262 	}
263 
264 	if (s->l == l0) return EOF;
265 
266 	if (s->l > l0 && s->s[s->l-1] == '\n') {
267 		s->l--;
268 		if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
269 	}
270 	s->s[s->l] = '\0';
271 	return 0;
272 }
273 
274 /**********************
275  * Boyer-Moore search *
276  **********************/
277 
278 typedef unsigned char ubyte_t;
279 
280 // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
ksBM_prep(const ubyte_t * pat,int m)281 static int *ksBM_prep(const ubyte_t *pat, int m)
282 {
283 	int i, *suff, *prep, *bmGs, *bmBc;
284 	prep = (int*)calloc(m + 256, sizeof(int));
285 	bmGs = prep; bmBc = prep + m;
286 	{ // preBmBc()
287 		for (i = 0; i < 256; ++i) bmBc[i] = m;
288 		for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
289 	}
290 	suff = (int*)calloc(m, sizeof(int));
291 	{ // suffixes()
292 		int f = 0, g;
293 		suff[m - 1] = m;
294 		g = m - 1;
295 		for (i = m - 2; i >= 0; --i) {
296 			if (i > g && suff[i + m - 1 - f] < i - g)
297 				suff[i] = suff[i + m - 1 - f];
298 			else {
299 				if (i < g) g = i;
300 				f = i;
301 				while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
302 				suff[i] = f - g;
303 			}
304 		}
305 	}
306 	{ // preBmGs()
307 		int j = 0;
308 		for (i = 0; i < m; ++i) bmGs[i] = m;
309 		for (i = m - 1; i >= 0; --i)
310 			if (suff[i] == i + 1)
311 				for (; j < m - 1 - i; ++j)
312 					if (bmGs[j] == m)
313 						bmGs[j] = m - 1 - i;
314 		for (i = 0; i <= m - 2; ++i)
315 			bmGs[m - 1 - suff[i]] = m - 1 - i;
316 	}
317 	free(suff);
318 	return prep;
319 }
320 
kmemmem(const void * _str,int n,const void * _pat,int m,int ** _prep)321 void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
322 {
323 	int i, j, *prep = 0, *bmGs, *bmBc;
324 	const ubyte_t *str, *pat;
325 	str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
326 	prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
327 	if (_prep && *_prep == 0) *_prep = prep;
328 	bmGs = prep; bmBc = prep + m;
329 	j = 0;
330 	while (j <= n - m) {
331 		for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
332 		if (i >= 0) {
333 			int max = bmBc[str[i+j]] - m + 1 + i;
334 			if (max < bmGs[i]) max = bmGs[i];
335 			j += max;
336 		} else return (void*)(str + j);
337 	}
338 	if (_prep == 0) free(prep);
339 	return 0;
340 }
341 
kstrstr(const char * str,const char * pat,int ** _prep)342 char *kstrstr(const char *str, const char *pat, int **_prep)
343 {
344 	return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
345 }
346 
kstrnstr(const char * str,const char * pat,int n,int ** _prep)347 char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
348 {
349 	return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
350 }
351 
352 /***********************
353  * The main() function *
354  ***********************/
355 
356 #ifdef KSTRING_MAIN
357 #include <stdio.h>
main()358 int main()
359 {
360 	kstring_t *s;
361 	int *fields, n, i;
362 	ks_tokaux_t aux;
363 	char *p;
364 	s = (kstring_t*)calloc(1, sizeof(kstring_t));
365 	// test ksprintf()
366 	ksprintf(s, " abcdefg:    %d ", 100);
367 	printf("'%s'\n", s->s);
368 	// test ksplit()
369 	fields = ksplit(s, 0, &n);
370 	for (i = 0; i < n; ++i)
371 		printf("field[%d] = '%s'\n", i, s->s + fields[i]);
372 	// test kstrtok()
373 	s->l = 0;
374 	for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
375 		kputsn(p, aux.p - p, s);
376 		kputc('\n', s);
377 	}
378 	printf("%s", s->s);
379 	// free
380 	free(s->s); free(s); free(fields);
381 
382 	{
383 		static char *str = "abcdefgcdgcagtcakcdcd";
384 		static char *pat = "cd";
385 		char *ret, *s = str;
386 		int *prep = 0;
387 		while ((ret = kstrstr(s, pat, &prep)) != 0) {
388 			printf("match: %s\n", ret);
389 			s = ret + prep[0];
390 		}
391 		free(prep);
392 	}
393 	return 0;
394 }
395 #endif
396