1 /**
2 * @file string.c
3 * Implements string handling
4 *
5 * @note: for efficiency reasons, later builds may spread the
6 * individual functions across different source modules. I was a
7 * bit lazy to do this right now and I am totally unsure if it
8 * really is worth the effort.
9 *//*
10 * libestr - some essentials for string handling (and a bit more)
11 * Copyright 2010 by Rainer Gerhards and Adiscon GmbH.
12 *
13 * This file is part of libestr.
14 *
15 * This library is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public
17 * License as published by the Free Software Foundation; either
18 * version 2.1 of the License, or (at your option) any later version.
19 *
20 * This library is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public
26 * License along with this library; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28 *
29 * A copy of the LGPL v2.1 can be found in the file "COPYING" in this distribution.
30 */
31 #include "config.h"
32 #include <stdlib.h>
33 #include <string.h>
34 #include <assert.h>
35 #include <ctype.h>
36 #include <errno.h>
37
38 #include "libestr.h"
39
/* Bail-out helper: sets the local result variable `r` to 1 and jumps to the
 * `done` label. It can only be used inside a function that declares both.
 * Note that the expansion is a bare brace block, not a do/while(0) wrapper.
 */
#define ERR_ABORT {r = 1; goto done; }
41
/* Object-ID sanity checks.
 * The active (#if 1) branch compiles both checks away to nothing. The
 * alternative branch validates the objID member of the string object:
 * CHECK_STR expects a variable `s`, a result variable `r` and a `done`
 * label in the using function; ASSERT_STR(s) asserts on the object ID.
 */
#if 1 /* !defined(NDEBUG) TODO: decide if we want this or not! */
#   define CHECK_STR
#   define ASSERT_STR(s)
#else
#   define CHECK_STR \
        if(s->objID != ES_STRING_OID) { \
            r = -1; \
            goto done; \
        }
#   define ASSERT_STR(s) assert((s)->objID == ES_STRING_OID)
#endif /* #if !defined(NDEBUG) */
53
54
55 /* ------------------------------ HELPERS ------------------------------ */
56
57 /**
58 * Extend string buffer.
59 * This is called if the size is insufficient. Note that the string
60 * pointer will be changed.
61 * @param[in/out] ps pointer to (pointo to) string to be extened
62 * @param[in] minNeeded minimum number of additional bytes needed
63 * @returns 0 on success, something else otherwise
64 */
65 int
es_extendBuf(es_str_t ** ps,es_size_t minNeeded)66 es_extendBuf(es_str_t **ps, es_size_t minNeeded)
67 {
68 int r = 0;
69 es_str_t *s = *ps;
70 es_size_t newSize;
71 es_size_t newAlloc;
72
73 ASSERT_STR(s);
74 /* first compute the new size needed */
75 if(minNeeded > s->lenBuf) {
76 newSize = s->lenBuf + minNeeded;
77 } else {
78 newSize = 2 * s->lenBuf;
79 }
80 if(newSize < minNeeded) { /* overflow? */
81 r = ENOMEM;
82 goto done;
83 }
84
85 newAlloc = newSize + sizeof(es_str_t);
86 if(newAlloc < newSize) { /* overflow? */
87 r = ENOMEM;
88 goto done;
89 }
90
91 if((s = (es_str_t*) realloc(s, newAlloc)) == NULL) {
92 r = errno;
93 goto done;
94 }
95 s->lenBuf = newSize;
96 *ps = s;
97
98 done:
99 return r;
100 }
101
102
103 /* ------------------------------ END HELPERS ------------------------------ */
104
105 es_str_t *
es_newStr(es_size_t lenhint)106 es_newStr(es_size_t lenhint)
107 {
108 es_str_t *s;
109 /* we round length to a multiple of 8 in the hope to reduce
110 * memory fragmentation.
111 */
112 if(lenhint & 0x07)
113 lenhint = lenhint - (lenhint & 0x07) + 8;
114
115 if(sizeof(es_str_t) + lenhint < lenhint) { /* overflow? */
116 s = NULL;
117 goto done;
118 }
119 if((s = malloc(sizeof(es_str_t) + lenhint)) == NULL)
120 goto done;
121
122 # ifndef NDEBUG
123 /*s->objID = ES_STRING_OID;*/
124 # endif
125 s->lenBuf = lenhint;
126 s->lenStr = 0;
127
128 done:
129 return s;
130 }
131
132
133 es_str_t*
es_newStrFromCStr(const char * cstr,es_size_t len)134 es_newStrFromCStr(const char *cstr, es_size_t len)
135 {
136 es_str_t *s;
137
138 if((s = es_newStr(len)) == NULL) goto done;
139 memcpy(es_getBufAddr(s), cstr, len);
140 s->lenStr = len;
141
142 done:
143 return s;
144 }
145
146
147 es_str_t*
es_newStrFromBuf(char * buf,es_size_t len)148 es_newStrFromBuf(char *buf, es_size_t len)
149 {
150 es_str_t *s;
151
152 if((s = es_newStr(len)) == NULL) goto done;
153
154 memcpy(es_getBufAddr(s), buf, len);
155 s->lenStr = len;
156
157 done:
158 return s;
159 }
160
161
162 es_str_t*
es_newStrFromNumber(long long num)163 es_newStrFromNumber(long long num)
164 {
165 char numbuf[20]; /* 2^64 has 20 digits ;) */
166 int i,j;
167 char minus = '\0';
168 es_str_t *s;
169 long long upperBorder = -9223372036854775807LL;
170 --upperBorder; /* handle number in C90 and newer modes */
171
172 /* handle border case */
173 if(num == upperBorder) {
174 s = es_newStrFromCStr("-9223372036854775808", 20);
175 goto done;
176 }
177
178 if (num < 0) {
179 minus = '-';
180 num = -num;
181 }
182
183 /* generate string (reversed) */
184 for(i = 0 ; num != 0 ; ++i) {
185 numbuf[i] = num % 10 + '0';
186 num /= 10;
187 }
188 if(i == 0)
189 numbuf [i++] = '0';
190 if (minus != '\0')
191 numbuf[i++] = minus;
192
193 /* now create the actual string */
194 if((s = es_newStr(i)) == NULL) goto done;
195 s->lenStr = i;
196 for(j = 0 ; --i >= 0 ; ++j) {
197 es_getBufAddr(s)[j] = numbuf[i];
198 }
199
200 done:
201 return s;
202 }
203
204
205 es_str_t*
es_newStrFromSubStr(es_str_t * str,es_size_t start,es_size_t len)206 es_newStrFromSubStr(es_str_t *str, es_size_t start, es_size_t len)
207 {
208 es_str_t *s;
209
210 if(start+len < start) {
211 s = NULL;
212 goto done;
213 }
214 if((s = es_newStr(len)) == NULL) goto done;
215
216 if(start > es_strlen(str))
217 goto done;
218 else if(start + len > es_strlen(str) - 1)
219 len = es_strlen(str) - start;
220
221 memcpy(es_getBufAddr(s), es_getBufAddr(str)+start, len);
222 s->lenStr = len;
223
224 done:
225 return s;
226 }
227
/**
 * Destroy a string object.
 * Releases the object's memory with free(). The pointer must not be used
 * afterwards.
 *
 * @param[in] s string to destroy
 */
void
es_deleteStr(es_str_t *s)
{
    ASSERT_STR(s);
    /* disabled debug aid: would mark the object as freed before release */
#   if 0 /*!defined(NDEBUG)*/
    s->objID = ES_STRING_FREED;
#   endif
    free(s);
}
237
238
239 int
es_strbufcmp(es_str_t * s,const unsigned char * buf,es_size_t lenBuf)240 es_strbufcmp(es_str_t *s, const unsigned char *buf, es_size_t lenBuf)
241 {
242 int r;
243 es_size_t i;
244 unsigned char *c;
245
246 ASSERT_STR(s);
247 assert(buf != NULL);
248 c = es_getBufAddr(s);
249 r = 0; /* assume: strings equal, will be reset if not */
250 for(i = 0 ; i < s->lenStr ; ++i) {
251 if(i == lenBuf) {
252 r = 1; /* strings are so far equal, but second string is smaller */
253 break;
254 }
255 if(c[i] != buf[i]) {
256 r = c[i] - buf[i];
257 break;
258 }
259 }
260 if(r == 0 && s->lenStr < lenBuf)
261 r = -1; /* strings are so far equal, but first string is smaller */
262 return r;
263 }
264
265
266 /* The following is the case-insensitive version of es_strbufcmp. It is
267 * a separate function for speed puprposes. However, the code is almost
268 * identical to es_strbufcmp, so when that one is updated, changes should
269 * be copied over to here as well. The only difference is the tolower()
270 * call, so change propagation is easy ;)
271 */
272 int
es_strcasebufcmp(es_str_t * s,const unsigned char * buf,es_size_t lenBuf)273 es_strcasebufcmp(es_str_t *s, const unsigned char *buf, es_size_t lenBuf)
274 {
275 int r;
276 es_size_t i;
277 unsigned char *c;
278
279 ASSERT_STR(s);
280 assert(buf != NULL);
281 c = es_getBufAddr(s);
282 r = 0; /* assume: strings equal, will be reset if not */
283 for(i = 0 ; i < s->lenStr ; ++i) {
284 if(i == lenBuf) {
285 r = 1;
286 break;
287 }
288 if(tolower(c[i]) != tolower(buf[i])) {
289 r = tolower(c[i]) - tolower(buf[i]);
290 break;
291 }
292 }
293 if(r == 0 && s->lenStr < lenBuf)
294 r = -1;
295 return r;
296 }
297 int
es_strncmp(es_str_t * s1,es_str_t * s2,es_size_t len)298 es_strncmp(es_str_t *s1, es_str_t *s2, es_size_t len)
299 {
300 int r;
301 es_size_t i;
302 unsigned char *c1, *c2;
303
304 ASSERT_STR(s1);
305 ASSERT_STR(s2);
306 c1 = es_getBufAddr(s1);
307 c2 = es_getBufAddr(s2);
308 r = 0; /* assume: strings equal, will be reset if not */
309 for(i = 0 ; i < len ; ++i) {
310 if(i >= s1->lenStr) {
311 if(i >= s2->lenStr) {
312 break; /* we are done, match ready */
313 } else {
314 r = -1; /* first string smaller --> less */
315 break;
316 }
317 } else {
318 if(i >= s2->lenStr) {
319 r = 1; /* first string smaller --> greater */
320 break;
321 } else {
322 if(c1[i] != c2[i]) {
323 r = c1[i] - c2[i];
324 break;
325 }
326 }
327 }
328 }
329 return r;
330 }
331
332
333 /* The following is the case-insensitive version of es_strContains. It is
334 * a separate function for speed puprposes. However, the code is almost
335 * identical to es_strContains, so when that one is updated, changes should
336 * be copied over to here as well. The only difference is the tolower()
337 * call, so change propagation is easy ;)
338 */
339 int
es_strncasecmp(es_str_t * s1,es_str_t * s2,es_size_t len)340 es_strncasecmp(es_str_t *s1, es_str_t *s2, es_size_t len)
341 {
342 int r;
343 es_size_t i;
344 unsigned char *c1, *c2;
345
346 ASSERT_STR(s1);
347 ASSERT_STR(s2);
348 c1 = es_getBufAddr(s1);
349 c2 = es_getBufAddr(s2);
350 r = 0; /* assume: strings equal, will be reset if not */
351 for(i = 0 ; i < len ; ++i) {
352 if(i >= s1->lenStr) {
353 if(i >= s1->lenStr) {
354 break; /* we are done, match ready */
355 } else {
356 r = -1; /* first string smaller --> less */
357 break;
358 }
359 } else {
360 if(i >= s1->lenStr) {
361 r = 1; /* first string smaller --> greater */
362 break;
363 } else {
364 if(tolower(c1[i]) != tolower(c2[i])) {
365 r = tolower(c1[i]) - tolower(c2[i]);
366 break;
367 }
368 }
369 }
370 }
371 return r;
372 }
373
374
375 int
es_strContains(es_str_t * s1,es_str_t * s2)376 es_strContains(es_str_t *s1, es_str_t *s2)
377 {
378 es_size_t i, j;
379 es_size_t max;
380 unsigned char *c1, *c2;
381 int r;
382
383 r = -1;
384 if(s2->lenStr > s1->lenStr) {
385 /* can not be contained ;) */
386 goto done;
387 }
388
389 c1 = es_getBufAddr(s1);
390 c2 = es_getBufAddr(s2);
391 max = s1->lenStr - s2->lenStr + 1;
392 for(i = 0 ; i < max ; ++i) {
393 for(j = 0 ; j < s2->lenStr; ++j) {
394 if(c1[i+j] != c2[j])
395 break;
396 }
397 if(j == s2->lenStr) {
398 r = i;
399 break;
400 }
401 }
402
403 done: return r;
404 }
405
406
407 /* The following is the case-insensitive version of es_strContains. It is
408 * a separate function for speed puprposes. However, the code is almost
409 * identical to es_strContains, so when that one is updated, changes should
410 * be copied over to here as well. The only difference is the tolower()
411 * call, so change propagation is easy ;)
412 */
413 int
es_strCaseContains(es_str_t * s1,es_str_t * s2)414 es_strCaseContains(es_str_t *s1, es_str_t *s2)
415 {
416 es_size_t i, j;
417 es_size_t max;
418 unsigned char *c1, *c2;
419 int r;
420
421 r = -1;
422 if(s2->lenStr > s1->lenStr) {
423 /* can not be contained ;) */
424 goto done;
425 }
426
427 c1 = es_getBufAddr(s1);
428 c2 = es_getBufAddr(s2);
429 max = s1->lenStr - s2->lenStr + 1;
430 for(i = 0 ; i < max ; ++i) {
431 for(j = 0 ; j < s2->lenStr; ++j) {
432 if(tolower(c1[i+j]) != tolower(c2[j]))
433 break;
434 }
435 if(j == s2->lenStr) {
436 r = i;
437 break;
438 }
439 }
440
441 done: return r;
442 }
443
444
445 int
es_addChar(es_str_t ** ps,const unsigned char c)446 es_addChar(es_str_t **ps, const unsigned char c)
447 {
448 int r = 0;
449
450 if((*ps)->lenStr >= (*ps)->lenBuf) {
451 if((r = es_extendBuf(ps, 1)) != 0) goto done;
452 }
453
454 /* ok, when we reach this, we have sufficient memory */
455 *(es_getBufAddr(*ps) + (*ps)->lenStr++) = c;
456
457 done:
458 return r;
459 }
460
461
462 int
es_addBuf(es_str_t ** ps1,const char * buf,es_size_t lenBuf)463 es_addBuf(es_str_t **ps1, const char *buf, es_size_t lenBuf)
464 {
465 int r;
466 es_size_t newlen;
467 es_str_t *s1 = *ps1;
468
469 ASSERT_STR(s1);
470 if(lenBuf == 0) {
471 r = 0;
472 goto done;
473 }
474
475 newlen = s1->lenStr + lenBuf;
476 if(newlen != (size_t) s1->lenStr + (size_t) lenBuf) {
477 r = ENOMEM;
478 goto done;
479 }
480 if(s1->lenBuf < newlen) {
481 /* we need to extend */
482 if((r = es_extendBuf(ps1, newlen - s1->lenBuf)) != 0) goto done;
483 s1 = *ps1;
484 }
485
486 /* do the actual copy, we now *have* the space required */
487 memcpy(es_getBufAddr(s1)+s1->lenStr, buf, lenBuf);
488 s1->lenStr = newlen;
489 r = 0; /* all well */
490
491 done:
492 return r;
493 }
494
495
496 char *
es_str2cstr(es_str_t * s,const char * nulEsc)497 es_str2cstr(es_str_t *s, const char *nulEsc)
498 {
499 char *cstr;
500 es_size_t lenEsc;
501 int nbrNUL;
502 es_size_t i;
503 size_t iDst;
504 unsigned char *c;
505
506 /* detect number of NULs inside string */
507 c = es_getBufAddr(s);
508 nbrNUL = 0;
509 for(i = 0 ; i < s->lenStr ; ++i) {
510 if(c[i] == 0x00)
511 ++nbrNUL;
512 }
513
514 if(nbrNUL == 0) {
515 /* no special handling needed */
516 if((cstr = malloc(s->lenStr + 1)) == NULL) goto done;
517 if(s->lenStr > 0)
518 memcpy(cstr, c, s->lenStr);
519 cstr[s->lenStr] = '\0';
520 } else {
521 /* we have NUL bytes present and need to process them
522 * during creation of the C string.
523 */
524 lenEsc = (nulEsc == NULL) ? 0 : strlen(nulEsc);
525 if((cstr = malloc(s->lenStr + nbrNUL * (lenEsc - 1) + 1)) == NULL)
526 goto done;
527 for(i = iDst = 0 ; i < s->lenStr ; ++i) {
528 if(c[i] == 0x00) {
529 if(lenEsc == 1) {
530 cstr[iDst++] = *nulEsc;
531 } else if(lenEsc > 1) {
532 memcpy(cstr + iDst, nulEsc, lenEsc);
533 iDst += lenEsc;
534 }
535 } else {
536 cstr[iDst++] = c[i];
537 }
538 }
539 cstr[iDst] = '\0';
540 }
541
542 done:
543 return cstr;
544 }
545
546 /*helpers to es_str2num */
547 /* startindex is provided for decimal to cover '-' */
548 static inline long long
es_str2num_dec(es_str_t * s,unsigned i,int * bSuccess)549 es_str2num_dec(es_str_t *s, unsigned i, int *bSuccess)
550 {
551 long long num;
552 unsigned char *c;
553
554 num = 0;
555 c = es_getBufAddr(s);
556 while(i < s->lenStr && isdigit(c[i])) {
557 num = num * 10 + c[i] - '0';
558 ++i;
559 }
560 if(bSuccess != NULL)
561 *bSuccess = (i == s->lenStr) ? 1 : 0;
562 return num;
563 }
564 static inline long long
es_str2num_oct(es_str_t * s,int * bSuccess)565 es_str2num_oct(es_str_t *s, int *bSuccess)
566 {
567 long long num;
568 unsigned char *c;
569 unsigned i;
570
571 i = 0;
572 num = 0;
573 c = es_getBufAddr(s);
574 while(i < s->lenStr && (c[i] >= '0' && c[i] <= '7')) {
575 num = num * 8 + c[i] - '0';
576 ++i;
577 }
578 if(bSuccess != NULL)
579 *bSuccess = (i == s->lenStr) ? 1 : 0;
580 return num;
581 }
582 static inline long long
es_str2num_hex(es_str_t * s,int * bSuccess)583 es_str2num_hex(es_str_t *s, int *bSuccess)
584 {
585 long long num;
586 unsigned char *c;
587 unsigned i;
588
589 i = 0;
590 num = 0;
591 c = es_getBufAddr(s) + 2;
592 while(i < s->lenStr && isxdigit(c[i])) {
593 if(isdigit(c[i]))
594 num = num * 16 + c[i] - '0';
595 else
596 num = num * 16 + tolower(c[i]) - 'a';
597 ++i;
598 }
599 if(bSuccess != NULL)
600 *bSuccess = (i == s->lenStr) ? 1 : 0;
601 return num;
602 }
603 /*end helpers to es_str2num */
604
605 long long
es_str2num(es_str_t * s,int * bSuccess)606 es_str2num(es_str_t *s, int *bSuccess)
607 {
608 long long num;
609 unsigned char *c;
610 if(s->lenStr == 0) {
611 num = 0;
612 if(bSuccess != NULL)
613 *bSuccess = 0;
614 goto done;
615 }
616
617 c = es_getBufAddr(s);
618 if(c[0] == '-') {
619 num = -es_str2num_dec(s, 1, bSuccess);
620 } else if(c[0] == '0') {
621 if(s->lenStr > 1 && c[1] == 'x') {
622 num = es_str2num_hex(s, bSuccess);
623 } else {
624 num = es_str2num_oct(s, bSuccess);
625 }
626 } else { /* decimal */
627 num = es_str2num_dec(s, 0, bSuccess);
628 }
629
630 done: return num;
631 }
632
633
/**
 * Get numerical value of a hex digit. This is a helper function.
 * @param[in] c a character containing 0..9, A..F, a..f; anything else
 *              is an (undetected) error.
 * @returns the digit's value (0..15 for valid input)
 */
static inline int hexDigitVal(char c)
{
    if(c >= 'a')
        return c - 'a' + 10;
    if(c >= 'A')
        return c - 'A' + 10;
    return c - '0';
}
650
651 /* Handle the actual unescaping.
652 * a helper to es_unescapeStr(), to help make the function easier to read.
653 */
654 static inline void
doUnescape(unsigned char * c,es_size_t lenStr,es_size_t * iSrc,es_size_t iDst)655 doUnescape(unsigned char *c, es_size_t lenStr, es_size_t *iSrc, es_size_t iDst)
656 {
657 if(c[*iSrc] == '\\') {
658 if(++(*iSrc) == lenStr) {
659 /* error, incomplete escape, treat as single char */
660 c[iDst] = '\\';
661 }
662 /* regular case, unescape */
663 switch(c[*iSrc]) {
664 case '0':
665 c[iDst] = '\0';
666 break;
667 case 'a':
668 c[iDst] = '\007';
669 break;
670 case 'b':
671 c[iDst] = '\b';
672 break;
673 case 'f':
674 c[iDst] = '\014';
675 break;
676 case 'n':
677 c[iDst] = '\n';
678 break;
679 case 'r':
680 c[iDst] = '\r';
681 break;
682 case 't':
683 c[iDst] = '\t';
684 break;
685 case '\'':
686 c[iDst] = '\'';
687 break;
688 case '"':
689 c[iDst] = '"';
690 break;
691 case '?':
692 c[iDst] = '?';
693 break;
694 case '\\':
695 c[iDst] = '\\';
696 break;
697 case 'x':
698 if((*iSrc)+1 == lenStr) {
699 /* just end run, leave as is */
700 *iSrc += 1;
701 goto done;
702 }
703 if( (*iSrc)+2 == lenStr
704 || !isxdigit(c[(*iSrc)+1])
705 || !isxdigit(c[(*iSrc)+2])) {
706 /* error, incomplete escape, use as is */
707 c[iDst] = '\\';
708 --(*iSrc);
709 }
710 c[iDst] = (hexDigitVal(c[(*iSrc)+1]) << 4) +
711 hexDigitVal(c[(*iSrc)+2]);
712 *iSrc += 2;
713 break;
714 default:
715 /* error, incomplete escape, use as is. Ideally we
716 should reject it instead, to allow for future
717 enhancements, but that would break ABI of
718 es_unescapeStr. */
719 c[iDst] = '\\';
720 --(*iSrc);
721 break;
722 }
723 } else {
724 /* regular character */
725 c[iDst] = c[*iSrc];
726 }
727 done: return;
728 }
729
730 void
es_unescapeStr(es_str_t * s)731 es_unescapeStr(es_str_t *s)
732 {
733 es_size_t iSrc, iDst;
734 unsigned char *c;
735 assert(s != NULL);
736
737 c = es_getBufAddr(s);
738 /* scan for first escape sequence (if we are luky, there is none!) */
739 iSrc = 0;
740 while(iSrc < s->lenStr && c[iSrc] != '\\')
741 ++iSrc;
742 /* now we have a sequence or end of string. In any case, we process
743 * all remaining characters (maybe 0!) and unescape.
744 */
745 if(iSrc != s->lenStr) {
746 iDst = iSrc;
747 while(iSrc < s->lenStr) {
748 doUnescape(c, s->lenStr, &iSrc, iDst);
749 ++iSrc;
750 ++iDst;
751 }
752 s->lenStr = iDst;
753 }
754 }
755
756 void
es_tolower(es_str_t * s)757 es_tolower(es_str_t *s)
758 {
759 es_size_t i;
760
761 for(i = 0 ; i < s->lenStr ; ++i)
762 es_getBufAddr(s)[i] = tolower(es_getBufAddr(s)[i]);
763 }
764