1 /**
2  * @mainpage
3  * libestr - some essentials for string handling (and a bit more)
4  *
5  * Copyright 2010-2011 by Rainer Gerhards and Adiscon GmbH.
6  *
7  *
8  *//*
9  *
10  * libestr - some essentials for string handling (and a bit more)
11  * Copyright 2010 by Rainer Gerhards and Adiscon GmbH.
12  *
13  * This file is part of libestr.
14  *
15  * This library is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU Lesser General Public
17  * License as published by the Free Software Foundation; either
18  * version 2.1 of the License, or (at your option) any later version.
19  *
20  * This library is distributed in the hope that it will be useful,
21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23  * Lesser General Public License for more details.
24  *
25  * You should have received a copy of the GNU Lesser General Public
26  * License along with this library; if not, write to the Free Software
27  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
28  *
29  * A copy of the LGPL v2.1 can be found in the file "COPYING" in this distribution.
30  */
31 #ifndef LIBESTR_H_INCLUDED
32 #define	LIBESTR_H_INCLUDED
33 
34 
/**
 * Unsigned integral type used by the library for all string lengths and
 * buffer sizes.
 */
typedef unsigned es_size_t;
39 
40 /**
41  * The string object.
42  * @note
43  * We do not use es_size_t, because that tends to be 64 bits on 64 bit platforms.
44  * In almost all cases I can think of, 4GB is a sufficient upper limit on string
45  * size. So we use unsigned ints, which means we save a lot of space and efficieny,
46  * what is especially important if there is a large number of strings inside a
47  * process.
48  * For the same reason, we do \b not provide a way to create and automatically
49  * free a traditional C string. That would requre another pointer (8 bytes of
50  * overhead on a 64 bit machine!).
51  */
52 typedef struct
53 {
54 	/* word-aligned items */
55 	es_size_t lenStr;		/**< actual length of string,
56 					    MUST be first element of struct because
57 					    of inline functions! */
58 	es_size_t lenBuf;		/**< length of buffer (including free space) */
59 	/* non word-aligned items */
60 	/* --currently none-- */
61 	/* NOTE: the actual string data is placed AFTER the last data
62 	 * element. It is accessed by pointer arithmetic. This saves us
63 	 * storing another pointer (8 byte on 64bit machines!)
64 	 */
65 } es_str_t;
66 
67 
/**
 * Obtain the library version.
 *
 * @returns the version as a classical NUL-terminated C string
 */
char *es_version(void);
72 
73 /**
74  * Get the base address for the string's buffer.
75  * Proper use for library users is to gain read-only access to the buffer,
76  * so that it may be used inside an i/o request or similar things. Note that
77  * it is an \b invalid assumption that the buffer address keeps constant between
78  * library calls. This is only guaranteed for read-only methods. For example,
79  * the methods used to grow the string may be forced to reallocate the buffer
80  * on a new address with sufficiently free space.
81  *
82  * @param[in] s string object
83  * @returns address of buffer <b>Note: this is NOT a zero-terminated C string!</b>
84  */
85 static inline unsigned char *
es_getBufAddr(es_str_t * s)86 es_getBufAddr(es_str_t *s)
87 {
88 	return ((unsigned char*) s) + sizeof(es_str_t);
89 }
90 
91 /**
92  * Return length of provided string object.
93  */
es_strlen(es_str_t * str)94 static inline es_size_t es_strlen(es_str_t *str)
95 {
96 	return(str->lenStr);
97 }
98 
99 /**
100  * Create a new string object.
101  * @param[in] lenhint expected max length of string. Do \b not use too large value.
102  * @returns pointer to new object or NULL on error
103  */
104 es_str_t* es_newStr(es_size_t lenhint);
105 
106 /**
107  * delete a string object.
108  * @param[in] str string to be deleted.
109  */
110 void es_deleteStr(es_str_t *str);
111 
112 
113 /**
114  * Create a new string object based on a "traditional" C string.
115  * @param[in] cstr traditional, '\0'-terminated C string
116  * @param[in] len length of str. Use strlen() if you don't know it, but often it
117  *  		the length is known and we use this as a time-safer (if present).
118  * @returns pointer to new object or NULL on error
119  */
120 es_str_t* es_newStrFromCStr(const char *cstr, es_size_t len);
121 
122 
123 /**
124  * Create a new string object from a substring of an existing string.
125  * This involves copying the substring.
126  *
127  * @param[in] str original string
128  * @param[in] start beginning position of substring (0-based)
129  * @param[in] len length of substring to extract
130  * @returns pointer to new object or NULL on error
131  *
132  * If start > strlen, a valid (!) empty string will be returned. If
133  * start+len > strlen, the rest of the string starting at start will be
134  * returned.
135  */
136 es_str_t* es_newStrFromSubStr(es_str_t *str, es_size_t start, es_size_t len);
137 
138 
139 /**
140  * Create a new string object from a buffer.
141  * This involves copying the buffer.
142  *
143  * @param[in] buf buffer begin
144  * @param[in] len length of buffer
145  * @returns pointer to new object or NULL on error
146  */
147 es_str_t* es_newStrFromBuf(char *buf, es_size_t len);
148 
149 
150 /**
151  * Create a new string object from a number.
152  *
153  * @param[in] num number (a long long value to cover all)
154  * @returns pointer to new object or NULL on error
155  */
156 es_str_t* es_newStrFromNumber(long long num);
157 
158 
159 /**
160  * Empty a string.
161  * An existing string is set to empty state, but no allocation
162  * or allocation information is reset. This function is useful if
163  * the same string object is used several times within a loop
164  * and it shall be re-set to "" on each iteration. As the allocation
165  * is preserved, the string in most cases needs to grow only very
166  * few times. This is considered the fastest method to repeatedly
167  * work with temporary strings.
168  *
169  * @param[in] str the string to empty
170  */
171 static inline void
es_emptyStr(es_str_t * str)172 es_emptyStr(es_str_t *str)
173 {
174 	str->lenStr = 0;
175 }
176 
177 
178 /**
179  * Duplicate a str.
180  * Currently, the string is actually duplicated. May be changed to
181  * copy-on-write in later releases.
182  *
183  * @param[in] str original string
184  * @returns pointer to new object or NULL on error
185  */
186 static inline es_str_t*
es_strdup(es_str_t * str)187 es_strdup(es_str_t *str)
188 {
189 	return es_newStrFromSubStr(str, 0, es_strlen(str));
190 }
191 
192 
193 /**
194  * Compare a string against a buffer.
195  * Semantics are the same as strcmp(). This function is required in
196  * order to permit simple comparisons against C strings, what
197  * otherwise would require conversions. As a side-effect, it can also
198  * compare against substrings and other buffers of any type.
199  *
200  * @param[in] s string to compare
201  * @param[in] b buffer to compare against
202  * @param[in] len lenght of buffer
203  * @returns 0 if equal, negative if s<cs, positive if s>cs
204 */
205 int es_strbufcmp(es_str_t *s, const unsigned char *b, es_size_t len);
206 
207 /** Case-insensitive version of es_strcasebufcmp.
208  */
209 int es_strcasebufcmp(es_str_t *s, const unsigned char *b, es_size_t len);
210 
211 
212 /**
213  * Convert a string to lower case. Once converted, this can not be
214  * undone. If the caller needs the original string, it must create
215  * a copy before calling tolower.
216  *
217  * @param[in] s string object to be converted
218  */
219 void es_tolower(es_str_t *s);
220 
221 /**
222  * Compare two string objects.
223  * Semantics are the same as strcmp().
224  *
225  * @param[in] s1 frist string
226  * @param[in] s2 second string
227  * @returns 0 if equal, negative if s1<s2, positive if s1>s2
228 */
229 static inline int
es_strcmp(es_str_t * s1,es_str_t * s2)230 es_strcmp(es_str_t *s1, es_str_t *s2)
231 {
232 	return es_strbufcmp(s1, es_getBufAddr(s2), s2->lenStr);
233 }
234 
235 /** Case-insensitive version of es_strcmp.
236  */
237 static inline int
es_strcasecmp(es_str_t * s1,es_str_t * s2)238 es_strcasecmp(es_str_t *s1, es_str_t *s2)
239 {
240 	return es_strcasebufcmp(s1, es_getBufAddr(s2), s2->lenStr);
241 }
242 
243 
244 /**
245  * Compare two string objects, but only the first n characters.
246  * Semantics are the same as strncmp().
247  *
248  * @param[in] s1 frist string
249  * @param[in] s2 second string
250  * @param[in] len number of characters to compare
251  * @returns 0 if equal, negative if s1<s2, positive if s1>s2
252 */
253 int es_strncmp(es_str_t *s1, es_str_t *s2, es_size_t len);
254 
255 
256 /**
257  * This is the case insensitive version of es_strncmp. See there for
258  * further details.
259 */
260 int es_strncasecmp(es_str_t *s1, es_str_t *s2, es_size_t len);
261 
262 
263 /**
264  * Check if the second string is contained within the first string.
265  *
266  * @param[in] s1 frist string
267  * @param[in] s2 second string
268  * @returns -1 if s2 is not contained in s1, otherwise the offset
269  *             of the first location where it is contained. This is
270  *             zero-based, so 0 as return indicates everthing OK and s2
271  *             is contained right at the start of s1.
272 */
273 int es_strContains(es_str_t *s1, es_str_t *s2);
274 
275 
276 /**
277  * This is the case-insensitive version of es_strContains. See there
278  * for further information.
279 */
280 int es_strCaseContains(es_str_t *s1, es_str_t *s2);
281 
282 
/**
 * A macro to compare a string against a constant C string.
 *
 * The length is taken via sizeof, so constcstr must be a string literal
 * or a char array; a plain char pointer would yield the pointer size
 * instead of the string length.
 *
 * @param str string object (es_str_t *) to compare
 * @param constcstr constant C string to compare against
 * @returns the result of es_strbufcmp()
 */
#define es_strconstcmp(str, constcstr) \
	(es_strbufcmp((str), (const unsigned char *)(constcstr), \
		      sizeof(constcstr) - 1))
288 
289 /**
290  * Extend string buffer.
291  * This is called if the size is insufficient. Note that the string
292  * pointer will be changed. This is an \b internal function that should
293  * \b not be called from any lib user app.
294  *
295  * @param[in/out] ps pointer to (pointo to) string to be extened
296  * @param[in] minNeeded minimum number of additional bytes needed
297  * @returns 0 on success, something else otherwise
298  */
299 int es_extendBuf(es_str_t **ps, es_size_t minNeeded);
300 
301 /**
302  * Append a character to the current string object.
303  * Note that the pointer to the string object may change. This
304  * is because we may need to aquire more memory.
305  * @param[in/out] ps string to be extened (updatedable pointer required!)
306  * @returns 0 on success, something else otherwise
307  */
308 int es_addChar(es_str_t **ps, const unsigned char c);
309 
310 
311 /**
312  * Append a memory buffer to a string.
313  * This is the method that almost all other append methods actually use.
314  *
315  * @param[in/out] ps1 updateable pointer to to-be-appended-to string
316  * @param[in] buf buffer to append
317  * @param[in] lenBuf length of buffer
318  *
319  * @returns 0 on success, something else otherwise
320  */
321 int es_addBuf(es_str_t **ps1, const char *buf, const es_size_t lenBuf);
322 
/**
 * A macro to append a traditional C string constant to a string.
 *
 * The length is taken via sizeof, so constcstr must be a string literal
 * or a char array; a plain char pointer would yield the pointer size
 * instead of the string length.
 *
 * @param str updateable pointer (es_str_t **) to the string appended to
 * @param constcstr constant C string to append
 * @returns the result of es_addBuf()
 */
#define es_addBufConstcstr(str, constcstr) \
	(es_addBuf((str), (constcstr), sizeof(constcstr) - 1))
328 
329 /**
330  * Append a second string to the first one.
331  *
332  * @param[in/out] ps1 updateable pointer to to-be-appended-to string
333  * @param[in] s2 string to append
334  *
335  * @returns 0 on success, something else otherwise
336  */
337 static inline int
es_addStr(es_str_t ** ps1,es_str_t * s2)338 es_addStr(es_str_t **ps1, es_str_t *s2)
339 {
340 	return es_addBuf(ps1, (char*) es_getBufAddr(s2), s2->lenStr);
341 }
342 
343 /**
344  * Obtain a traditional C-String from a string object.
345  * The string object is not modified. Note that the C string is not
346  * necessarily exactly the same string: C Strings can not contain NUL
347  * characters, and as such they need to be either encoded or dropped.
348  * This is done by this function. The user can specify with which character
349  * sequence (a traditional C String) it shall be replaced.
350  * @note
351  * This function has to do a lot of work, and should not be called unless
352  * absolutely necessary. If possible, use the native representation of
353  * the string object. For example, you can use the buffer address and
354  * string length in most i/o calls, if you use the native versions and avoid
355  * the C string i/o calls.
356  *
357  * @param[in] s string object
358  * @param[in] nulEsc escape sequence for NULs. If NULL, NUL characters will be dropped.
359  *
360  * @returns NULL in case of error, otherwise a suitably-encoded standard C string.
361  * 	This string is allocated from the dynamic memory pool and must be freed
362  * 	by the caller.
363  */
364 char *es_str2cstr(es_str_t *s, const char *nulEsc);
365 
366 /**
367  * Obtain a number from the string object. The result is always valid
368  * and the number value is extracted as follows:
369  * - strings starting with "0x" are interpreted as being hex
370  * - strings starting with "0" are interpreted as being octal
371  * - strings starting with "-" are interpreted as negative decimal
372  * - all others are interpreted as postive decimal
373  * - octal and hex string are always unsigned
374  * - the number is made up from the longest sequence of (valid) digits
375  *   from the start of the string. Trailing non-digits are ignored
376  * - if the string does not start with a valid digit, 0 is returned
377  * Note that the string always returns the best match as the number
378  * "represented" by the string. For example "1x234" will return the
379  * number 1 and "Test123" will return 0. You can use bSuccess to learn
380  * if the string could be converted completely (1) or only partially (0).
381  *
382  * @param[in] s string object
383  * @param[out] bSucccess 1 if the conversion was "successful", that means
384  *             the whole string was number, 0 if "unsuccessful", that means
385  *             the string was not a valid number. In this case, the first
386  *             part of the string is treated as number. If the caller sets
387  *             bSuccess to NULL, no conversion state information is returned.
388  *
389  * @returns number value as specified
390  */
391 long long es_str2num(es_str_t *s, int *bSuccess);
392 
393 /**
394  * Unescape a string.
395  * The escape seqences defined below will be unescaped and replaced
396  * by a single character. The string is modified in place (note that
397  * space is always sufficient, because the resulting string will be
398  * smaller or of equal size). This function can not run into trouble,
399  * so it does not return a return status.
400  *
401  * The following escape sequences, inspired by the C language, are supported:
402  * (Note: double backslashes are for Doxygen, of course this is to
403  * be used with single backslashes):
404  * - \\0 NUL
405  * - \\a BEL
406  * - \\b Backspace
407  * - \\f FF
408  * - \\n LF
409  * - \\r CR
410  * - \\t HT
411  * - \\' singlu quotation mark
412  * - \\" double quotation mark
413  * - \\? question mark
414  * - \\\\ backslash character
415  * - \\ooo ASCII Character in octal notation (o being octal digit)
416  * - \\xhh ASCC character in hexadecimal notation
417  * - \\xhhhh Unicode characer in headecimal notation
418  * All other escape sequences are undefined. Currently, this is
419  * interpreted as the escape character itself, but this is not
420  * guaranteed. Most importantly, a special meaning may be assigned
421  * to any of the currently-unassigned characters in the future.
422  *
423  * @param[in/out] s string object to unescape.
424  */
425 void es_unescapeStr(es_str_t *s);
426 
427 #endif /* #ifndef LIBESTR_H_INCLUDED */
428