1 /**
2 * @mainpage
3 * libestr - some essentials for string handling (and a bit more)
4 *
5 * Copyright 2010-2011 by Rainer Gerhards and Adiscon GmbH.
6 *
7 *
8 *//*
9 *
10 * libestr - some essentials for string handling (and a bit more)
11 * Copyright 2010 by Rainer Gerhards and Adiscon GmbH.
12 *
13 * This file is part of libestr.
14 *
15 * This library is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public
17 * License as published by the Free Software Foundation; either
18 * version 2.1 of the License, or (at your option) any later version.
19 *
20 * This library is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public
26 * License along with this library; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28 *
29 * A copy of the LGPL v2.1 can be found in the file "COPYING" in this distribution.
30 */
31 #ifndef LIBESTR_H_INCLUDED
32 #define LIBESTR_H_INCLUDED
33
34
/**
 * Data type for string sizes.
 * This is a plain unsigned int, so every length handled by the library
 * is limited to the range of that type.
 */
typedef unsigned int es_size_t;

/**
 * The string object.
 * @note
 * Lengths are stored as es_size_t (an unsigned int) rather than size_t,
 * because size_t tends to be 64 bits wide on 64 bit platforms. In almost
 * all cases 4GB is a sufficient upper limit on string size, and the
 * narrower members keep the per-string overhead small. That matters most
 * when a process holds a large number of strings.
 * For the same reason the object does \b not carry a pointer to a
 * traditional C string that would be created and freed automatically:
 * that would require another pointer (8 bytes of overhead on a 64 bit
 * machine).
 */
typedef struct
{
	/* word-aligned items */
	es_size_t lenStr;	/**< actual length of string;
				     MUST stay the first member, the
				     inline functions depend on it */
	es_size_t lenBuf;	/**< length of buffer (including free space) */
	/* non word-aligned items: currently none */
	/* NOTE: the string data itself is not a member. It is placed
	 * directly AFTER the last member and is reached by pointer
	 * arithmetic, which avoids storing one more pointer (8 bytes
	 * on 64 bit machines).
	 */
} es_str_t;
66
67
68 /**
69 * Return library version as a classical NUL-terminated C-String.
70 */
71 char *es_version(void);
72
73 /**
74 * Get the base address for the string's buffer.
75 * Proper use for library users is to gain read-only access to the buffer,
76 * so that it may be used inside an i/o request or similar things. Note that
77 * it is an \b invalid assumption that the buffer address keeps constant between
78 * library calls. This is only guaranteed for read-only methods. For example,
79 * the methods used to grow the string may be forced to reallocate the buffer
80 * on a new address with sufficiently free space.
81 *
82 * @param[in] s string object
83 * @returns address of buffer <b>Note: this is NOT a zero-terminated C string!</b>
84 */
85 static inline unsigned char *
es_getBufAddr(es_str_t * s)86 es_getBufAddr(es_str_t *s)
87 {
88 return ((unsigned char*) s) + sizeof(es_str_t);
89 }
90
91 /**
92 * Return length of provided string object.
93 */
es_strlen(es_str_t * str)94 static inline es_size_t es_strlen(es_str_t *str)
95 {
96 return(str->lenStr);
97 }
98
99 /**
100 * Create a new string object.
101 * @param[in] lenhint expected max length of string. Do \b not use too large value.
102 * @returns pointer to new object or NULL on error
103 */
104 es_str_t* es_newStr(es_size_t lenhint);
105
106 /**
107 * delete a string object.
108 * @param[in] str string to be deleted.
109 */
110 void es_deleteStr(es_str_t *str);
111
112
113 /**
114 * Create a new string object based on a "traditional" C string.
115 * @param[in] cstr traditional, '\0'-terminated C string
116 * @param[in] len length of cstr. Use strlen() if you don't know it, but often
117 * the length is known and we use this as a time-saver (if present).
118 * @returns pointer to new object or NULL on error
119 */
120 es_str_t* es_newStrFromCStr(const char *cstr, es_size_t len);
121
122
123 /**
124 * Create a new string object from a substring of an existing string.
125 * This involves copying the substring.
126 *
127 * @param[in] str original string
128 * @param[in] start beginning position of substring (0-based)
129 * @param[in] len length of substring to extract
130 * @returns pointer to new object or NULL on error
131 *
132 * If start > strlen, a valid (!) empty string will be returned. If
133 * start+len > strlen, the rest of the string starting at start will be
134 * returned.
135 */
136 es_str_t* es_newStrFromSubStr(es_str_t *str, es_size_t start, es_size_t len);
137
138
139 /**
140 * Create a new string object from a buffer.
141 * This involves copying the buffer.
142 *
143 * @param[in] buf buffer begin
144 * @param[in] len length of buffer
145 * @returns pointer to new object or NULL on error
146 */
147 es_str_t* es_newStrFromBuf(char *buf, es_size_t len);
148
149
150 /**
151 * Create a new string object from a number.
152 *
153 * @param[in] num number (a long long value to cover all)
154 * @returns pointer to new object or NULL on error
155 */
156 es_str_t* es_newStrFromNumber(long long num);
157
158
159 /**
160 * Empty a string.
161 * An existing string is set to empty state, but no allocation
162 * or allocation information is reset. This function is useful if
163 * the same string object is used several times within a loop
164 * and it shall be re-set to "" on each iteration. As the allocation
165 * is preserved, the string in most cases needs to grow only very
166 * few times. This is considered the fastest method to repeatedly
167 * work with temporary strings.
168 *
169 * @param[in] str the string to empty
170 */
171 static inline void
es_emptyStr(es_str_t * str)172 es_emptyStr(es_str_t *str)
173 {
174 str->lenStr = 0;
175 }
176
177
178 /**
179 * Duplicate a str.
180 * Currently, the string is actually duplicated. May be changed to
181 * copy-on-write in later releases.
182 *
183 * @param[in] str original string
184 * @returns pointer to new object or NULL on error
185 */
186 static inline es_str_t*
es_strdup(es_str_t * str)187 es_strdup(es_str_t *str)
188 {
189 return es_newStrFromSubStr(str, 0, es_strlen(str));
190 }
191
192
193 /**
194 * Compare a string against a buffer.
195 * Semantics are the same as strcmp(). This function is required in
196 * order to permit simple comparisons against C strings, what
197 * otherwise would require conversions. As a side-effect, it can also
198 * compare against substrings and other buffers of any type.
199 *
200 * @param[in] s string to compare
201 * @param[in] b buffer to compare against
202 * @param[in] len length of buffer
203 * @returns 0 if equal, negative if s<b, positive if s>b
204 */
205 int es_strbufcmp(es_str_t *s, const unsigned char *b, es_size_t len);
206
207 /** Case-insensitive version of es_strbufcmp.
208 */
209 int es_strcasebufcmp(es_str_t *s, const unsigned char *b, es_size_t len);
210
211
212 /**
213 * Convert a string to lower case. Once converted, this can not be
214 * undone. If the caller needs the original string, it must create
215 * a copy before calling tolower.
216 *
217 * @param[in] s string object to be converted
218 */
219 void es_tolower(es_str_t *s);
220
221 /**
222 * Compare two string objects.
223 * Semantics are the same as strcmp().
224 *
225 * @param[in] s1 frist string
226 * @param[in] s2 second string
227 * @returns 0 if equal, negative if s1<s2, positive if s1>s2
228 */
229 static inline int
es_strcmp(es_str_t * s1,es_str_t * s2)230 es_strcmp(es_str_t *s1, es_str_t *s2)
231 {
232 return es_strbufcmp(s1, es_getBufAddr(s2), s2->lenStr);
233 }
234
235 /** Case-insensitive version of es_strcmp.
236 */
237 static inline int
es_strcasecmp(es_str_t * s1,es_str_t * s2)238 es_strcasecmp(es_str_t *s1, es_str_t *s2)
239 {
240 return es_strcasebufcmp(s1, es_getBufAddr(s2), s2->lenStr);
241 }
242
243
244 /**
245 * Compare two string objects, but only the first n characters.
246 * Semantics are the same as strncmp().
247 *
248 * @param[in] s1 first string
249 * @param[in] s2 second string
250 * @param[in] len number of characters to compare
251 * @returns 0 if equal, negative if s1<s2, positive if s1>s2
252 */
253 int es_strncmp(es_str_t *s1, es_str_t *s2, es_size_t len);
254
255
256 /**
257 * This is the case insensitive version of es_strncmp. See there for
258 * further details.
259 */
260 int es_strncasecmp(es_str_t *s1, es_str_t *s2, es_size_t len);
261
262
263 /**
264 * Check if the second string is contained within the first string.
265 *
266 * @param[in] s1 first string
267 * @param[in] s2 second string
268 * @returns -1 if s2 is not contained in s1, otherwise the offset
269 * of the first location where it is contained. This is
270 * zero-based, so 0 as return indicates everything OK and s2
271 * is contained right at the start of s1.
272 */
273 int es_strContains(es_str_t *s1, es_str_t *s2);
274
275
276 /**
277 * This is the case-insensitive version of es_strContains. See there
278 * for further information.
279 */
280 int es_strCaseContains(es_str_t *s1, es_str_t *s2);
281
282
/**
 * A macro to compare a string against a constant C string.
 * Expands to an es_strbufcmp() call whose length is sizeof(constcstr) - 1,
 * i.e. the constant without its terminating NUL.
 *
 * @note constcstr MUST be a string literal or a char array: the length
 * is taken with sizeof, so passing a pointer would yield the pointer
 * size instead of the string length.
 *
 * @param str       string object (es_str_t*) to compare
 * @param constcstr constant C string to compare against
 * @returns whatever es_strbufcmp() returns (0 if equal)
 */
#define es_strconstcmp(str, constcstr) \
	es_strbufcmp((str), (const unsigned char*) (constcstr), sizeof(constcstr) - 1)
288
289 /**
290 * Extend string buffer.
291 * This is called if the size is insufficient. Note that the string
292 * pointer will be changed. This is an \b internal function that should
293 * \b not be called from any lib user app.
294 *
295 * @param[in/out] ps pointer to (pointer to) string to be extended
296 * @param[in] minNeeded minimum number of additional bytes needed
297 * @returns 0 on success, something else otherwise
298 */
299 int es_extendBuf(es_str_t **ps, es_size_t minNeeded);
300
301 /**
302 * Append a character to the current string object.
303 * Note that the pointer to the string object may change. This
304 * is because we may need to acquire more memory.
305 * @param[in/out] ps string to be extended (updateable pointer required!)
306 * @returns 0 on success, something else otherwise
307 */
308 int es_addChar(es_str_t **ps, const unsigned char c);
309
310
311 /**
312 * Append a memory buffer to a string.
313 * This is the method that almost all other append methods actually use.
314 *
315 * @param[in/out] ps1 updateable pointer to to-be-appended-to string
316 * @param[in] buf buffer to append
317 * @param[in] lenBuf length of buffer
318 *
319 * @returns 0 on success, something else otherwise
320 */
321 int es_addBuf(es_str_t **ps1, const char *buf, const es_size_t lenBuf);
322
/**
 * A macro to add a traditional C constant to a string.
 * Expands to an es_addBuf() call whose length is sizeof(constcstr) - 1,
 * i.e. the constant without its terminating NUL.
 *
 * @note constcstr MUST be a string literal or a char array: the length
 * is taken with sizeof, so passing a pointer would yield the pointer
 * size instead of the string length.
 *
 * @param str       updateable pointer (es_str_t**) to the string to
 *                  append to; it is passed on as es_addBuf()'s first
 *                  argument
 * @param constcstr constant C string to append
 * @returns whatever es_addBuf() returns (0 on success)
 */
#define es_addBufConstcstr(str, constcstr) \
	es_addBuf((str), (constcstr), sizeof(constcstr) - 1)
328
329 /**
330 * Append a second string to the first one.
331 *
332 * @param[in/out] ps1 updateable pointer to to-be-appended-to string
333 * @param[in] s2 string to append
334 *
335 * @returns 0 on success, something else otherwise
336 */
337 static inline int
es_addStr(es_str_t ** ps1,es_str_t * s2)338 es_addStr(es_str_t **ps1, es_str_t *s2)
339 {
340 return es_addBuf(ps1, (char*) es_getBufAddr(s2), s2->lenStr);
341 }
342
343 /**
344 * Obtain a traditional C-String from a string object.
345 * The string object is not modified. Note that the C string is not
346 * necessarily exactly the same string: C Strings can not contain NUL
347 * characters, and as such they need to be either encoded or dropped.
348 * This is done by this function. The user can specify with which character
349 * sequence (a traditional C String) it shall be replaced.
350 * @note
351 * This function has to do a lot of work, and should not be called unless
352 * absolutely necessary. If possible, use the native representation of
353 * the string object. For example, you can use the buffer address and
354 * string length in most i/o calls, if you use the native versions and avoid
355 * the C string i/o calls.
356 *
357 * @param[in] s string object
358 * @param[in] nulEsc escape sequence for NULs. If NULL, NUL characters will be dropped.
359 *
360 * @returns NULL in case of error, otherwise a suitably-encoded standard C string.
361 * This string is allocated from the dynamic memory pool and must be freed
362 * by the caller.
363 */
364 char *es_str2cstr(es_str_t *s, const char *nulEsc);
365
366 /**
367 * Obtain a number from the string object. The result is always valid
368 * and the number value is extracted as follows:
369 * - strings starting with "0x" are interpreted as being hex
370 * - strings starting with "0" are interpreted as being octal
371 * - strings starting with "-" are interpreted as negative decimal
372 * - all others are interpreted as positive decimal
373 * - octal and hex string are always unsigned
374 * - the number is made up from the longest sequence of (valid) digits
375 * from the start of the string. Trailing non-digits are ignored
376 * - if the string does not start with a valid digit, 0 is returned
377 * Note that the string always returns the best match as the number
378 * "represented" by the string. For example "1x234" will return the
379 * number 1 and "Test123" will return 0. You can use bSuccess to learn
380 * if the string could be converted completely (1) or only partially (0).
381 *
382 * @param[in] s string object
383 * @param[out] bSuccess 1 if the conversion was "successful", that means
384 * the whole string was number, 0 if "unsuccessful", that means
385 * the string was not a valid number. In this case, the first
386 * part of the string is treated as number. If the caller sets
387 * bSuccess to NULL, no conversion state information is returned.
388 *
389 * @returns number value as specified
390 */
391 long long es_str2num(es_str_t *s, int *bSuccess);
392
393 /**
394 * Unescape a string.
395 * The escape sequences defined below will be unescaped and replaced
396 * by a single character. The string is modified in place (note that
397 * space is always sufficient, because the resulting string will be
398 * smaller or of equal size). This function can not run into trouble,
399 * so it does not return a return status.
400 *
401 * The following escape sequences, inspired by the C language, are supported:
402 * (Note: double backslashes are for Doxygen, of course this is to
403 * be used with single backslashes):
404 * - \\0 NUL
405 * - \\a BEL
406 * - \\b Backspace
407 * - \\f FF
408 * - \\n LF
409 * - \\r CR
410 * - \\t HT
411 * - \\' single quotation mark
412 * - \\" double quotation mark
413 * - \\? question mark
414 * - \\\\ backslash character
415 * - \\ooo ASCII Character in octal notation (o being octal digit)
416 * - \\xhh ASCII character in hexadecimal notation
417 * - \\xhhhh Unicode character in hexadecimal notation
418 * All other escape sequences are undefined. Currently, this is
419 * interpreted as the escape character itself, but this is not
420 * guaranteed. Most importantly, a special meaning may be assigned
421 * to any of the currently-unassigned characters in the future.
422 *
423 * @param[in/out] s string object to unescape.
424 */
425 void es_unescapeStr(es_str_t *s);
426
427 #endif /* #ifndef LIBESTR_H_INCLUDED */
428