1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #ifndef _h_klib_text_
28 #define _h_klib_text_
29 
30 #ifndef _h_klib_extern_
31 #include <klib/extern.h>
32 #endif
33 
34 #ifndef _h_klib_defs_
35 #include <klib/defs.h>
36 #endif
37 
38 #include <stdarg.h>
39 #include <string.h>
40 
41 #ifdef __cplusplus
42 extern "C" {
43 #endif
44 
45 /*--------------------------------------------------------------------------
46  * UTF-8
47  *  UNICODE TRANSFORMATION FORMAT into sequences of 8-bit bytes
48  *
49  *  An ASCII character encoded with UTF-8 formatting will have a bit
50  *  pattern of 0b0xxxxxxx.
51  *
52  *  A non-ASCII UNICODE character encoded with UTF-8 formatting will
53  *  use 2..6 bytes, where the character length will be encoded into
54  *  the number of contiguous bits in the leading byte:
55  *
56  *    0b110xxxxx => 2 byte character
57  *    0b1110xxxx => 3 byte character
58  *    0b11110xxx => 4 byte character
59  *    0b111110xx => 5 byte character
60  *    0b1111110x => 6 byte character
61  *
62  *  all UTF-8 characters uniformly represent their non-leading bytes
63  *  by having the MSB set and the next bit 0:
64  *
65  *    0b10xxxxxx => non-leading byte
66  *
67  *  this allows a pointer to an arbitrary byte within UTF-8 to
68  *  be used to synchronize on the start of a character, since
69  *  non-starting bytes cannot be confused with start bytes. The test
70  *
71  *    if ( ( * utf8 & 0xC0 ) == 0x80 )
72  *      start_byte = false;
73  *
74  *  is sufficient for detecting this.
75  *
76  *  an understanding that string LENGTH ( the count of characters )
77  *  and string SIZE ( the count of bytes ) are different is critical.
78  */
79 
80 
81 /*--------------------------------------------------------------------------
82  * String
83  *  pseudo-intrinsic string
84  *
85  *  "addr" gives a pointer to constant UTF-8 data
86  *
87  *  "size" gives the number of BYTES in the UTF-8 string,
88  *   NOT the number of characters.
89  *
90  *  "len" gives the number of CHARACTERS in the UTF-8 string,
91  *   NOT the number of bytes.
92  *
93  *  with UTF-8 encoding, only ASCII-7 characters will be represented
94  *  using a single byte per character. In the case that a string is
95  *  composed of 100% ASCII-7 characters, String.size == String.len,
96  *  while if there is even a single non-ASCII ( multi-byte ) character,
97  *  String.size > String.len by definition.
98  *
99  *  indexing into a string by BYTES is ridiculous because it may
100  *  land in the middle of a character. String operations index by
101  *  CHARACTER so as to always designate an entire character.
102  */
103 typedef struct String String;
104 struct String
105 {
106     const char *addr;   /* pointer to constant UTF-8 data; NOTE(review): NUL termination is not stated - rely on "size" */
107     size_t size;        /* number of BYTES in the UTF-8 data */
108     uint32_t len;       /* number of CHARACTERS in the UTF-8 data */
109 };
110 
111 /* StringInit
112  *  initializes a String object
 *
 *  "s" [ OUT ] - pointer to the String being initialized
 *
 *  "val" [ IN ] - assigned to "addr"
 *
 *  "sz" [ IN ] - assigned to "size" ( bytes )
 *
 *  "length" [ IN ] - assigned to "len" ( characters )
 *
 *  the expansion is cast to void, so it yields no value.
 *  "s" is evaluated three times; do not pass an expression
 *  with side effects.
113  */
114 #define StringInit( s, val, sz, length ) \
115     ( void ) \
116         ( ( s ) -> addr = ( val ), \
117           ( s ) -> size = ( sz ), \
118           ( s ) -> len = ( length ) )
119 
/* StringInitCString
 *  initializes a String object from a C string
 *
 *  "s" [ OUT ] - pointer to the String being initialized
 *
 *  "cstr" [ IN ] - assigned to "addr" and handed to string_measure
 *  ( NOTE(review): presumably must be NUL-terminated and non-NULL,
 *  since no size is supplied - confirm against string_measure )
 *
 *  "len" receives the return value of string_measure, which
 *  stores the byte count through the address of "size".
 *
 *  the expansion is cast to void, so it yields no value.
 *  "s" is evaluated three times.
 */
120 #define StringInitCString( s, cstr ) \
121     ( void ) \
122         ( ( s ) -> len = string_measure \
123           ( ( s ) -> addr = ( cstr ), & ( s ) -> size ) )
124 
125 /* CONST_STRING
126  *  initialize a string from a manifest constant
 *
 *  "s" [ OUT ] - pointer to the String being initialized
 *
 *  "val" [ IN ] - a string literal or char array. its extent is
 *  taken with sizeof, so passing a char pointer yields the size
 *  of the pointer rather than of the text.
 *
 *  both "size" and "len" are set to sizeof ( val ) - 1, i.e. one
 *  byte per character is assumed. for a literal containing
 *  multi-byte UTF-8 characters "len" will be overstated.
 *
 *  the operand of sizeof is parenthesized so that an argument
 *  that is not a primary expression is measured as a whole.
127  */
128 #define CONST_STRING( s, val ) \
129     StringInit ( s, val, sizeof ( val ) - 1, sizeof ( val ) - 1 )
130 
131 /* StringSize
132  *  size of string in BYTES
 *
 *  "s" [ IN ] - pointer to a String; evaluated once
 *
 *  expands to the "size" member itself ( type size_t )
133  */
134 #define StringSize( s ) \
135     ( s ) -> size
136 
137 /* StringLength
138  *  length of string in CHARACTERS
 *
 *  "s" [ IN ] - pointer to a String; evaluated once
 *
 *  expands to the "len" member itself ( type uint32_t )
139  */
140 #define StringLength( s ) \
141     ( s ) -> len
142 
143 /* StringCopy
144  *  allocates a copy of a string
 *
 *  "cpy" [ OUT ] - receives the newly allocated String
 *  ( NOTE(review): presumably released with StringWhack - confirm )
 *
 *  "str" [ IN ] - the String to copy
 *
 *  returns an rc_t status code
145  */
146 KLIB_EXTERN rc_t CC StringCopy ( const String **cpy, const String *str );
147 
148 /* StringConcat
149  *  concatenate one string onto another
 *
 *  "cat" [ OUT ] - receives the resulting String; "a" and "b"
 *  are const and are not modified
 *  ( NOTE(review): presumably newly allocated with "a" first,
 *  then "b", and released with StringWhack - confirm )
 *
 *  returns an rc_t status code
150  */
151 KLIB_EXTERN rc_t CC StringConcat ( const String **cat,
152     const String *a, const String *b );
153 
154 /* StringSubstr
155  *  creates a substring of an existing one
156  *  note that the substring is always a non-allocated copy
157  *  and is dependent upon the lifetime of its source
158  *
159  *  returns "sub" if "idx" was valid
160  *  or NULL otherwise
161  *
162  *  "len" may be 0 to indicate infinite length
163  *  or may extend beyond end of source string.
 *
 *  String operations index by CHARACTER, so "idx" and "len"
 *  are character counts, not byte counts.
164  */
165 KLIB_EXTERN String* CC StringSubstr ( const String *str,
166     String *sub, uint32_t idx, uint32_t len );
167 
168 /* StringTrim
169  *  trims ascii white-space from both ends
170  *  returns trimmed string in "trimmed"
 *
 *  "str" [ IN ] - source String; const, not modified
 *
 *  "trimmed" [ OUT ] - receives the trimmed result
 *  ( NOTE(review): presumably a non-allocated view into "str"
 *  like StringSubstr, and presumably also the function's
 *  return value - confirm )
171  */
172 KLIB_EXTERN String * CC StringTrim ( const String * str, String * trimmed );
173 
174 /* StringHead
175  *  access the first character
176  *
177  *  this is an efficient enough function to be included.
178  *  the generic functions of accessing characters by index
179  *  are apt to be extremely inefficient with UTF-8, and
180  *  as such are not included.
181  *
182  *  returns EINVAL if the character is bad,
183  *  or ENODATA if the string is empty
 *
 *  NOTE(review): the declared return type is rc_t, so the
 *  errno-style names above presumably describe conditions
 *  reported through the rc rather than literal values - confirm.
 *
 *  "ch" [ OUT ] - receives the character as a 32-bit value
184  */
185 KLIB_EXTERN rc_t CC StringHead ( const String *str, uint32_t *ch );
186 
187 /* StringPopHead
188  *  remove and return the first character
189  *
190  *  returns EINVAL if the character is bad,
191  *  or ENODATA if the string is empty
 *
 *  NOTE(review): the declared return type is rc_t, so the
 *  errno-style names above presumably describe conditions
 *  reported through the rc rather than literal values - confirm.
 *
 *  "str" [ IN, OUT ] - non-const; modified to drop the character
 *
 *  "ch" [ OUT ] - receives the character as a 32-bit value
192  */
193 KLIB_EXTERN rc_t CC StringPopHead ( String *str, uint32_t *ch );
194 
195 /* StringEqual
196  *  compare strings for equality
197  *  not lexical for all characters
 *
 *  true when both byte sizes match and memcmp over that many
 *  bytes reports no difference. "len" is not consulted.
 *
 *  "a" is evaluated up to three times and "b" up to twice;
 *  do not pass expressions with side effects.
198  */
199 #define StringEqual( a, b ) \
200     ( ( a ) -> size == ( b ) -> size && \
201     memcmp ( ( a ) -> addr, ( b ) -> addr, ( a ) -> size ) == 0 )
202 
203 /* StringCompare
204  *  compare strings for relative ordering
205  *  not strictly lexical - generally orders by character code
 *
 *  NOTE(review): the int result presumably follows the strcmp
 *  convention ( < 0, 0, > 0 ) - confirm.
206  */
207 KLIB_EXTERN int CC StringCompare ( const String *a, const String *b );
208 
209 /* StringCaseEqual
210  *  compare strings for case-insensitive equality
 *
 *  returns bool, unlike the StringEqual macro which is an expression
211  */
212 KLIB_EXTERN bool CC StringCaseEqual ( const String *a, const String *b );
213 
214 /* StringCaseCompare
215  *  compare strings for relative case-insensitive ordering
 *
 *  NOTE(review): the int result presumably follows the strcmp
 *  convention ( < 0, 0, > 0 ) - confirm.
216  */
217 KLIB_EXTERN int CC StringCaseCompare ( const String *a, const String *b );
218 
219 /* StringOrder
220  * StringOrderNoNullCheck
221  *  compares strings as quickly as possible for
222  *  deterministic ordering: first by length, then
223  *  by binary ( byte-wise ) content.
224  *
225  *  performs more quickly than StringCompare for cases
226  *  where only deterministic ordering is needed ( e.g. symbol table ).
227  *
228  *  the "NoNullCheck" version will crash if either a or b is NULL
229  *  but avoids the overhead of checking when both are known to be good
 *
 *  both return int64_t rather than int; callers should not
 *  narrow the result before testing its sign.
230  */
231 KLIB_EXTERN int64_t CC StringOrder ( const String *a, const String *b );
232 KLIB_EXTERN int64_t CC StringOrderNoNullCheck ( const String *a, const String *b );
233 
234 /* StringMatch
235  *  creates a substring of "a" in "match"
236  *  for all of the sequential matching characters between "a" and "b"
237  *  starting from character [ 0 ].
238  *
239  *  returns the number of characters that match.
 *
 *  "match" [ OUT ] - receives the common leading portion of "a"
 *
 *  "a", "b" [ IN ] - the Strings being compared; const, not modified
240  */
241 KLIB_EXTERN uint32_t CC StringMatch ( String *match,
242     const String *a, const String *b );
243 
244 /* StringMatchExtend
245  *  extends a substring of "a" in "match"
246  *  for all of the sequential matching characters between "a" and "b"
247  *  starting from character [ match -> len ].
248  *
249  *  returns the number of matching characters that were extended.
 *
 *  "match" [ IN, OUT ] - its current "len" gives the starting
 *  character, so it must already hold a valid match
 *  ( e.g. from StringMatch ) before the call
250  */
251 KLIB_EXTERN uint32_t CC StringMatchExtend ( String *match,
252     const String *a, const String *b );
253 
254 /* StringHash
255  *  hash value for string
 *
 *  forwards the String's "addr" and "size" ( bytes ) to
 *  string_hash and yields its uint32_t result.
 *  "s" is evaluated twice.
256  */
257 #define StringHash( s ) \
258     string_hash ( ( s ) -> addr, ( s ) -> size )
259 
260 /* StringCopyUTF...
261  *  creates a String from UTF16 or UTF32 UNICODE input
262  *  wchar_t is one or the other, depending upon OS and compiler.
 *
 *  "cpy" [ OUT ] - receives the created String
 *
 *  "text" [ IN ] - source code units
 *
 *  "bytes" [ IN ] - NOTE(review): presumably the extent of
 *  "text" in bytes, not in code units - confirm
 *
 *  returns an rc_t status code
263  */
264 KLIB_EXTERN rc_t CC StringCopyUTF16 ( const String **cpy,
265     const uint16_t *text, size_t bytes );
266 KLIB_EXTERN rc_t CC StringCopyUTF32 ( const String **cpy,
267     const uint32_t *text, size_t bytes );
268 
269 /* StringCopyWChar_t
270  *  wchar_t is alternately defined as 16 or 32 bits
 *
 *  selects StringCopyUTF16 when sizeof ( wchar_t ) equals
 *  sizeof ( uint16_t ), otherwise StringCopyUTF32, casting
 *  "text" to the matching pointer type. the condition is a
 *  constant expression, but both calls appear in the expansion
 *  and must compile. yields the rc_t of the selected call.
271  */
272 #define StringCopyWChar_t( cpy, text, bytes ) \
273     ( ( sizeof ( wchar_t ) == sizeof ( uint16_t ) ) ? \
274       StringCopyUTF16 ( cpy, ( const uint16_t* ) ( text ), bytes ) : \
275       StringCopyUTF32 ( cpy, ( const uint32_t* ) ( text ), bytes ) )
276 
277 /* StringWhack
278  *  deallocates a string
279  *  ignores strings not allocated by this library
 *
 *  "self" [ IN ] - the String to release; taken as a const
 *  pointer so results of StringCopy etc. can be passed directly
 *  ( NOTE(review): NULL handling is not stated - confirm )
280  */
281 KLIB_EXTERN void CC StringWhack ( const String* self );
282 
283 
284 /* StringToInt
285  *  simple string conversion functions
286  *
287  *  these functions are defined to consume the entire string.
288  *  leading spaces are tolerated, repeated signs are accepted for signed conversion,
289  *  decimal and hex encodings are accepted for unsigned conversion,
290  *  decimal only for signed conversion.
291  *
292  *  "optional_rc" [ OUT, NULL OKAY ] - if non-null, user is interested
293  *  in error conditions. if the parameter is present, the string must be
294  *  completely consumed without overflow.
295  *
296  *  optional return values ( with { GetRCObject ( rc ), GetRCState ( rc ) } ):
297  *   0                            : no error
298  *   { rcRange, rcExcessive }     : integer overflow
299  *   { rcTransfer, rcIncomplete } : extra characters remain in string
300  *   { rcData, rcInsufficient }   : no numeric text was found
301  *
302  *  return values - regardless of "optional_rc":
303  *    val             : when no error
304  *    val             : on incomplete transfer
305  *    +/- max int64_t : when signed overflow occurs ( StringToI64 only )
306  *    max uint64_t    : when unsigned overflow occurs ( StringToU64 only )
307  *    0               : when no input text is found
 *
 *  NOTE(review): unlike the other functions in this header,
 *  these two are declared without the CC calling-convention
 *  macro - confirm this matches their definitions.
308  */
309 KLIB_EXTERN int64_t StringToI64 ( const String * self, rc_t * optional_rc );
310 KLIB_EXTERN uint64_t StringToU64 ( const String * self, rc_t * optional_rc );
311 
312 
313 /*--------------------------------------------------------------------------
314  * raw text strings
315  *  the internal representation of text strings is implementation
316  *  dependent. it is assumed to be ASCII-7 or UTF-8, although
317  *  this is determined by the implementation library of these functions.
318  *
319  * NB - ASCII implementations are no longer being provided
320  *  all text handling is UTF-8 unless explicitly stated otherwise
321  */
322 
323 /* string_size
324  *  length of string in bytes
 *
 *  NOTE(review): no size is supplied, so "str" is presumably
 *  required to be NUL-terminated - confirm.
325  */
326 KLIB_EXTERN size_t CC string_size ( const char *str );
327 
328 /* string_len
329  *  length of string in characters, when the size is known
 *
 *  "size" [ IN ] - extent of "str" in bytes
330  */
331 KLIB_EXTERN uint32_t CC string_len ( const char *str, size_t size );
332 
333 /* string_measure
334  *  measures length of string in both characters and bytes
 *
 *  returns the character count; the byte count is delivered
 *  through "size" ( this is how StringInitCString fills
 *  "len" and "size" respectively ).
335  */
336 KLIB_EXTERN uint32_t CC string_measure ( const char *str, size_t *size );
337 
338 /* string_copy
339  *  copies whole character text into a buffer
340  *  terminates with NUL byte if possible
341  *  returns the number of bytes copied
 *
 *  "dst_size" [ IN ] - capacity of "dst" in bytes
 *
 *  "src_size" [ IN ] - extent of "src" in bytes
342  */
343 KLIB_EXTERN size_t CC string_copy ( char *dst, size_t dst_size,
344     const char *src, size_t src_size );
345 
346 /* string_copy_measure
347  *  copies whole character text into a buffer
348  *  terminates with NUL byte if possible
349  *  returns the number of bytes copied
 *
 *  differs from string_copy in taking no "src_size"
 *  ( NOTE(review): presumably measures "src" as a
 *  NUL-terminated string itself - confirm )
350  */
351 KLIB_EXTERN size_t CC string_copy_measure ( char *dst, size_t dst_size, const char *src );
352 
353 /* string_dup
354  *  replaces the broken C library strndup
355  *  creates a NUL-terminated malloc'd string
 *
 *  "size" [ IN ] - extent of "str" in bytes
 *
 *  the result is malloc'd, so the caller owns it and
 *  releases it with free
 *  ( NOTE(review): presumably returns NULL on allocation
 *  failure - confirm )
356  */
357 KLIB_EXTERN char* CC string_dup ( const char *str, size_t size );
358 
359 /* string_dup_measure
360  *  replaces the broken C library strdup
361  *  creates a NUL-terminated malloc'd string
362  *  returns size of string unless "size" is NULL
 *
 *  "size" [ OUT, NULL OKAY ] - receives the size of the string
 *
 *  the result is malloc'd, so the caller owns it and
 *  releases it with free
363  */
364 KLIB_EXTERN char* CC string_dup_measure ( const char *str, size_t *size );
365 
366 /* tolower_copy
367  *  copies whole character text in lower-case
368  *  terminates with NUL byte if possible
369  *  returns the number of bytes copied
 *
 *  parameters mirror string_copy: "dst_size" and "src_size"
 *  are byte counts
370  */
371 KLIB_EXTERN size_t CC tolower_copy ( char *dst, size_t dst_size,
372     const char *src, size_t src_size );
373 
374 /* toupper_copy
375  *  copies whole character text in upper-case
376  *  terminates with NUL byte if possible
377  *  returns the number of bytes copied
 *
 *  parameters mirror string_copy: "dst_size" and "src_size"
 *  are byte counts
378  */
379 KLIB_EXTERN size_t CC toupper_copy ( char *dst, size_t dst_size,
380     const char *src, size_t src_size );
381 
382 /* string_cmp
383  *  performs a safe strncmp
384  *
385  *  "max_chars" limits the extent of the comparison
386  *  to not exceed supplied value, i.e. the number of
387  *  characters actually compared will be the minimum
388  *  of asize, bsize and max_chars.
389  *
390  *  if either string size ( or both ) < max_chars and
391  *  all compared characters match, then the result will
392  *  be a comparison of asize against bsize.
 *
 *  note that "asize" and "bsize" are byte counts while
 *  "max_chars" is a character count.
393  */
394 KLIB_EXTERN int CC string_cmp ( const char *a, size_t asize,
395     const char *b, size_t bsize, uint32_t max_chars );
396 
397 /* strcase_cmp
398  *  like string_cmp except case insensitive
399  */
400 KLIB_EXTERN int CC strcase_cmp ( const char *a, size_t asize,
401     const char *b, size_t bsize, uint32_t max_chars );
402 
403 /* string_match
404  *  returns the number of matching characters
405  *
406  *  "max_chars" limits the extent of the comparison
407  *  to not exceed supplied value, i.e. the number of
408  *  characters actually compared will be the minimum
409  *  of asize, bsize and max_chars.
410  *
411  *  "msize" will be set to the size of the matched string
412  *  if not NULL
 *
 *  the return value counts CHARACTERS, whereas "asize",
 *  "bsize" and "msize" are sizes in BYTES.
413  */
414 KLIB_EXTERN uint32_t CC string_match ( const char *a, size_t asize,
415     const char *b, size_t bsize, uint32_t max_chars, size_t *msize );
416 
417 /* strcase_match
418  *  like string_match except case insensitive
419  */
420 KLIB_EXTERN uint32_t CC strcase_match ( const char *a, size_t asize,
421     const char *b, size_t bsize, uint32_t max_chars, size_t *msize );
422 
423 /* string_chr
424  *  performs a safe strchr
425  *  "ch" is in UTF32
 *
 *  "size" [ IN ] - extent of "str" in bytes
 *
 *  the result is a non-const char* although "str" is const
 *  ( NOTE(review): presumably NULL when "ch" is not found,
 *  as with strchr - confirm )
426  */
427 KLIB_EXTERN char* CC string_chr ( const char *str, size_t size, uint32_t ch );
428 
429 /* string_rchr
430  *  performs a safe strrchr
 *  "ch" is a 32-bit value, as for string_chr
 *
 *  the result is a non-const char* although "str" is const
431  */
432 KLIB_EXTERN char* CC string_rchr ( const char *str, size_t size, uint32_t ch );
433 
434 /* string_brk
435  *  performs a safe strpbrk
 *
 *  the declaration below is excluded by "#if 0" and is
 *  therefore not part of the visible API
436  */
437 #if 0
438 KLIB_EXTERN char* CC string_brk ( const char *str, size_t size,
439     const char *accept, size_t asize );
440 #endif
441 
442 /* string_rbrk
 *  by its name, the reverse-search counterpart of string_brk
 *
 *  the declaration below is excluded by "#if 0" and is
 *  therefore not part of the visible API
443  */
444 #if 0
445 KLIB_EXTERN char* CC string_rbrk ( const char *str, size_t size,
446     const char *accept, size_t asize );
447 #endif
448 
449 /* string_hash
450  *  hashes a string
 *
 *  "size" [ IN ] - extent of "str" in bytes
 *
 *  returns a 32-bit hash value; used by the StringHash macro
451  */
452 KLIB_EXTERN uint32_t CC string_hash ( const char *str, size_t size );
453 
454 /* string_idx
455  *  seek an indexed character
456  *
457  *  the efficiency is based upon chosen internal
458  *  string representation, which, when using single byte chars,
459  *  is simple and efficient.
460  *
461  *  on the other hand, UTF-8 has a variable character width,
462  *  requiring scanning of the entire string until the indexed
463  *  character is found.
 *
 *  "size" [ IN ] - extent of "str" in bytes
 *
 *  "idx" [ IN ] - index of the wanted CHARACTER
 *  ( NOTE(review): presumably zero-based, with NULL returned
 *  when "idx" is out of range - confirm )
464  */
465 KLIB_EXTERN char* CC string_idx ( const char *str, size_t size, uint32_t idx );
466 
467 
468 /* string_to_int
469  *  simple string conversion functions
470  *
471  *  these functions are defined to consume the entire string.
472  *  leading spaces are tolerated, repeated signs are accepted for signed conversion,
473  *  decimal and hex encodings are accepted for unsigned conversion,
474  *  decimal only for signed conversion.
475  *
476  *  "optional_rc" [ OUT, NULL OKAY ] - if non-null, user is interested
477  *  in error conditions. if the parameter is present, the string must be
478  *  completely consumed without overflow.
479  *
480  *  optional return values ( with { GetRCObject ( rc ), GetRCState ( rc ) } ):
481  *   0                            : no error
482  *   { rcRange, rcExcessive }     : integer overflow
483  *   { rcTransfer, rcIncomplete } : extra characters remain in string
484  *   { rcData, rcInsufficient }   : no numeric text was found
485  *
486  *  return values - regardless of "optional_rc":
487  *    val             : when no error
488  *    val             : on incomplete transfer
489  *    +/- max int64_t : when signed overflow occurs ( string_to_I64 only )
490  *    max uint64_t    : when unsigned overflow occurs ( string_to_U64 only )
491  *    0               : when no input text is found
 *
 *  "size" [ IN ] - extent of "str" in bytes
 *
 *  NOTE(review): unlike the other functions in this header,
 *  these two are declared without the CC calling-convention
 *  macro - confirm this matches their definitions.
492  */
493 KLIB_EXTERN int64_t string_to_I64 ( const char * str, size_t size, rc_t * optional_rc );
494 KLIB_EXTERN uint64_t string_to_U64 ( const char * str, size_t size, rc_t * optional_rc );
495 
496 
497 /*--------------------------------------------------------------------------
498  * conversion between UTF-32 and UTF-8 UNICODE
499  */
500 
501 /* utf8_utf32
502  *  converts UTF-8 text to a single UTF-32 character
503  *  returns the number of UTF8 bytes consumed, such that:
504  *    return > 0 means success
505  *    return == 0 means insufficient input
506  *    return < 0 means bad input
 *
 *  "ch" [ OUT ] - receives the decoded character
 *
 *  "begin", "end" [ IN ] - bounds of the UTF-8 input
 *  ( NOTE(review): "end" is presumably one past the last
 *  readable byte - confirm )
507  */
508 KLIB_EXTERN int CC utf8_utf32 ( uint32_t *ch, const char *begin, const char *end );
509 
510 /* utf32_utf8
511  *  converts a single UTF-32 character to UTF-8 text
512  *  returns the number of UTF8 bytes generated, such that:
513  *    return > 0 means success
514  *    return == 0 means insufficient output
515  *    return < 0 means bad character
 *
 *  "begin", "end" [ IN ] - bounds of the writable output buffer
 *  ( NOTE(review): "end" is presumably one past the last
 *  writable byte; no NUL terminator is mentioned - confirm )
 *
 *  "ch" [ IN ] - the character to encode
516  */
517 KLIB_EXTERN int CC utf32_utf8 ( char *begin, char *end, uint32_t ch );
518 
519 
520 /*--------------------------------------------------------------------------
521  * support for 16 and 32-bit UTF formats
522  */
523 
524 /* utf16_string_size/len/measure
525  *  measures UTF-16 strings
 *
 *  these parallel string_size / string_len / string_measure:
 *  "_size" returns size_t, "_len" takes a known "size" and
 *  returns a uint32_t count, "_measure" returns a uint32_t
 *  count and reports a size through "size"
 *  ( NOTE(review): sizes are presumably in bytes rather than
 *  16-bit units - confirm )
526  */
527 KLIB_EXTERN size_t CC utf16_string_size ( const uint16_t *str );
528 KLIB_EXTERN uint32_t CC utf16_string_len ( const uint16_t *str, size_t size );
529 KLIB_EXTERN uint32_t CC utf16_string_measure ( const uint16_t *str, size_t *size );
530 
531 /* utf32_string_size/len/measure
 *  measures UTF-32 strings
 *
 *  same shape as the utf16_string_* functions
532  */
533 KLIB_EXTERN size_t CC utf32_string_size ( const uint32_t *str );
534 KLIB_EXTERN uint32_t CC utf32_string_len ( const uint32_t *str, size_t size );
535 KLIB_EXTERN uint32_t CC utf32_string_measure ( const uint32_t *str, size_t *size );
536 
537 /* wchar_string_size/len/measure
538  *  measures wchar_t strings
 *
 *  same shape as the utf16_string_* functions
539  */
540 KLIB_EXTERN size_t CC wchar_string_size ( const wchar_t *str );
541 KLIB_EXTERN uint32_t CC wchar_string_len ( const wchar_t *str, size_t size );
542 KLIB_EXTERN uint32_t CC wchar_string_measure ( const wchar_t *str, size_t *size );
543 
544 /* conversion from UTF-16 to internal standard
 *
 *  "_len" takes a known "src_size"; "_measure" reports the
 *  source size through "src_size"; both report a destination
 *  size through "dst_size" and return a uint32_t count.
 *  "_copy" writes into "dst" ( capacity "dst_size" ) and
 *  returns size_t.
 *  NOTE(review): "dst_size" is presumably the number of bytes
 *  the converted text needs, and the "_copy" result the number
 *  of bytes written - confirm.
 */
545 KLIB_EXTERN uint32_t CC utf16_cvt_string_len ( const uint16_t *src,
546     size_t src_size, size_t *dst_size );
547 KLIB_EXTERN uint32_t CC utf16_cvt_string_measure ( const uint16_t *src,
548     size_t *src_size, size_t *dst_size );
549 KLIB_EXTERN size_t CC utf16_cvt_string_copy ( char *dst, size_t dst_size,
550     const uint16_t *src, size_t src_size );
551 
552 /* conversion from UTF-32 to internal standard
 *  same shape as the UTF-16 conversion functions
 */
553 KLIB_EXTERN uint32_t CC utf32_cvt_string_len ( const uint32_t *src,
554     size_t src_size, size_t *dst_size );
555 KLIB_EXTERN uint32_t CC utf32_cvt_string_measure ( const uint32_t *src,
556     size_t *src_size, size_t *dst_size );
557 KLIB_EXTERN size_t CC utf32_cvt_string_copy ( char *dst, size_t dst_size,
558     const uint32_t *src, size_t src_size );
559 
560 /* conversion from wchar_t to internal standard
 *  same shape as the UTF-16 conversion functions
 */
561 KLIB_EXTERN uint32_t CC wchar_cvt_string_len ( const wchar_t *src,
562     size_t src_size, size_t *dst_size );
563 KLIB_EXTERN uint32_t CC wchar_cvt_string_measure ( const wchar_t *src,
564     size_t *src_size, size_t *dst_size );
565 KLIB_EXTERN size_t CC wchar_cvt_string_copy ( char *dst, size_t dst_size,
566     const wchar_t *src, size_t src_size );
567 
568 /* conversion to wchar_t from internal standard
 *  the reverse direction: "src" is char text in the internal
 *  representation and "dst" is a wchar_t buffer
 *  NOTE(review): whether "dst_size" and the result are in
 *  bytes or in wchar_t units is not stated - confirm.
 */
569 KLIB_EXTERN size_t CC string_cvt_wchar_copy ( wchar_t *dst, size_t dst_size,
570     const char *src, size_t src_size );
571 
572 /*--------------------------------------------------------------------------
573  * support for ISO-8859-x 8-bit character sets
574  */
575 
576 /* iso8859_utf32
577  *  converts 8-bit text to a single UTF-32 character
578  *  returns the number of 8-bit bytes consumed, such that:
579  *    return > 0 means success
580  *    return == 0 means insufficient input
581  *    return < 0 means bad input
 *
 *  "map" [ IN ] - 128-entry table of 32-bit values, such as
 *  iso8859_1 or cp1252
 *  ( NOTE(review): presumably gives the UTF-32 character for
 *  each byte value 0x80..0xFF - confirm )
 *
 *  "ch" [ OUT ] - receives the converted character
 *
 *  "begin", "end" [ IN ] - bounds of the 8-bit input
582  */
583 KLIB_EXTERN int CC iso8859_utf32 ( const uint32_t map [ 128 ],
584     uint32_t *ch, const char *begin, const char *end );
585 
586 /* iso8859_string_size/len/measure
587  *  measures ISO-8859-x strings
 *
 *  same shape as string_size / string_len / string_measure,
 *  with the character map passed as the first argument
588  */
589 KLIB_EXTERN size_t CC iso8859_string_size ( const uint32_t map [ 128 ],
590     const char *str );
591 KLIB_EXTERN uint32_t CC iso8859_string_len ( const uint32_t map [ 128 ],
592     const char *str, size_t size );
593 KLIB_EXTERN uint32_t CC iso8859_string_measure ( const uint32_t map [ 128 ],
594     const char *str, size_t *size );
595 
596 /* conversion from ISO-8859-x to internal standard
 *
 *  same shape as the utf16_cvt_string_* functions, with the
 *  character map passed as the first argument.
 *  NOTE(review): "dst_size" is presumably the number of bytes
 *  the converted text needs, and the "_copy" result the number
 *  of bytes written - confirm.
 */
597 KLIB_EXTERN uint32_t CC iso8859_cvt_string_len ( const uint32_t map [ 128 ],
598     const char *src, size_t src_size, size_t *dst_size );
599 KLIB_EXTERN uint32_t CC iso8859_cvt_string_measure ( const uint32_t map [ 128 ],
600     const char *src, size_t *src_size, size_t *dst_size );
601 KLIB_EXTERN size_t CC iso8859_cvt_string_copy ( const uint32_t map [ 128 ],
602     char *dst, size_t dst_size, const char *src, size_t src_size );
603 
604 /* some externally defined character maps
 *
 *  each is a constant table of 128 32-bit entries, suitable
 *  as the "map" argument of the iso8859_* functions. they are
 *  declared here and defined elsewhere in the library.
 */
605 KLIB_EXTERN_DATA const uint32_t iso8859_1 [ 128 ];
606 KLIB_EXTERN_DATA const uint32_t cp1252 [ 128 ];
607 
608 
609 #ifdef __cplusplus
610 }
611 #endif
612 
613 #endif /* _h_klib_text_ */
614