1 #pragma once
2 
3 /** @file the_Foundation/string.h  UTF-8 string with copy-on-write semantics.
4 
5 String is derived from Block, and contains text with multibyte characters. When
6 iterating a string, the multibyte characters are converted to UTF-32 code points.
7 
8 String uses copy-on-write semantics (thanks to Block), so making copies is very
9 efficient. Conversions between String and Block are also trivial, and can be done
without duplicating the content. In fact, a pointer to a Block can simply be cast to
a String pointer and vice versa.
12 
13 @authors Copyright (c) 2017 Jaakko Keränen <jaakko.keranen@iki.fi>
14 
15 @par License
16 
17 Redistribution and use in source and binary forms, with or without
18 modification, are permitted provided that the following conditions are met:
19 
20 1. Redistributions of source code must retain the above copyright notice, this
21    list of conditions and the following disclaimer.
22 2. Redistributions in binary form must reproduce the above copyright notice,
23    this list of conditions and the following disclaimer in the documentation
24    and/or other materials provided with the distribution.
25 
26 <small>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
30 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
32 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
33 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</small>
36 */
37 
38 #include "defs.h"
39 #include "block.h"
40 
41 #include <limits.h>
42 
43 iBeginPublic
44 
/** A single character as a 32-bit code point (the UTF-32 form that String iteration yields). */
typedef uint32_t iChar;
46 
47 iChar   upper_Char          (iChar);
48 iChar   lower_Char          (iChar);
49 
50 iBool   isSpace_Char        (iChar);
51 iBool   isAlpha_Char        (iChar);
52 iBool   isNumeric_Char      (iChar);
53 iBool   isAlphaNumeric_Char (iChar);
54 iBool   isPunct_Char        (iChar);
55 
isVariationSelector_Char(iChar c)56 iLocalDef iBool isVariationSelector_Char(iChar c) {
57     return (c >= 0xfe00 && c <= 0xfe0f) || (c >= 0xe0100 && c <= 0xe0121);
58 }
isFitzpatrickType_Char(iChar c)59 iLocalDef iBool isFitzpatrickType_Char(iChar c) {
60     return c >= 0x1f3fb && c <= 0x1f3ff;
61 }
isDefaultIgnorable_Char(iChar c)62 iLocalDef iBool isDefaultIgnorable_Char(iChar c) {
63     return c == 0x115f || (c >= 0x200b && c <= 0x200e) || c == 0x2060 || c == 0x2061 ||
64            c == 0xfeff;
65 }
isEmoji_Char(iChar c)66 iLocalDef iBool isEmoji_Char(iChar c) {
67     return (c >= 0x1f300 && c < 0x1f700) || (c >= 0x1f7e0 && c <= 0x1f7eb) ||
68            (c >= 0x1f900 && c <= 0x1f9ff) || (c >= 0x1fa70 && c <= 0x1faff);
69 }
isDingbats_Char(iChar c)70 iLocalDef iBool isDingbats_Char(iChar c) {
71     return c >= 0x2702 && c <= 0x27b0;
72 }
isPictograph_Char(iChar c)73 iLocalDef iBool isPictograph_Char(iChar c) {
74     return (c == 0x21a9) ||
75            (c == 0x2218 || c == 0x2219) ||
76            (c >= 0x2300 && c <= 0x27bf) ||
77            (c >= 0x1f680 && c <= 0x1f6c0);
78 }
79 
/* Type declarations made with the project's iDeclareType macro (defined outside this file).
   NOTE(review): presumably each line introduces the `i`-prefixed name used in this header
   (e.g. iString for struct Impl_String) — confirm against defs.h. */
iDeclareType(String)
iDeclareType(StringList)
iDeclareType(StringComparison)
iDeclareType(MultibyteChar)
iDeclareType(Stream)
85 
/**
 * Table of C string comparison callbacks. The three signatures have the same
 * shape as strcmp(), strncmp() and strstr(), respectively.
 */
struct Impl_StringComparison {
    int   (*cmp)   (const char *s1, const char *s2);
    int   (*cmpN)  (const char *s1, const char *s2, size_t n);
    char *(*locate)(const char *s1, const char *s2);
};
91 
/* The two comparison tables whose addresses are passed by the cmp/startsWith/endsWith
   macros and inline helpers in this header; only declared here. */
iFoundationAPIData iStringComparison iCaseSensitive;
iFoundationAPIData iStringComparison iCaseInsensitive;
94 
/** String object: its only member is the Block that holds the characters. */
struct Impl_String {
    iBlock chars; /* handed out unmodified by utf8_String() */
};
98 
/* Builds an iString compound literal around a C string via iBlockLiteral.
   Note: `str` is expanded three times (twice inside strlen), so do not pass an
   expression with side effects. */
#define iStringLiteral(str)     (iString){ iBlockLiteral(str, strlen(str), strlen(str) + 1) }
100 
101 iDeclareTypeConstruction(String)
102 iDeclareTypeSerialization(String)
103 
104 iString *       newCStr_String      (const char *utf8CStr);
105 iString *       newCStrN_String     (const char *utf8CStr, size_t n);
106 iString *       newUtf16_String     (const uint16_t *utf16Str);
107 iString *       newUtf16N_String    (const uint16_t *utf16Str, size_t n);
108 iString *       newUnicode_String   (const iChar *ucs);
109 iString *       newUnicodeN_String  (const iChar *ucs, size_t n);
110 iString *       newLocalCStr_String (const char *localCStr);
111 iString *       newLocalCStrN_String(const char *localCStr, size_t n);
112 iString *       newBlock_String     (const iBlock *utf8Data);
113 iString *       newFormat_String    (const char *format, ...);
114 iString *       copy_String         (const iString *);
115 
newUtf32_String(const iChar * ucs)116 iLocalDef iString * newUtf32_String     (const iChar *ucs) { return newUnicode_String(ucs); }
newUtf32N_String(const iChar * ucs,size_t n)117 iLocalDef iString * newUtf32N_String    (const iChar *ucs, size_t n) { return newUnicodeN_String(ucs, n); }
newRange_String(const iRangecc range)118 iLocalDef iString * newRange_String     (const iRangecc range) { return newCStrN_String(range.start, size_Range(&range)); }
newLocal_String(const iBlock * localChars)119 iLocalDef iString * newLocal_String     (const iBlock *localChars) { return newLocalCStrN_String(cstr_Block(localChars), size_Block(localChars)); }
120 
121 iString *           collectNewFormat_String (const char *format, ...);
collectNewCStr_String(const char * cstr)122 iLocalDef iString * collectNewCStr_String   (const char *cstr) { return collect_String(newCStr_String(cstr)); }
collectNewRange_String(const iRangecc range)123 iLocalDef iString * collectNewRange_String  (const iRangecc range) { return collect_String(newRange_String(range)); }
124 
125 void            init_String             (iString *);
126 void            initCStr_String         (iString *, const char *utf8CStr);
127 void            initCStrN_String        (iString *, const char *utf8CStr, size_t n);
128 void            initUtf16_String        (iString *, const uint16_t *utf16Str);
129 void            initUtf16N_String       (iString *, const uint16_t *utf16Str, size_t n);
130 void            initUnicode_String      (iString *, const iChar *ucs);
131 void            initUnicodeN_String     (iString *, const iChar *ucs, size_t n);
132 void            initLocalCStr_String    (iString *, const char *localCStr);
133 void            initLocalCStrN_String   (iString *, const char *localCStr, size_t n);
134 void            initBlock_String        (iString *, const iBlock *chars);
135 void            initBlockEncoding_String(iString *, const iBlock *chars, const char *encoding);
136 void            initCopy_String         (iString *, const iString *other);
137 
initRange_String(iString * d,const iRangecc range)138 iLocalDef void initRange_String (iString *d, const iRangecc range) { initCStrN_String(d, range.start, size_Range(&range)); }
139 
140 const char *    cstr_String         (const iString *);
141 size_t          length_String       (const iString *);
142 size_t          size_String         (const iString *);
143 iString *       mid_String          (const iString *, size_t charStartPos, size_t charCount);
144 iString *       upper_String        (const iString *);
145 iString *       lower_String        (const iString *);
146 iStringList *   split_String        (const iString *, const char *separator);
147 iChar           first_String        (const iString *);
148 iChar           last_String         (const iString *);
149 iBlock *        toLocal_String      (const iString *);
150 iBlock *        toUtf16_String      (const iString *);
151 iBlock *        toUnicode_String    (const iString *);
152 
toUtf32_String(const iString * d)153 iLocalDef iBlock *toUtf32_String    (const iString *d) { return toUnicode_String(d); }
154 
155 iString *       urlEncode_String                (const iString *);
156 iString *       urlDecode_String                (const iString *);
157 iString *       urlEncodeExclude_String         (const iString *, const char *excluded);
158 iString *       urlDecodeExclude_String         (const iString *, const char *excluded);
159 iString *       maybeUrlEncodeExclude_String    (const iString *, const char *excluded); /* may return NULL */
160 iString *       maybeUrlDecodeExclude_String    (const iString *, const char *excluded); /* may return NULL */
161 
162 /**
163  * Returns a pointer to the string converted to the current locale's encoding.
164  * The temporary conversion is collected as garbage.
165  *
166  * @param str  String to convert.
167  *
168  * @return Converted text. The pointer will remain valid until garbage is recycled.
169  */
cstrLocal_String(const iString * str)170 iLocalDef const char *cstrLocal_String(const iString *str) {
171     return cstr_Block(collect_Block(toLocal_String(str)));
172 }
173 
cstrCollect_String(iString * d)174 iLocalDef const char *cstrCollect_String(iString *d) {
175     return cstr_String(collect_String(d));
176 }
177 
range_String(const iString * d)178 iLocalDef iRangecc range_String(const iString *d) {
179     const iRangecc r = { constBegin_Block(&(d)->chars), constEnd_Block(&(d)->chars) };
180     return r;
181 }
182 
utf8_String(const iString * d)183 iLocalDef const iBlock *utf8_String(const iString *d) {
184     return &d->chars; /* unmodified internal representation (UTF-8) */
185 }
186 
isEmpty_String(const iString * d)187 iLocalDef iBool         isEmpty_String   (const iString *d) { return size_String(d) == 0; }
constBegin_String(const iString * d)188 iLocalDef const char *  constBegin_String(const iString *d) { return cstr_String(d); }
constEnd_String(const iString * d)189 iLocalDef const char *  constEnd_String  (const iString *d) { return cstr_String(d) + size_String(d); }
190 
191 int             cmpSc_String        (const iString *, const char *cstr, const iStringComparison *);
192 int             cmpNSc_String       (const iString *, const char *cstr, size_t n, const iStringComparison *);
193 
194 #define         cmp_String(d, cstr)             cmpSc_String(d, cstr, &iCaseSensitive)
195 #define         cmpCase_String(d, cstr)         cmpSc_String(d, cstr, &iCaseInsensitive)
196 #define         cmpString_String(d, s)          cmpSc_String(d, cstr_String(s), &iCaseSensitive)
197 #define         cmpStringCase_String(d, s)      cmpSc_String(d, cstr_String(s), &iCaseInsensitive)
198 #define         cmpStringSc_String(d, s, sc)    cmpSc_String(d, cstr_String(s), sc)
199 
equal_String(const iString * d,const iString * other)200 iLocalDef iBool equal_String(const iString *d, const iString *other) {
201     return cmpString_String(d, other) == 0;
202 }
equalCase_String(const iString * d,const iString * other)203 iLocalDef iBool equalCase_String(const iString *d, const iString *other) {
204     return cmpStringCase_String(d, other) == 0;
205 }
206 
207 iBool           startsWithSc_String (const iString *, const char *cstr, const iStringComparison *);
208 iBool           endsWithSc_String   (const iString *, const char *cstr, const iStringComparison *);
209 
210 #define         startsWith_String(d, cstr)      startsWithSc_String(d, cstr, &iCaseSensitive)
211 #define         startsWithCase_String(d, cstr)  startsWithSc_String(d, cstr, &iCaseInsensitive)
212 #define         endsWith_String(d, cstr)        endsWithSc_String  (d, cstr, &iCaseSensitive)
213 #define         endsWithCase_String(d, cstr)    endsWithSc_String  (d, cstr, &iCaseInsensitive)
214 
215 size_t          indexOf_String              (const iString *, iChar ch);
216 size_t          indexOfCStr_String          (const iString *, const char *cstr);
217 size_t          indexOfCStrFrom_String      (const iString *, const char *cstr, size_t from);
218 size_t          indexOfCStrSc_String        (const iString *, const char *cstr, const iStringComparison *);
219 size_t          indexOfCStrFromSc_String    (const iString *, const char *cstr, size_t from, const iStringComparison *);
220 size_t          lastIndexOf_String          (const iString *, iChar ch);
221 size_t          lastIndexOfCStr_String      (const iString *, const char *cstr);
222 
223 #define         indexOfString_String(d, s)          indexOfCStr_String(d, cstr_String(s))
224 #define         indexOfStringFrom_String(d, s, pos) indexOfCStrFrom_String(d, cstr_String(s), pos)
225 #define         lastIndexOfString_String(d, s)      lastIndexOfCStr_String(d, cstr_String(s))
226 
contains_String(const iString * d,iChar ch)227 iLocalDef iBool contains_String(const iString *d, iChar ch) {
228     return indexOf_String(d, ch) != iInvalidPos;
229 }
230 
231 void            set_String      (iString *, const iString *other);
232 void            setCStr_String  (iString *, const char *cstr);
233 void            setCStrN_String (iString *, const char *cstr, size_t n);
234 void            setBlock_String (iString *, const iBlock *block);
235 void            format_String   (iString *, const char *format, ...);
236 
setRange_String(iString * d,iRangecc range)237 iLocalDef void setRange_String(iString *d, iRangecc range) {
238     setCStrN_String(d, range.start, size_Range(&range));
239 }
240 
241 void            append_String       (iString *, const iString *other);
242 void            appendCStr_String   (iString *, const char *cstr);
243 void            appendCStrN_String  (iString *, const char *cstr, size_t size);
244 void            appendChar_String   (iString *, iChar ch);
245 void            appendRange_String  (iString *, const iRangecc range);
246 void            appendFormat_String (iString *, const char *format, ...);
247 void            prepend_String      (iString *, const iString *other);
248 void            prependChar_String  (iString *, iChar ch);
249 void            prependCStr_String  (iString *, const char *cstr);
250 
251 void            clear_String        (iString *);
252 void            truncate_String     (iString *, size_t charCount);
253 void            removeEnd_String    (iString *, size_t charCount);
254 void            trimStart_String    (iString *);
255 void            trimEnd_String      (iString *);
256 void            trim_String         (iString *);
257 iString *       trimmed_String      (const iString *);
258 void            replace_String      (iString *, const char *src, const char *dst);
259 void            normalize_String    (iString *); /* NFC */
260 
261 int             toInt_String    (const iString *);
262 float           toFloat_String  (const iString *);
263 double          toDouble_String (const iString *);
264 
265 iString *       quote_String    (const iString *, iBool numericUnicode);
266 iString *       unquote_String  (const iString *);
267 
/* Helpers operating on plain C strings; declarations only. */
const char *format_CStr   (const char *format, ...);
const char *skipSpace_CStr(const char *cstr);
270 
rangeN_CStr(const char * cstr,size_t size)271 iLocalDef iRangecc rangeN_CStr  (const char *cstr, size_t size) {
272 #if __STDC_VERSION__ >= 201100L
273     return (iRangecc){ cstr, cstr + size };
274 #else
275     const iRangecc range = { cstr, cstr + size };
276     return range;
277 #endif
278 }
range_CStr(const char * cstr)279 iLocalDef iRangecc range_CStr(const char *cstr) {
280     return rangeN_CStr(cstr, strlen(cstr));
281 }
282 
283 const char *    cstr_Rangecc        (iRangecc); /* returns NULL-terminated collected copy */
284 const iString * string_Rangecc      (iRangecc); /* returns a collected String */
285 
286 iBool           isUtf8_Rangecc      (iRangecc); /* checks if the range is well-formed UTF-8 */
287 size_t          length_Rangecc      (iRangecc); /* returns number of characters in the range */
288 
289 #define         cmp_Rangecc(d, cstr) cmpCStrSc_Rangecc((d), (cstr), &iCaseSensitive)
290 int             cmpCStrSc_Rangecc   (iRangecc, const char *cstr, const iStringComparison *);
291 int             cmpCStrNSc_Rangecc  (iRangecc, const char *cstr, size_t n, const iStringComparison *);
292 iBool           startsWithSc_Rangecc(iRangecc, const char *cstr, const iStringComparison *);
293 iBool           endsWithSc_Rangecc  (iRangecc, const char *cstr, const iStringComparison *);
294 
isNull_Rangecc(const iRangecc d)295 iLocalDef iBool isNull_Rangecc(const iRangecc d) {
296     return d.start == NULL;
297 }
equal_Rangecc(const iRangecc d,const char * cstr)298 iLocalDef iBool equal_Rangecc(const iRangecc d, const char *cstr) {
299     return cmp_Rangecc(d, cstr) == 0;
300 }
equalCase_Rangecc(const iRangecc d,const char * cstr)301 iLocalDef iBool equalCase_Rangecc(const iRangecc d, const char *cstr) {
302     return cmpCStrSc_Rangecc(d, cstr, &iCaseInsensitive) == 0;
303 }
equalRange_Rangecc(const iRangecc d,const iRangecc other)304 iLocalDef iBool equalRange_Rangecc(const iRangecc d, const iRangecc other) {
305     return size_Range(&d) == size_Range(&other) &&
306            cmpCStrNSc_Rangecc(d, other.start, size_Range(&d), &iCaseSensitive) == 0;
307 }
equalRangeCase_Rangecc(const iRangecc d,const iRangecc other)308 iLocalDef iBool equalRangeCase_Rangecc(const iRangecc d, const iRangecc other) {
309     return size_Range(&d) == size_Range(&other) &&
310            cmpCStrNSc_Rangecc(d, other.start, size_Range(&d), &iCaseInsensitive) == 0;
311 }
startsWith_Rangecc(const iRangecc d,const char * cstr)312 iLocalDef iBool startsWith_Rangecc(const iRangecc d, const char *cstr) {
313     return startsWithSc_Rangecc(d, cstr, &iCaseSensitive);
314 }
startsWithCase_Rangecc(const iRangecc d,const char * cstr)315 iLocalDef iBool startsWithCase_Rangecc(const iRangecc d, const char *cstr) {
316     return startsWithSc_Rangecc(d, cstr, &iCaseInsensitive);
317 }
endsWith_Rangecc(const iRangecc d,const char * cstr)318 iLocalDef iBool endsWith_Rangecc(const iRangecc d, const char *cstr) {
319     return endsWithSc_Rangecc(d, cstr, &iCaseSensitive);
320 }
endsWithCase_Rangecc(const iRangecc d,const char * cstr)321 iLocalDef iBool endsWithCase_Rangecc(const iRangecc d, const char *cstr) {
322     return endsWithSc_Rangecc(d, cstr, &iCaseInsensitive);
323 }
324 
325 iStringList *   split_Rangecc       (iRangecc, const char *separator);
326 void            trimStart_Rangecc   (iRangecc *);
327 void            trimEnd_Rangecc     (iRangecc *);
328 void            trim_Rangecc        (iRangecc *);
329 
trimmed_Rangecc(iRangecc d)330 iLocalDef iRangecc trimmed_Rangecc(iRangecc d) {
331     trim_Rangecc(&d);
332     return d;
333 }
334 
335 size_t          lastIndexOfCStr_Rangecc     (iRangecc, const char *cstr);
336 
337 /**
338  * Finds the next range between separators. Empty ranges at the beginning and end of
339  * the string are ignored (i.e., when there is a separator at the beginning or the end
340  * of the string).
341  *
342  * A string containing nothing but the separator results in no split ranges.
343  *
344  * @param separator  Separator string.
345  * @param range      Next range. Must be initialized to zero. Subsequent ranges are
346  *                   searched based on the locations pointed to by this variable.
347  *
348  * @return @c iTrue, if a next range was found (@a range was updated).
349  */
350 iBool           nextSplit_Rangecc   (iRangecc, const char *separator, iRangecc *range);
351 
352 const char *    findAscii_Rangecc   (iRangecc, char ch);
353 
354 iString *       punyEncode_Rangecc  (iRangecc); /* RFC 3492 */
355 iString *       punyDecode_Rangecc  (iRangecc);
356 
357 iStringList *   split_CStr  (const char *cstr, const char *separator);
358 
/** @name Iterators */
///@{
/* Const iterator over a String, declared with the project's iterator macro
   (defined outside this file). */
iDeclareConstIterator(String, const iString *)
struct ConstIteratorImpl_String {
    iChar value;        /* NOTE(review): presumably the code point decoded at `pos` — confirm */
    const char *pos;    /* NOTE(review): presumably the current byte position in the data — confirm */
    const char *next;   /* NOTE(review): presumably where the following character begins — confirm */
    const iString *str; /* NOTE(review): presumably the string being iterated — confirm */
};
///@}
369 
370 /*-------------------------------------------------------------------------------------*/
371 
/* Size limit, in bytes, associated with one multibyte character. The `bytes`
   buffer of iMultibyteChar is one byte larger than this.
   NOTE(review): the extra byte is presumably for a terminating NUL — confirm. */
#define iMultibyteCharMaxSize ((size_t) 7)

/** Buffer for the encoded bytes of a single character. */
struct Impl_MultibyteChar {
    char bytes[8]; // UTF-8 encoding
};
377 
/* MultibyteChar API; declarations only. The decode functions write the decoded
   character to `ch_out`.
   NOTE(review): their int result is presumably a byte count or error code — confirm. */
void    init_MultibyteChar                  (iMultibyteChar *d, iChar ch);
int     decodeBytes_MultibyteChar           (const char *bytes, const char *end, iChar *ch_out);
int     decodePrecedingBytes_MultibyteChar  (const char *bytes, const char *start, iChar *ch_out);

const char *    cstrLocal_Char  (iChar ch); // locale-encoding
383 
/* C string comparison functions; declarations only. The inline *_CStr helpers in
   this header treat a zero result as "equal". */
int             iCmpStr     (const char *a, const char *b);
int             iCmpStrN    (const char *a, const char *b, size_t n);
int             iCmpStrRange(iRangecc, const char *cstr);
int             iCmpStrCase (const char *a, const char *b);
int             iCmpStrNCase(const char *a, const char *b, size_t len);
389 
equal_CStr(const char * a,const char * b)390 iLocalDef iBool equal_CStr(const char *a, const char *b) {
391     return iCmpStr(a, b) == 0;
392 }
393 
equalCase_CStr(const char * a,const char * b)394 iLocalDef iBool equalCase_CStr(const char *a, const char *b) {
395     return iCmpStrCase(a, b) == 0;
396 }
397 
startsWith_CStr(const char * str,const char * pfx)398 iLocalDef iBool startsWith_CStr(const char *str, const char *pfx) {
399     return iCmpStrN(str, pfx, strlen(pfx)) == 0;
400 }
401 
startsWithCase_CStr(const char * str,const char * pfx)402 iLocalDef iBool startsWithCase_CStr(const char *str, const char *pfx) {
403     return iCmpStrNCase(str, pfx, strlen(pfx)) == 0;
404 }
405 
/* C string utilities; declarations only.
   NOTE(review): judging by the names, iDupStr presumably duplicates a string (who
   frees the result?) and iStrStrN presumably is a length-limited substring search —
   confirm against the implementation. */
char *          iDupStr     (const char *);
char *          iStrStrN    (const char *, const char *, size_t);
408 
409 iEndPublic
410