1 #pragma once
2
3 /** @file the_Foundation/string.h UTF-8 string with copy-on-write semantics.
4
5 String is derived from Block, and contains text with multibyte characters. When
6 iterating a string, the multibyte characters are converted to UTF-32 code points.
7
8 String uses copy-on-write semantics (thanks to Block), so making copies is very
9 efficient. Conversions between String and Block are also trivial, and can be done
without duplicating the content. In fact, a pointer to a Block can simply be cast to
a String pointer and vice versa.
12
13 @authors Copyright (c) 2017 Jaakko Keränen <jaakko.keranen@iki.fi>
14
15 @par License
16
17 Redistribution and use in source and binary forms, with or without
18 modification, are permitted provided that the following conditions are met:
19
20 1. Redistributions of source code must retain the above copyright notice, this
21 list of conditions and the following disclaimer.
22 2. Redistributions in binary form must reproduce the above copyright notice,
23 this list of conditions and the following disclaimer in the documentation
24 and/or other materials provided with the distribution.
25
26 <small>THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
30 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
32 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
33 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</small>
36 */
37
38 #include "defs.h"
39 #include "block.h"
40
41 #include <limits.h>
42
43 iBeginPublic
44
45 typedef uint32_t iChar;
46
47 iChar upper_Char (iChar);
48 iChar lower_Char (iChar);
49
50 iBool isSpace_Char (iChar);
51 iBool isAlpha_Char (iChar);
52 iBool isNumeric_Char (iChar);
53 iBool isAlphaNumeric_Char (iChar);
54 iBool isPunct_Char (iChar);
55
isVariationSelector_Char(iChar c)56 iLocalDef iBool isVariationSelector_Char(iChar c) {
57 return (c >= 0xfe00 && c <= 0xfe0f) || (c >= 0xe0100 && c <= 0xe0121);
58 }
isFitzpatrickType_Char(iChar c)59 iLocalDef iBool isFitzpatrickType_Char(iChar c) {
60 return c >= 0x1f3fb && c <= 0x1f3ff;
61 }
isDefaultIgnorable_Char(iChar c)62 iLocalDef iBool isDefaultIgnorable_Char(iChar c) {
63 return c == 0x115f || (c >= 0x200b && c <= 0x200e) || c == 0x2060 || c == 0x2061 ||
64 c == 0xfeff;
65 }
isEmoji_Char(iChar c)66 iLocalDef iBool isEmoji_Char(iChar c) {
67 return (c >= 0x1f300 && c < 0x1f700) || (c >= 0x1f7e0 && c <= 0x1f7eb) ||
68 (c >= 0x1f900 && c <= 0x1f9ff) || (c >= 0x1fa70 && c <= 0x1faff);
69 }
isDingbats_Char(iChar c)70 iLocalDef iBool isDingbats_Char(iChar c) {
71 return c >= 0x2702 && c <= 0x27b0;
72 }
isPictograph_Char(iChar c)73 iLocalDef iBool isPictograph_Char(iChar c) {
74 return (c == 0x21a9) ||
75 (c == 0x2218 || c == 0x2219) ||
76 (c >= 0x2300 && c <= 0x27bf) ||
77 (c >= 0x1f680 && c <= 0x1f6c0);
78 }
79
/* Type names used in this header. iDeclareType is defined outside this file;
   presumably it makes `iName` available as a typedef of `struct Impl_Name`
   (this header uses both spellings) -- confirm in defs.h. */
iDeclareType(String)
iDeclareType(StringList)
iDeclareType(StringComparison)
iDeclareType(MultibyteChar)
iDeclareType(Stream)
85
/* Table of function pointers passed to the *Sc_* functions in this header to select
   how C strings are compared and searched. The exact contract of each member is set
   by the implementations behind iCaseSensitive / iCaseInsensitive (not visible here). */
struct Impl_StringComparison {
    int     (*cmp)    (const char *, const char *);         /* compares two C strings */
    int     (*cmpN)   (const char *, const char *, size_t); /* presumably length-limited cmp -- TODO confirm */
    char *  (*locate) (const char *, const char *);         /* presumably a strstr-like search -- TODO confirm */
};

/* Comparison tables referenced by the convenience macros in this header
   (e.g., cmp_String and cmpCase_String). */
iFoundationAPIData iStringComparison iCaseSensitive;
iFoundationAPIData iStringComparison iCaseInsensitive;
94
/* A String consists solely of a Block holding its UTF-8 bytes (see utf8_String()),
   which is what allows Block and String pointers to be cast to each other as the
   file comment describes. */
struct Impl_String {
    iBlock chars;
};
98
/* Expands to an iString compound literal whose `chars` member is initialized with
   iBlockLiteral(str, strlen(str), strlen(str) + 1). Note that `str` appears three
   times in the expansion, so pass an expression without side effects. */
#define iStringLiteral(str) (iString){ iBlockLiteral(str, strlen(str), strlen(str) + 1) }
100
/* Construction and serialization declarations generated by macros defined outside
   this file. */
iDeclareTypeConstruction(String)
iDeclareTypeSerialization(String)

/* Functions returning an iString pointer built from the given input. The parameter
   names indicate the expected input: UTF-8, UTF-16, Unicode code points (iChar),
   locale-encoded text, a Block of UTF-8 data, a format string, or another String.
   NOTE(review): ownership of the returned pointer is not visible here -- confirm
   in the implementation. */
iString * newCStr_String (const char *utf8CStr);
iString * newCStrN_String (const char *utf8CStr, size_t n);
iString * newUtf16_String (const uint16_t *utf16Str);
iString * newUtf16N_String (const uint16_t *utf16Str, size_t n);
iString * newUnicode_String (const iChar *ucs);
iString * newUnicodeN_String (const iChar *ucs, size_t n);
iString * newLocalCStr_String (const char *localCStr);
iString * newLocalCStrN_String(const char *localCStr, size_t n);
iString * newBlock_String (const iBlock *utf8Data);
iString * newFormat_String (const char *format, ...);
iString * copy_String (const iString *);
115
newUtf32_String(const iChar * ucs)116 iLocalDef iString * newUtf32_String (const iChar *ucs) { return newUnicode_String(ucs); }
newUtf32N_String(const iChar * ucs,size_t n)117 iLocalDef iString * newUtf32N_String (const iChar *ucs, size_t n) { return newUnicodeN_String(ucs, n); }
newRange_String(const iRangecc range)118 iLocalDef iString * newRange_String (const iRangecc range) { return newCStrN_String(range.start, size_Range(&range)); }
newLocal_String(const iBlock * localChars)119 iLocalDef iString * newLocal_String (const iBlock *localChars) { return newLocalCStrN_String(cstr_Block(localChars), size_Block(localChars)); }
120
121 iString * collectNewFormat_String (const char *format, ...);
collectNewCStr_String(const char * cstr)122 iLocalDef iString * collectNewCStr_String (const char *cstr) { return collect_String(newCStr_String(cstr)); }
collectNewRange_String(const iRangecc range)123 iLocalDef iString * collectNewRange_String (const iRangecc range) { return collect_String(newRange_String(range)); }
124
125 void init_String (iString *);
126 void initCStr_String (iString *, const char *utf8CStr);
127 void initCStrN_String (iString *, const char *utf8CStr, size_t n);
128 void initUtf16_String (iString *, const uint16_t *utf16Str);
129 void initUtf16N_String (iString *, const uint16_t *utf16Str, size_t n);
130 void initUnicode_String (iString *, const iChar *ucs);
131 void initUnicodeN_String (iString *, const iChar *ucs, size_t n);
132 void initLocalCStr_String (iString *, const char *localCStr);
133 void initLocalCStrN_String (iString *, const char *localCStr, size_t n);
134 void initBlock_String (iString *, const iBlock *chars);
135 void initBlockEncoding_String(iString *, const iBlock *chars, const char *encoding);
136 void initCopy_String (iString *, const iString *other);
137
initRange_String(iString * d,const iRangecc range)138 iLocalDef void initRange_String (iString *d, const iRangecc range) { initCStrN_String(d, range.start, size_Range(&range)); }
139
140 const char * cstr_String (const iString *);
141 size_t length_String (const iString *);
142 size_t size_String (const iString *);
143 iString * mid_String (const iString *, size_t charStartPos, size_t charCount);
144 iString * upper_String (const iString *);
145 iString * lower_String (const iString *);
146 iStringList * split_String (const iString *, const char *separator);
147 iChar first_String (const iString *);
148 iChar last_String (const iString *);
149 iBlock * toLocal_String (const iString *);
150 iBlock * toUtf16_String (const iString *);
151 iBlock * toUnicode_String (const iString *);
152
toUtf32_String(const iString * d)153 iLocalDef iBlock *toUtf32_String (const iString *d) { return toUnicode_String(d); }
154
155 iString * urlEncode_String (const iString *);
156 iString * urlDecode_String (const iString *);
157 iString * urlEncodeExclude_String (const iString *, const char *excluded);
158 iString * urlDecodeExclude_String (const iString *, const char *excluded);
159 iString * maybeUrlEncodeExclude_String (const iString *, const char *excluded); /* may return NULL */
160 iString * maybeUrlDecodeExclude_String (const iString *, const char *excluded); /* may return NULL */
161
162 /**
163 * Returns a pointer to the string converted to the current locale's encoding.
164 * The temporary conversion is collected as garbage.
165 *
166 * @param str String to convert.
167 *
168 * @return Converted text. The pointer will remain valid until garbage is recycled.
169 */
cstrLocal_String(const iString * str)170 iLocalDef const char *cstrLocal_String(const iString *str) {
171 return cstr_Block(collect_Block(toLocal_String(str)));
172 }
173
cstrCollect_String(iString * d)174 iLocalDef const char *cstrCollect_String(iString *d) {
175 return cstr_String(collect_String(d));
176 }
177
range_String(const iString * d)178 iLocalDef iRangecc range_String(const iString *d) {
179 const iRangecc r = { constBegin_Block(&(d)->chars), constEnd_Block(&(d)->chars) };
180 return r;
181 }
182
utf8_String(const iString * d)183 iLocalDef const iBlock *utf8_String(const iString *d) {
184 return &d->chars; /* unmodified internal representation (UTF-8) */
185 }
186
isEmpty_String(const iString * d)187 iLocalDef iBool isEmpty_String (const iString *d) { return size_String(d) == 0; }
constBegin_String(const iString * d)188 iLocalDef const char * constBegin_String(const iString *d) { return cstr_String(d); }
constEnd_String(const iString * d)189 iLocalDef const char * constEnd_String (const iString *d) { return cstr_String(d) + size_String(d); }
190
191 int cmpSc_String (const iString *, const char *cstr, const iStringComparison *);
192 int cmpNSc_String (const iString *, const char *cstr, size_t n, const iStringComparison *);
193
194 #define cmp_String(d, cstr) cmpSc_String(d, cstr, &iCaseSensitive)
195 #define cmpCase_String(d, cstr) cmpSc_String(d, cstr, &iCaseInsensitive)
196 #define cmpString_String(d, s) cmpSc_String(d, cstr_String(s), &iCaseSensitive)
197 #define cmpStringCase_String(d, s) cmpSc_String(d, cstr_String(s), &iCaseInsensitive)
198 #define cmpStringSc_String(d, s, sc) cmpSc_String(d, cstr_String(s), sc)
199
equal_String(const iString * d,const iString * other)200 iLocalDef iBool equal_String(const iString *d, const iString *other) {
201 return cmpString_String(d, other) == 0;
202 }
equalCase_String(const iString * d,const iString * other)203 iLocalDef iBool equalCase_String(const iString *d, const iString *other) {
204 return cmpStringCase_String(d, other) == 0;
205 }
206
207 iBool startsWithSc_String (const iString *, const char *cstr, const iStringComparison *);
208 iBool endsWithSc_String (const iString *, const char *cstr, const iStringComparison *);
209
210 #define startsWith_String(d, cstr) startsWithSc_String(d, cstr, &iCaseSensitive)
211 #define startsWithCase_String(d, cstr) startsWithSc_String(d, cstr, &iCaseInsensitive)
212 #define endsWith_String(d, cstr) endsWithSc_String (d, cstr, &iCaseSensitive)
213 #define endsWithCase_String(d, cstr) endsWithSc_String (d, cstr, &iCaseInsensitive)
214
215 size_t indexOf_String (const iString *, iChar ch);
216 size_t indexOfCStr_String (const iString *, const char *cstr);
217 size_t indexOfCStrFrom_String (const iString *, const char *cstr, size_t from);
218 size_t indexOfCStrSc_String (const iString *, const char *cstr, const iStringComparison *);
219 size_t indexOfCStrFromSc_String (const iString *, const char *cstr, size_t from, const iStringComparison *);
220 size_t lastIndexOf_String (const iString *, iChar ch);
221 size_t lastIndexOfCStr_String (const iString *, const char *cstr);
222
223 #define indexOfString_String(d, s) indexOfCStr_String(d, cstr_String(s))
224 #define indexOfStringFrom_String(d, s, pos) indexOfCStrFrom_String(d, cstr_String(s), pos)
225 #define lastIndexOfString_String(d, s) lastIndexOfCStr_String(d, cstr_String(s))
226
contains_String(const iString * d,iChar ch)227 iLocalDef iBool contains_String(const iString *d, iChar ch) {
228 return indexOf_String(d, ch) != iInvalidPos;
229 }
230
231 void set_String (iString *, const iString *other);
232 void setCStr_String (iString *, const char *cstr);
233 void setCStrN_String (iString *, const char *cstr, size_t n);
234 void setBlock_String (iString *, const iBlock *block);
235 void format_String (iString *, const char *format, ...);
236
setRange_String(iString * d,iRangecc range)237 iLocalDef void setRange_String(iString *d, iRangecc range) {
238 setCStrN_String(d, range.start, size_Range(&range));
239 }
240
241 void append_String (iString *, const iString *other);
242 void appendCStr_String (iString *, const char *cstr);
243 void appendCStrN_String (iString *, const char *cstr, size_t size);
244 void appendChar_String (iString *, iChar ch);
245 void appendRange_String (iString *, const iRangecc range);
246 void appendFormat_String (iString *, const char *format, ...);
247 void prepend_String (iString *, const iString *other);
248 void prependChar_String (iString *, iChar ch);
249 void prependCStr_String (iString *, const char *cstr);
250
251 void clear_String (iString *);
252 void truncate_String (iString *, size_t charCount);
253 void removeEnd_String (iString *, size_t charCount);
254 void trimStart_String (iString *);
255 void trimEnd_String (iString *);
256 void trim_String (iString *);
257 iString * trimmed_String (const iString *);
258 void replace_String (iString *, const char *src, const char *dst);
259 void normalize_String (iString *); /* NFC */
260
261 int toInt_String (const iString *);
262 float toFloat_String (const iString *);
263 double toDouble_String (const iString *);
264
265 iString * quote_String (const iString *, iBool numericUnicode);
266 iString * unquote_String (const iString *);
267
/* C string helpers; defined outside this header.
   NOTE(review): the lifetime of the pointer returned by format_CStr() is not
   visible here -- confirm in the implementation. */
const char *    format_CStr     (const char *format, ...);
const char *    skipSpace_CStr  (const char *cstr);
270
rangeN_CStr(const char * cstr,size_t size)271 iLocalDef iRangecc rangeN_CStr (const char *cstr, size_t size) {
272 #if __STDC_VERSION__ >= 201100L
273 return (iRangecc){ cstr, cstr + size };
274 #else
275 const iRangecc range = { cstr, cstr + size };
276 return range;
277 #endif
278 }
range_CStr(const char * cstr)279 iLocalDef iRangecc range_CStr(const char *cstr) {
280 return rangeN_CStr(cstr, strlen(cstr));
281 }
282
283 const char * cstr_Rangecc (iRangecc); /* returns NULL-terminated collected copy */
284 const iString * string_Rangecc (iRangecc); /* returns a collected String */
285
286 iBool isUtf8_Rangecc (iRangecc); /* checks if the range is well-formed UTF-8 */
287 size_t length_Rangecc (iRangecc); /* returns number of characters in the range */
288
289 #define cmp_Rangecc(d, cstr) cmpCStrSc_Rangecc((d), (cstr), &iCaseSensitive)
290 int cmpCStrSc_Rangecc (iRangecc, const char *cstr, const iStringComparison *);
291 int cmpCStrNSc_Rangecc (iRangecc, const char *cstr, size_t n, const iStringComparison *);
292 iBool startsWithSc_Rangecc(iRangecc, const char *cstr, const iStringComparison *);
293 iBool endsWithSc_Rangecc (iRangecc, const char *cstr, const iStringComparison *);
294
isNull_Rangecc(const iRangecc d)295 iLocalDef iBool isNull_Rangecc(const iRangecc d) {
296 return d.start == NULL;
297 }
equal_Rangecc(const iRangecc d,const char * cstr)298 iLocalDef iBool equal_Rangecc(const iRangecc d, const char *cstr) {
299 return cmp_Rangecc(d, cstr) == 0;
300 }
equalCase_Rangecc(const iRangecc d,const char * cstr)301 iLocalDef iBool equalCase_Rangecc(const iRangecc d, const char *cstr) {
302 return cmpCStrSc_Rangecc(d, cstr, &iCaseInsensitive) == 0;
303 }
equalRange_Rangecc(const iRangecc d,const iRangecc other)304 iLocalDef iBool equalRange_Rangecc(const iRangecc d, const iRangecc other) {
305 return size_Range(&d) == size_Range(&other) &&
306 cmpCStrNSc_Rangecc(d, other.start, size_Range(&d), &iCaseSensitive) == 0;
307 }
equalRangeCase_Rangecc(const iRangecc d,const iRangecc other)308 iLocalDef iBool equalRangeCase_Rangecc(const iRangecc d, const iRangecc other) {
309 return size_Range(&d) == size_Range(&other) &&
310 cmpCStrNSc_Rangecc(d, other.start, size_Range(&d), &iCaseInsensitive) == 0;
311 }
startsWith_Rangecc(const iRangecc d,const char * cstr)312 iLocalDef iBool startsWith_Rangecc(const iRangecc d, const char *cstr) {
313 return startsWithSc_Rangecc(d, cstr, &iCaseSensitive);
314 }
startsWithCase_Rangecc(const iRangecc d,const char * cstr)315 iLocalDef iBool startsWithCase_Rangecc(const iRangecc d, const char *cstr) {
316 return startsWithSc_Rangecc(d, cstr, &iCaseInsensitive);
317 }
endsWith_Rangecc(const iRangecc d,const char * cstr)318 iLocalDef iBool endsWith_Rangecc(const iRangecc d, const char *cstr) {
319 return endsWithSc_Rangecc(d, cstr, &iCaseSensitive);
320 }
endsWithCase_Rangecc(const iRangecc d,const char * cstr)321 iLocalDef iBool endsWithCase_Rangecc(const iRangecc d, const char *cstr) {
322 return endsWithSc_Rangecc(d, cstr, &iCaseInsensitive);
323 }
324
325 iStringList * split_Rangecc (iRangecc, const char *separator);
326 void trimStart_Rangecc (iRangecc *);
327 void trimEnd_Rangecc (iRangecc *);
328 void trim_Rangecc (iRangecc *);
329
trimmed_Rangecc(iRangecc d)330 iLocalDef iRangecc trimmed_Rangecc(iRangecc d) {
331 trim_Rangecc(&d);
332 return d;
333 }
334
335 size_t lastIndexOfCStr_Rangecc (iRangecc, const char *cstr);
336
337 /**
338 * Finds the next range between separators. Empty ranges at the beginning and end of
339 * the string are ignored (i.e., when there is a separator at the beginning or the end
340 * of the string).
341 *
342 * A string containing nothing but the separator results in no split ranges.
343 *
344 * @param separator Separator string.
345 * @param range Next range. Must be initialized to zero. Subsequent ranges are
346 * searched based on the locations pointed to by this variable.
347 *
348 * @return @c iTrue, if a next range was found (@a range was updated).
349 */
350 iBool nextSplit_Rangecc (iRangecc, const char *separator, iRangecc *range);
351
352 const char * findAscii_Rangecc (iRangecc, char ch);
353
354 iString * punyEncode_Rangecc (iRangecc); /* RFC 3492 */
355 iString * punyDecode_Rangecc (iRangecc);
356
357 iStringList * split_CStr (const char *cstr, const char *separator);
358
/** @name Iterators */
///@{
/* Const iterator over the characters of a String. The iterator functions themselves
   are declared by iDeclareConstIterator (a macro defined outside this file). */
iDeclareConstIterator(String, const iString *)
struct ConstIteratorImpl_String {
    iChar value;        /* current character; the file comment says iteration yields UTF-32 code points */
    const char *pos;    /* presumably where `value` starts in the string's bytes -- TODO confirm */
    const char *next;   /* presumably where the following character starts -- TODO confirm */
    const iString *str; /* string being iterated */
};
///@}
369
370 /*-------------------------------------------------------------------------------------*/
371
/* Upper limit for the size of one multibyte character, as a size_t constant. */
#define iMultibyteCharMaxSize ((size_t) 7)

/* Storage for one character in encoded form. The array is one byte larger than
   iMultibyteCharMaxSize -- presumably to leave room for a terminating zero
   (TODO confirm in the implementation). */
struct Impl_MultibyteChar {
    char bytes[8]; // UTF-8 encoding
};
377
/* Conversion between iChar values and their multibyte form; defined outside this
   header. The decode functions write the character through @a ch_out and return an
   int (presumably the number of bytes consumed -- TODO confirm). @a end and @a start
   bound the bytes that may be examined. */
void init_MultibyteChar (iMultibyteChar *d, iChar ch);
int decodeBytes_MultibyteChar (const char *bytes, const char *end, iChar *ch_out);
int decodePrecedingBytes_MultibyteChar (const char *bytes, const char *start, iChar *ch_out);

const char * cstrLocal_Char (iChar ch); // locale-encoding
383
384 int iCmpStr (const char *a, const char *b);
385 int iCmpStrN (const char *a, const char *b, size_t n);
386 int iCmpStrRange(iRangecc, const char *cstr);
387 int iCmpStrCase (const char *a, const char *b);
388 int iCmpStrNCase(const char *a, const char *b, size_t len);
389
equal_CStr(const char * a,const char * b)390 iLocalDef iBool equal_CStr(const char *a, const char *b) {
391 return iCmpStr(a, b) == 0;
392 }
393
equalCase_CStr(const char * a,const char * b)394 iLocalDef iBool equalCase_CStr(const char *a, const char *b) {
395 return iCmpStrCase(a, b) == 0;
396 }
397
startsWith_CStr(const char * str,const char * pfx)398 iLocalDef iBool startsWith_CStr(const char *str, const char *pfx) {
399 return iCmpStrN(str, pfx, strlen(pfx)) == 0;
400 }
401
startsWithCase_CStr(const char * str,const char * pfx)402 iLocalDef iBool startsWithCase_CStr(const char *str, const char *pfx) {
403 return iCmpStrNCase(str, pfx, strlen(pfx)) == 0;
404 }
405
/* Low-level C string utilities; defined outside this header.
   NOTE(review): presumably iDupStr() returns a newly allocated copy and iStrStrN()
   is a length-limited substring search -- neither the ownership rules nor the
   meaning of the size_t argument is visible here; confirm in the implementation. */
char * iDupStr (const char *);
char * iStrStrN (const char *, const char *, size_t);
408
409 iEndPublic
410