1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 #include "number_stringbuilder.h"
9 #include "static_unicode_sets.h"
10 #include "unicode/utf16.h"
11 #include "number_utils.h"
12 
13 using namespace icu;
14 using namespace icu::number;
15 using namespace icu::number::impl;
16 
17 namespace {
18 
19 // A version of uprv_memcpy that checks for length 0.
20 // By default, uprv_memcpy requires a length of at least 1.
uprv_memcpy2(void * dest,const void * src,size_t len)21 inline void uprv_memcpy2(void* dest, const void* src, size_t len) {
22     if (len > 0) {
23         uprv_memcpy(dest, src, len);
24     }
25 }
26 
27 // A version of uprv_memmove that checks for length 0.
28 // By default, uprv_memmove requires a length of at least 1.
uprv_memmove2(void * dest,const void * src,size_t len)29 inline void uprv_memmove2(void* dest, const void* src, size_t len) {
30     if (len > 0) {
31         uprv_memmove(dest, src, len);
32     }
33 }
34 
35 } // namespace
36 
NumberStringBuilder()37 NumberStringBuilder::NumberStringBuilder() {
38 #if U_DEBUG
39     // Initializing the memory to non-zero helps catch some bugs that involve
40     // reading from an improperly terminated string.
41     for (int32_t i=0; i<getCapacity(); i++) {
42         getCharPtr()[i] = 1;
43     }
44 #endif
45 }
46 
~NumberStringBuilder()47 NumberStringBuilder::~NumberStringBuilder() {
48     if (fUsingHeap) {
49         uprv_free(fChars.heap.ptr);
50         uprv_free(fFields.heap.ptr);
51     }
52 }
53 
// Copy constructor: delegates all of the work to the copy-assignment operator.
NumberStringBuilder::NumberStringBuilder(const NumberStringBuilder &other) {
    operator=(other);
}
57 
// Copy assignment. Frees any heap buffers currently owned, allocates new heap buffers when
// the source's capacity exceeds DEFAULT_CAPACITY, and then copies the source's entire
// char/field buffers along with its fZero and fLength.
//
// No UErrorCode is available here, so an allocation failure is handled silently by
// resetting this object to a default-constructed builder.
NumberStringBuilder &NumberStringBuilder::operator=(const NumberStringBuilder &other) {
    if (this != &other) {
        // Drop whatever heap storage this object currently owns.
        if (fUsingHeap) {
            uprv_free(fChars.heap.ptr);
            uprv_free(fFields.heap.ptr);
            fUsingHeap = false;
        }

        const int32_t srcCapacity = other.getCapacity();
        if (srcCapacity > DEFAULT_CAPACITY) {
            // FIXME: uprv_malloc
            // C++ note: malloc appears in two places: here and in prepareForInsertHelper.
            auto heapChars = static_cast<char16_t *>(uprv_malloc(sizeof(char16_t) * srcCapacity));
            auto heapFields = static_cast<Field *>(uprv_malloc(sizeof(Field) * srcCapacity));
            if (heapChars == nullptr || heapFields == nullptr) {
                // UErrorCode is not available; fail silently.
                uprv_free(heapChars);
                uprv_free(heapFields);
                *this = NumberStringBuilder();  // can't fail
                return *this;
            }

            fUsingHeap = true;
            fChars.heap.capacity = srcCapacity;
            fChars.heap.ptr = heapChars;
            fFields.heap.capacity = srcCapacity;
            fFields.heap.ptr = heapFields;
        }

        // Copy the full buffers (not just the used region) so that fZero stays valid.
        uprv_memcpy2(getCharPtr(), other.getCharPtr(), sizeof(char16_t) * srcCapacity);
        uprv_memcpy2(getFieldPtr(), other.getFieldPtr(), sizeof(Field) * srcCapacity);

        fZero = other.fZero;
        fLength = other.fLength;
    }
    return *this;
}
99 
length() const100 int32_t NumberStringBuilder::length() const {
101     return fLength;
102 }
103 
codePointCount() const104 int32_t NumberStringBuilder::codePointCount() const {
105     return u_countChar32(getCharPtr() + fZero, fLength);
106 }
107 
getFirstCodePoint() const108 UChar32 NumberStringBuilder::getFirstCodePoint() const {
109     if (fLength == 0) {
110         return -1;
111     }
112     UChar32 cp;
113     U16_GET(getCharPtr() + fZero, 0, 0, fLength, cp);
114     return cp;
115 }
116 
getLastCodePoint() const117 UChar32 NumberStringBuilder::getLastCodePoint() const {
118     if (fLength == 0) {
119         return -1;
120     }
121     int32_t offset = fLength;
122     U16_BACK_1(getCharPtr() + fZero, 0, offset);
123     UChar32 cp;
124     U16_GET(getCharPtr() + fZero, 0, offset, fLength, cp);
125     return cp;
126 }
127 
codePointAt(int32_t index) const128 UChar32 NumberStringBuilder::codePointAt(int32_t index) const {
129     UChar32 cp;
130     U16_GET(getCharPtr() + fZero, 0, index, fLength, cp);
131     return cp;
132 }
133 
codePointBefore(int32_t index) const134 UChar32 NumberStringBuilder::codePointBefore(int32_t index) const {
135     int32_t offset = index;
136     U16_BACK_1(getCharPtr() + fZero, 0, offset);
137     UChar32 cp;
138     U16_GET(getCharPtr() + fZero, 0, offset, fLength, cp);
139     return cp;
140 }
141 
clear()142 NumberStringBuilder &NumberStringBuilder::clear() {
143     // TODO: Reset the heap here?
144     fZero = getCapacity() / 2;
145     fLength = 0;
146     return *this;
147 }
148 
appendCodePoint(UChar32 codePoint,Field field,UErrorCode & status)149 int32_t NumberStringBuilder::appendCodePoint(UChar32 codePoint, Field field, UErrorCode &status) {
150     return insertCodePoint(fLength, codePoint, field, status);
151 }
152 
153 int32_t
insertCodePoint(int32_t index,UChar32 codePoint,Field field,UErrorCode & status)154 NumberStringBuilder::insertCodePoint(int32_t index, UChar32 codePoint, Field field, UErrorCode &status) {
155     int32_t count = U16_LENGTH(codePoint);
156     int32_t position = prepareForInsert(index, count, status);
157     if (U_FAILURE(status)) {
158         return count;
159     }
160     if (count == 1) {
161         getCharPtr()[position] = (char16_t) codePoint;
162         getFieldPtr()[position] = field;
163     } else {
164         getCharPtr()[position] = U16_LEAD(codePoint);
165         getCharPtr()[position + 1] = U16_TRAIL(codePoint);
166         getFieldPtr()[position] = getFieldPtr()[position + 1] = field;
167     }
168     return count;
169 }
170 
append(const UnicodeString & unistr,Field field,UErrorCode & status)171 int32_t NumberStringBuilder::append(const UnicodeString &unistr, Field field, UErrorCode &status) {
172     return insert(fLength, unistr, field, status);
173 }
174 
insert(int32_t index,const UnicodeString & unistr,Field field,UErrorCode & status)175 int32_t NumberStringBuilder::insert(int32_t index, const UnicodeString &unistr, Field field,
176                                     UErrorCode &status) {
177     if (unistr.length() == 0) {
178         // Nothing to insert.
179         return 0;
180     } else if (unistr.length() == 1) {
181         // Fast path: insert using insertCodePoint.
182         return insertCodePoint(index, unistr.charAt(0), field, status);
183     } else {
184         return insert(index, unistr, 0, unistr.length(), field, status);
185     }
186 }
187 
188 int32_t
insert(int32_t index,const UnicodeString & unistr,int32_t start,int32_t end,Field field,UErrorCode & status)189 NumberStringBuilder::insert(int32_t index, const UnicodeString &unistr, int32_t start, int32_t end,
190                             Field field, UErrorCode &status) {
191     int32_t count = end - start;
192     int32_t position = prepareForInsert(index, count, status);
193     if (U_FAILURE(status)) {
194         return count;
195     }
196     for (int32_t i = 0; i < count; i++) {
197         getCharPtr()[position + i] = unistr.charAt(start + i);
198         getFieldPtr()[position + i] = field;
199     }
200     return count;
201 }
202 
203 int32_t
splice(int32_t startThis,int32_t endThis,const UnicodeString & unistr,int32_t startOther,int32_t endOther,Field field,UErrorCode & status)204 NumberStringBuilder::splice(int32_t startThis, int32_t endThis,  const UnicodeString &unistr,
205                             int32_t startOther, int32_t endOther, Field field, UErrorCode& status) {
206     int32_t thisLength = endThis - startThis;
207     int32_t otherLength = endOther - startOther;
208     int32_t count = otherLength - thisLength;
209     int32_t position;
210     if (count > 0) {
211         // Overall, chars need to be added.
212         position = prepareForInsert(startThis, count, status);
213     } else {
214         // Overall, chars need to be removed or kept the same.
215         position = remove(startThis, -count);
216     }
217     if (U_FAILURE(status)) {
218         return count;
219     }
220     for (int32_t i = 0; i < otherLength; i++) {
221         getCharPtr()[position + i] = unistr.charAt(startOther + i);
222         getFieldPtr()[position + i] = field;
223     }
224     return count;
225 }
226 
append(const NumberStringBuilder & other,UErrorCode & status)227 int32_t NumberStringBuilder::append(const NumberStringBuilder &other, UErrorCode &status) {
228     return insert(fLength, other, status);
229 }
230 
231 int32_t
insert(int32_t index,const NumberStringBuilder & other,UErrorCode & status)232 NumberStringBuilder::insert(int32_t index, const NumberStringBuilder &other, UErrorCode &status) {
233     if (this == &other) {
234         status = U_ILLEGAL_ARGUMENT_ERROR;
235         return 0;
236     }
237     int32_t count = other.fLength;
238     if (count == 0) {
239         // Nothing to insert.
240         return 0;
241     }
242     int32_t position = prepareForInsert(index, count, status);
243     if (U_FAILURE(status)) {
244         return count;
245     }
246     for (int32_t i = 0; i < count; i++) {
247         getCharPtr()[position + i] = other.charAt(i);
248         getFieldPtr()[position + i] = other.fieldAt(i);
249     }
250     return count;
251 }
252 
writeTerminator(UErrorCode & status)253 void NumberStringBuilder::writeTerminator(UErrorCode& status) {
254     int32_t position = prepareForInsert(fLength, 1, status);
255     if (U_FAILURE(status)) {
256         return;
257     }
258     getCharPtr()[position] = 0;
259     getFieldPtr()[position] = UNUM_FIELD_COUNT;
260     fLength--;
261 }
262 
prepareForInsert(int32_t index,int32_t count,UErrorCode & status)263 int32_t NumberStringBuilder::prepareForInsert(int32_t index, int32_t count, UErrorCode &status) {
264     U_ASSERT(index >= 0);
265     U_ASSERT(index <= fLength);
266     U_ASSERT(count >= 0);
267     if (index == 0 && fZero - count >= 0) {
268         // Append to start
269         fZero -= count;
270         fLength += count;
271         return fZero;
272     } else if (index == fLength && fZero + fLength + count < getCapacity()) {
273         // Append to end
274         fLength += count;
275         return fZero + fLength - count;
276     } else {
277         // Move chars around and/or allocate more space
278         return prepareForInsertHelper(index, count, status);
279     }
280 }
281 
prepareForInsertHelper(int32_t index,int32_t count,UErrorCode & status)282 int32_t NumberStringBuilder::prepareForInsertHelper(int32_t index, int32_t count, UErrorCode &status) {
283     int32_t oldCapacity = getCapacity();
284     int32_t oldZero = fZero;
285     char16_t *oldChars = getCharPtr();
286     Field *oldFields = getFieldPtr();
287     if (fLength + count > oldCapacity) {
288         int32_t newCapacity = (fLength + count) * 2;
289         int32_t newZero = newCapacity / 2 - (fLength + count) / 2;
290 
291         // C++ note: malloc appears in two places: here and in the assignment operator.
292         auto newChars = static_cast<char16_t *> (uprv_malloc(sizeof(char16_t) * newCapacity));
293         auto newFields = static_cast<Field *>(uprv_malloc(sizeof(Field) * newCapacity));
294         if (newChars == nullptr || newFields == nullptr) {
295             uprv_free(newChars);
296             uprv_free(newFields);
297             status = U_MEMORY_ALLOCATION_ERROR;
298             return -1;
299         }
300 
301         // First copy the prefix and then the suffix, leaving room for the new chars that the
302         // caller wants to insert.
303         // C++ note: memcpy is OK because the src and dest do not overlap.
304         uprv_memcpy2(newChars + newZero, oldChars + oldZero, sizeof(char16_t) * index);
305         uprv_memcpy2(newChars + newZero + index + count,
306                 oldChars + oldZero + index,
307                 sizeof(char16_t) * (fLength - index));
308         uprv_memcpy2(newFields + newZero, oldFields + oldZero, sizeof(Field) * index);
309         uprv_memcpy2(newFields + newZero + index + count,
310                 oldFields + oldZero + index,
311                 sizeof(Field) * (fLength - index));
312 
313         if (fUsingHeap) {
314             uprv_free(oldChars);
315             uprv_free(oldFields);
316         }
317         fUsingHeap = true;
318         fChars.heap.ptr = newChars;
319         fChars.heap.capacity = newCapacity;
320         fFields.heap.ptr = newFields;
321         fFields.heap.capacity = newCapacity;
322         fZero = newZero;
323         fLength += count;
324     } else {
325         int32_t newZero = oldCapacity / 2 - (fLength + count) / 2;
326 
327         // C++ note: memmove is required because src and dest may overlap.
328         // First copy the entire string to the location of the prefix, and then move the suffix
329         // to make room for the new chars that the caller wants to insert.
330         uprv_memmove2(oldChars + newZero, oldChars + oldZero, sizeof(char16_t) * fLength);
331         uprv_memmove2(oldChars + newZero + index + count,
332                 oldChars + newZero + index,
333                 sizeof(char16_t) * (fLength - index));
334         uprv_memmove2(oldFields + newZero, oldFields + oldZero, sizeof(Field) * fLength);
335         uprv_memmove2(oldFields + newZero + index + count,
336                 oldFields + newZero + index,
337                 sizeof(Field) * (fLength - index));
338 
339         fZero = newZero;
340         fLength += count;
341     }
342     return fZero + index;
343 }
344 
remove(int32_t index,int32_t count)345 int32_t NumberStringBuilder::remove(int32_t index, int32_t count) {
346     // TODO: Reset the heap here?  (If the string after removal can fit on stack?)
347     int32_t position = index + fZero;
348     uprv_memmove2(getCharPtr() + position,
349             getCharPtr() + position + count,
350             sizeof(char16_t) * (fLength - index - count));
351     uprv_memmove2(getFieldPtr() + position,
352             getFieldPtr() + position + count,
353             sizeof(Field) * (fLength - index - count));
354     fLength -= count;
355     return position;
356 }
357 
toUnicodeString() const358 UnicodeString NumberStringBuilder::toUnicodeString() const {
359     return UnicodeString(getCharPtr() + fZero, fLength);
360 }
361 
toTempUnicodeString() const362 const UnicodeString NumberStringBuilder::toTempUnicodeString() const {
363     // Readonly-alias constructor:
364     return UnicodeString(FALSE, getCharPtr() + fZero, fLength);
365 }
366 
toDebugString() const367 UnicodeString NumberStringBuilder::toDebugString() const {
368     UnicodeString sb;
369     sb.append(u"<NumberStringBuilder [", -1);
370     sb.append(toUnicodeString());
371     sb.append(u"] [", -1);
372     for (int i = 0; i < fLength; i++) {
373         if (fieldAt(i) == UNUM_FIELD_COUNT) {
374             sb.append(u'n');
375         } else {
376             char16_t c;
377             switch (fieldAt(i)) {
378                 case UNUM_SIGN_FIELD:
379                     c = u'-';
380                     break;
381                 case UNUM_INTEGER_FIELD:
382                     c = u'i';
383                     break;
384                 case UNUM_FRACTION_FIELD:
385                     c = u'f';
386                     break;
387                 case UNUM_EXPONENT_FIELD:
388                     c = u'e';
389                     break;
390                 case UNUM_EXPONENT_SIGN_FIELD:
391                     c = u'+';
392                     break;
393                 case UNUM_EXPONENT_SYMBOL_FIELD:
394                     c = u'E';
395                     break;
396                 case UNUM_DECIMAL_SEPARATOR_FIELD:
397                     c = u'.';
398                     break;
399                 case UNUM_GROUPING_SEPARATOR_FIELD:
400                     c = u',';
401                     break;
402                 case UNUM_PERCENT_FIELD:
403                     c = u'%';
404                     break;
405                 case UNUM_PERMILL_FIELD:
406                     c = u'‰';
407                     break;
408                 case UNUM_CURRENCY_FIELD:
409                     c = u'$';
410                     break;
411                 default:
412                     c = u'?';
413                     break;
414             }
415             sb.append(c);
416         }
417     }
418     sb.append(u"]>", -1);
419     return sb;
420 }
421 
chars() const422 const char16_t *NumberStringBuilder::chars() const {
423     return getCharPtr() + fZero;
424 }
425 
contentEquals(const NumberStringBuilder & other) const426 bool NumberStringBuilder::contentEquals(const NumberStringBuilder &other) const {
427     if (fLength != other.fLength) {
428         return false;
429     }
430     for (int32_t i = 0; i < fLength; i++) {
431         if (charAt(i) != other.charAt(i) || fieldAt(i) != other.fieldAt(i)) {
432             return false;
433         }
434     }
435     return true;
436 }
437 
nextFieldPosition(FieldPosition & fp,UErrorCode & status) const438 bool NumberStringBuilder::nextFieldPosition(FieldPosition& fp, UErrorCode& status) const {
439     int32_t rawField = fp.getField();
440 
441     if (rawField == FieldPosition::DONT_CARE) {
442         return FALSE;
443     }
444 
445     if (rawField < 0 || rawField >= UNUM_FIELD_COUNT) {
446         status = U_ILLEGAL_ARGUMENT_ERROR;
447         return FALSE;
448     }
449 
450     ConstrainedFieldPosition cfpos;
451     cfpos.constrainField(UFIELD_CATEGORY_NUMBER, rawField);
452     cfpos.setState(UFIELD_CATEGORY_NUMBER, rawField, fp.getBeginIndex(), fp.getEndIndex());
453     if (nextPosition(cfpos, 0, status)) {
454         fp.setBeginIndex(cfpos.getStart());
455         fp.setEndIndex(cfpos.getLimit());
456         return true;
457     }
458 
459     // Special case: fraction should start after integer if fraction is not present
460     if (rawField == UNUM_FRACTION_FIELD && fp.getEndIndex() == 0) {
461         bool inside = false;
462         int32_t i = fZero;
463         for (; i < fZero + fLength; i++) {
464             if (isIntOrGroup(getFieldPtr()[i]) || getFieldPtr()[i] == UNUM_DECIMAL_SEPARATOR_FIELD) {
465                 inside = true;
466             } else if (inside) {
467                 break;
468             }
469         }
470         fp.setBeginIndex(i - fZero);
471         fp.setEndIndex(i - fZero);
472     }
473 
474     return false;
475 }
476 
getAllFieldPositions(FieldPositionIteratorHandler & fpih,UErrorCode & status) const477 void NumberStringBuilder::getAllFieldPositions(FieldPositionIteratorHandler& fpih,
478                                                UErrorCode& status) const {
479     ConstrainedFieldPosition cfpos;
480     while (nextPosition(cfpos, 0, status)) {
481         fpih.addAttribute(cfpos.getField(), cfpos.getStart(), cfpos.getLimit());
482     }
483 }
484 
// Signal the end of the string using a field that doesn't exist and that is
// different from UNUM_FIELD_COUNT, which is used for "null number field".
// nextPosition() substitutes this value for the field at the one-past-the-end index so
// that a field running up to the end of the string is still seen to terminate.
static constexpr Field kEndField = 0xff;
488 
// Advances `cfpos` to the next field span in the string.
//
// Scanning starts at the logical index cfpos.getLimit() and runs up to and including the
// one-past-the-end index, where the sentinel kEndField is used as the field value. When a
// span is found, cfpos is updated through setState() and true is returned; when the scan
// reaches the end without finding one, false is returned and cfpos is not modified.
//
// Only fields accepted by cfpos.matchesField() are reported. Runs of integer and grouping
// separator fields are coalesced and reported as a single UNUM_INTEGER_FIELD span. If
// `numericField` is non-zero, runs of fields for which isNumericField() is true are
// additionally coalesced and reported under NumFieldUtils::expand(numericField).
//
// The UErrorCode parameter is unused.
bool NumberStringBuilder::nextPosition(ConstrainedFieldPosition& cfpos, Field numericField, UErrorCode& /*status*/) const {
    auto numericCAF = NumFieldUtils::expand(numericField);
    // fieldStart/currField describe the field currently being scanned;
    // currField == UNUM_FIELD_COUNT means "not inside any field".
    int32_t fieldStart = -1;
    Field currField = UNUM_FIELD_COUNT;
    // Note: i is an absolute buffer index; subtract fZero to get a logical string index.
    for (int32_t i = fZero + cfpos.getLimit(); i <= fZero + fLength; i++) {
        Field _field = (i < fZero + fLength) ? getFieldPtr()[i] : kEndField;
        // Case 1: currently scanning a field.
        if (currField != UNUM_FIELD_COUNT) {
            if (currField != _field) {
                // The field changed at i, so the current field ends here.
                int32_t end = i - fZero;
                // Grouping separators can be whitespace; don't throw them out!
                if (currField != UNUM_GROUPING_SEPARATOR_FIELD) {
                    end = trimBack(i - fZero);
                }
                if (end <= fieldStart) {
                    // Entire field position is ignorable; skip.
                    fieldStart = -1;
                    currField = UNUM_FIELD_COUNT;
                    i--;  // look at this index again
                    continue;
                }
                int32_t start = fieldStart;
                if (currField != UNUM_GROUPING_SEPARATOR_FIELD) {
                    start = trimFront(start);
                }
                auto caf = NumFieldUtils::expand(currField);
                cfpos.setState(caf.category, caf.field, start, end);
                return true;
            }
            // Still inside the same field; keep scanning.
            continue;
        }
        // Special case: coalesce the INTEGER if we are pointing at the end of the INTEGER.
        if (cfpos.matchesField(UFIELD_CATEGORY_NUMBER, UNUM_INTEGER_FIELD)
                && i > fZero
                // don't return the same field twice in a row:
                && i - fZero > cfpos.getLimit()
                && isIntOrGroup(getFieldPtr()[i - 1])
                && !isIntOrGroup(_field)) {
            // Walk backwards to find where the run of integer/grouping fields begins.
            int j = i - 1;
            for (; j >= fZero && isIntOrGroup(getFieldPtr()[j]); j--) {}
            cfpos.setState(UFIELD_CATEGORY_NUMBER, UNUM_INTEGER_FIELD, j - fZero + 1, i - fZero);
            return true;
        }
        // Special case: coalesce NUMERIC if we are pointing at the end of the NUMERIC.
        if (numericField != 0
                && cfpos.matchesField(numericCAF.category, numericCAF.field)
                && i > fZero
                // don't return the same field twice in a row:
                && (i - fZero > cfpos.getLimit()
                    || cfpos.getCategory() != numericCAF.category
                    || cfpos.getField() != numericCAF.field)
                && isNumericField(getFieldPtr()[i - 1])
                && !isNumericField(_field)) {
            // Walk backwards to find where the run of numeric fields begins.
            int j = i - 1;
            for (; j >= fZero && isNumericField(getFieldPtr()[j]); j--) {}
            cfpos.setState(numericCAF.category, numericCAF.field, j - fZero + 1, i - fZero);
            return true;
        }
        // Special case: skip over INTEGER; will be coalesced later.
        if (_field == UNUM_INTEGER_FIELD) {
            _field = UNUM_FIELD_COUNT;
        }
        // Case 2: no field starting at this position.
        if (_field == UNUM_FIELD_COUNT || _field == kEndField) {
            continue;
        }
        // Case 3: check for field starting at this position
        auto caf = NumFieldUtils::expand(_field);
        if (cfpos.matchesField(caf.category, caf.field)) {
            fieldStart = i - fZero;
            currField = _field;
        }
    }

    // The kEndField sentinel differs from every started field, so any field in progress
    // must have been closed inside the loop.
    U_ASSERT(currField == UNUM_FIELD_COUNT);
    return false;
}
566 
containsField(Field field) const567 bool NumberStringBuilder::containsField(Field field) const {
568     for (int32_t i = 0; i < fLength; i++) {
569         if (field == fieldAt(i)) {
570             return true;
571         }
572     }
573     return false;
574 }
575 
isIntOrGroup(Field field)576 bool NumberStringBuilder::isIntOrGroup(Field field) {
577     return field == UNUM_INTEGER_FIELD
578         || field == UNUM_GROUPING_SEPARATOR_FIELD;
579 }
580 
isNumericField(Field field)581 bool NumberStringBuilder::isNumericField(Field field) {
582     return NumFieldUtils::isNumericField(field);
583 }
584 
trimBack(int32_t limit) const585 int32_t NumberStringBuilder::trimBack(int32_t limit) const {
586     return unisets::get(unisets::DEFAULT_IGNORABLES)->spanBack(
587         getCharPtr() + fZero,
588         limit,
589         USET_SPAN_CONTAINED);
590 }
591 
trimFront(int32_t start) const592 int32_t NumberStringBuilder::trimFront(int32_t start) const {
593     return start + unisets::get(unisets::DEFAULT_IGNORABLES)->span(
594         getCharPtr() + fZero + start,
595         fLength - start,
596         USET_SPAN_CONTAINED);
597 }
598 
599 #endif /* #if !UCONFIG_NO_FORMATTING */
600