1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // edits.h
5 // created: 2016dec30 Markus W. Scherer
6 
7 #ifndef __EDITS_H__
8 #define __EDITS_H__
9 
10 #include "unicode/utypes.h"
11 
12 #if U_SHOW_CPLUSPLUS_API
13 
14 #include "unicode/uobject.h"
15 
16 /**
17  * \file
18  * \brief C++ API: C++ class Edits for low-level string transformations on styled text.
19  */
20 
21 U_NAMESPACE_BEGIN
22 
23 class UnicodeString;
24 
25 /**
26  * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions
27  * in linear progression. Does not support moving/reordering of text.
28  *
29  * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to
30  * instances of this class using {@link #addReplace(int32_t, int32_t)} (for change edits) and
31  * {@link #addUnchanged(int32_t)} (for no-change edits). Change edits are retained with full granularity,
32  * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one
33  * mapping between code points in the source and destination strings.
34  *
35  * After all edits have been added, instances of this class should be considered immutable, and an
36  * {@link Edits::Iterator} can be used for queries.
37  *
38  * There are four flavors of Edits::Iterator:
39  *
40  * <ul>
41  * <li>{@link #getFineIterator()} retains full granularity of change edits.
42  * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling
43  * next() on the iterator, skips over no-change edits (unchanged regions).
44  * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change
45  * edits are automatically merged during the construction phase.)
46  * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when
47  * calling next() on the iterator, skips over no-change edits (unchanged regions).
48  * </ul>
49  *
50  * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the
51  * following fine edits:
52  * <ul>
53  * <li>abc ⇨ abc (no-change)
54  * <li>ß ⇨ ss (change)
55  * <li>D ⇨ d (change)
56  * <li>e ⇨ e (no-change)
57  * <li>F ⇨ f (change)
58  * </ul>
59  * and the following coarse edits (note how adjacent change edits get merged together):
60  * <ul>
61  * <li>abc ⇨ abc (no-change)
62  * <li>ßD ⇨ ssd (change)
63  * <li>e ⇨ e (no-change)
64  * <li>F ⇨ f (change)
65  * </ul>
66  *
67  * The "fine changes" and "coarse changes" iterators will step through only the change edits when their
68  * `Edits::Iterator::next()` methods are called. They are identical to the non-change iterators when
69  * their `Edits::Iterator::findSourceIndex()` or `Edits::Iterator::findDestinationIndex()`
70  * methods are used to walk through the string.
71  *
72  * For examples of how to use this class, see the test `TestCaseMapEditsIteratorDocs` in
73  * UCharacterCaseTest.java.
74  *
75  * An Edits object tracks a separate UErrorCode, but ICU string transformation functions
76  * (e.g., case mapping functions) merge any such errors into their API's UErrorCode.
77  *
78  * @stable ICU 59
79  */
80 class U_COMMON_API Edits U_FINAL : public UMemory {
81 public:
82     /**
83      * Constructs an empty object.
84      * @stable ICU 59
85      */
Edits()86     Edits() :
87             array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),
88             errorCode_(U_ZERO_ERROR) {}
89     /**
90      * Copy constructor.
91      * @param other source edits
92      * @stable ICU 60
93      */
Edits(const Edits & other)94     Edits(const Edits &other) :
95             array(stackArray), capacity(STACK_CAPACITY), length(other.length),
96             delta(other.delta), numChanges(other.numChanges),
97             errorCode_(other.errorCode_) {
98         copyArray(other);
99     }
100     /**
101      * Move constructor, might leave src empty.
102      * This object will have the same contents that the source object had.
103      * @param src source edits
104      * @stable ICU 60
105      */
Edits(Edits && src)106     Edits(Edits &&src) U_NOEXCEPT :
107             array(stackArray), capacity(STACK_CAPACITY), length(src.length),
108             delta(src.delta), numChanges(src.numChanges),
109             errorCode_(src.errorCode_) {
110         moveArray(src);
111     }
112 
113     /**
114      * Destructor.
115      * @stable ICU 59
116      */
117     ~Edits();
118 
119     /**
120      * Assignment operator.
121      * @param other source edits
122      * @return *this
123      * @stable ICU 60
124      */
125     Edits &operator=(const Edits &other);
126 
127     /**
128      * Move assignment operator, might leave src empty.
129      * This object will have the same contents that the source object had.
130      * The behavior is undefined if *this and src are the same object.
131      * @param src source edits
132      * @return *this
133      * @stable ICU 60
134      */
135     Edits &operator=(Edits &&src) U_NOEXCEPT;
136 
137     /**
138      * Resets the data but may not release memory.
139      * @stable ICU 59
140      */
141     void reset() U_NOEXCEPT;
142 
143     /**
144      * Adds a no-change edit: a record for an unchanged segment of text.
145      * Normally called from inside ICU string transformation functions, not user code.
146      * @stable ICU 59
147      */
148     void addUnchanged(int32_t unchangedLength);
149     /**
150      * Adds a change edit: a record for a text replacement/insertion/deletion.
151      * Normally called from inside ICU string transformation functions, not user code.
152      * @stable ICU 59
153      */
154     void addReplace(int32_t oldLength, int32_t newLength);
155     /**
156      * Sets the UErrorCode if an error occurred while recording edits.
157      * Preserves older error codes in the outErrorCode.
158      * Normally called from inside ICU string transformation functions, not user code.
159      * @param outErrorCode Set to an error code if it does not contain one already
160      *                  and an error occurred while recording edits.
161      *                  Otherwise unchanged.
162      * @return true if U_FAILURE(outErrorCode)
163      * @stable ICU 59
164      */
165     UBool copyErrorTo(UErrorCode &outErrorCode) const;
166 
167     /**
168      * How much longer is the new text compared with the old text?
169      * @return new length minus old length
170      * @stable ICU 59
171      */
lengthDelta()172     int32_t lengthDelta() const { return delta; }
173     /**
174      * @return true if there are any change edits
175      * @stable ICU 59
176      */
hasChanges()177     UBool hasChanges() const { return numChanges != 0; }
178 
179     /**
180      * @return the number of change edits
181      * @stable ICU 60
182      */
numberOfChanges()183     int32_t numberOfChanges() const { return numChanges; }
184 
185     /**
186      * Access to the list of edits.
187      *
188      * At any moment in time, an instance of this class points to a single edit: a "window" into a span
189      * of the source string and the corresponding span of the destination string. The source string span
190      * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string
191      * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars.
192      *
193      * The iterator can be moved between edits using the `next()`, `findSourceIndex(int32_t, UErrorCode &)`,
194      * and `findDestinationIndex(int32_t, UErrorCode &)` methods.
195      * Calling any of these methods mutates the iterator to make it point to the corresponding edit.
196      *
197      * For more information, see the documentation for {@link Edits}.
198      *
199      * @see getCoarseIterator
200      * @see getFineIterator
201      * @stable ICU 59
202      */
203     struct U_COMMON_API Iterator U_FINAL : public UMemory {
204         /**
205          * Default constructor, empty iterator.
206          * @stable ICU 60
207          */
IteratorU_FINAL208         Iterator() :
209                 array(nullptr), index(0), length(0),
210                 remaining(0), onlyChanges_(false), coarse(false),
211                 dir(0), changed(false), oldLength_(0), newLength_(0),
212                 srcIndex(0), replIndex(0), destIndex(0) {}
213         /**
214          * Copy constructor.
215          * @stable ICU 59
216          */
217         Iterator(const Iterator &other) = default;
218         /**
219          * Assignment operator.
220          * @stable ICU 59
221          */
222         Iterator &operator=(const Iterator &other) = default;
223 
224         /**
225          * Advances the iterator to the next edit.
226          * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
227          *                  or else the function returns immediately. Check for U_FAILURE()
228          *                  on output or use with function chaining. (See User Guide for details.)
229          * @return true if there is another edit
230          * @stable ICU 59
231          */
nextU_FINAL232         UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); }
233 
234         /**
235          * Moves the iterator to the edit that contains the source index.
236          * The source index may be found in a no-change edit
237          * even if normal iteration would skip no-change edits.
238          * Normal iteration can continue from a found edit.
239          *
240          * The iterator state before this search logically does not matter.
241          * (It may affect the performance of the search.)
242          *
243          * The iterator state after this search is undefined
244          * if the source index is out of bounds for the source string.
245          *
246          * @param i source index
247          * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
248          *                  or else the function returns immediately. Check for U_FAILURE()
249          *                  on output or use with function chaining. (See User Guide for details.)
250          * @return true if the edit for the source index was found
251          * @stable ICU 59
252          */
findSourceIndexU_FINAL253         UBool findSourceIndex(int32_t i, UErrorCode &errorCode) {
254             return findIndex(i, true, errorCode) == 0;
255         }
256 
257         /**
258          * Moves the iterator to the edit that contains the destination index.
259          * The destination index may be found in a no-change edit
260          * even if normal iteration would skip no-change edits.
261          * Normal iteration can continue from a found edit.
262          *
263          * The iterator state before this search logically does not matter.
264          * (It may affect the performance of the search.)
265          *
266          * The iterator state after this search is undefined
267          * if the source index is out of bounds for the source string.
268          *
269          * @param i destination index
270          * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
271          *                  or else the function returns immediately. Check for U_FAILURE()
272          *                  on output or use with function chaining. (See User Guide for details.)
273          * @return true if the edit for the destination index was found
274          * @stable ICU 60
275          */
findDestinationIndexU_FINAL276         UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) {
277             return findIndex(i, false, errorCode) == 0;
278         }
279 
280         /**
281          * Computes the destination index corresponding to the given source index.
282          * If the source index is inside a change edit (not at its start),
283          * then the destination index at the end of that edit is returned,
284          * since there is no information about index mapping inside a change edit.
285          *
286          * (This means that indexes to the start and middle of an edit,
287          * for example around a grapheme cluster, are mapped to indexes
288          * encompassing the entire edit.
289          * The alternative, mapping an interior index to the start,
290          * would map such an interval to an empty one.)
291          *
292          * This operation will usually but not always modify this object.
293          * The iterator state after this search is undefined.
294          *
295          * @param i source index
296          * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
297          *                  or else the function returns immediately. Check for U_FAILURE()
298          *                  on output or use with function chaining. (See User Guide for details.)
299          * @return destination index; undefined if i is not 0..string length
300          * @stable ICU 60
301          */
302         int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode);
303 
304         /**
305          * Computes the source index corresponding to the given destination index.
306          * If the destination index is inside a change edit (not at its start),
307          * then the source index at the end of that edit is returned,
308          * since there is no information about index mapping inside a change edit.
309          *
310          * (This means that indexes to the start and middle of an edit,
311          * for example around a grapheme cluster, are mapped to indexes
312          * encompassing the entire edit.
313          * The alternative, mapping an interior index to the start,
314          * would map such an interval to an empty one.)
315          *
316          * This operation will usually but not always modify this object.
317          * The iterator state after this search is undefined.
318          *
319          * @param i destination index
320          * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
321          *                  or else the function returns immediately. Check for U_FAILURE()
322          *                  on output or use with function chaining. (See User Guide for details.)
323          * @return source index; undefined if i is not 0..string length
324          * @stable ICU 60
325          */
326         int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode);
327 
328         /**
329          * Returns whether the edit currently represented by the iterator is a change edit.
330          *
331          * @return true if this edit replaces oldLength() units with newLength() different ones.
332          *         false if oldLength units remain unchanged.
333          * @stable ICU 59
334          */
hasChangeU_FINAL335         UBool hasChange() const { return changed; }
336 
337         /**
338          * The length of the current span in the source string, which starts at {@link #sourceIndex}.
339          *
340          * @return the number of units in the original string which are replaced or remain unchanged.
341          * @stable ICU 59
342          */
oldLengthU_FINAL343         int32_t oldLength() const { return oldLength_; }
344 
345         /**
346          * The length of the current span in the destination string, which starts at
347          * {@link #destinationIndex}, or in the replacement string, which starts at
348          * {@link #replacementIndex}.
349          *
350          * @return the number of units in the modified string, if hasChange() is true.
351          *         Same as oldLength if hasChange() is false.
352          * @stable ICU 59
353          */
newLengthU_FINAL354         int32_t newLength() const { return newLength_; }
355 
356         /**
357          * The start index of the current span in the source string; the span has length
358          * {@link #oldLength}.
359          *
360          * @return the current index into the source string
361          * @stable ICU 59
362          */
sourceIndexU_FINAL363         int32_t sourceIndex() const { return srcIndex; }
364 
365         /**
366          * The start index of the current span in the replacement string; the span has length
367          * {@link #newLength}. Well-defined only if the current edit is a change edit.
368          *
369          * The *replacement string* is the concatenation of all substrings of the destination
370          * string corresponding to change edits.
371          *
372          * This method is intended to be used together with operations that write only replacement
373          * characters (e.g. operations specifying the \ref U_OMIT_UNCHANGED_TEXT option).
374          * The source string can then be modified in-place.
375          *
376          * @return the current index into the replacement-characters-only string,
377          *         not counting unchanged spans
378          * @stable ICU 59
379          */
replacementIndexU_FINAL380         int32_t replacementIndex() const {
381             // TODO: Throw an exception if we aren't in a change edit?
382             return replIndex;
383         }
384 
385         /**
386          * The start index of the current span in the destination string; the span has length
387          * {@link #newLength}.
388          *
389          * @return the current index into the full destination string
390          * @stable ICU 59
391          */
destinationIndexU_FINAL392         int32_t destinationIndex() const { return destIndex; }
393 
394 #ifndef U_HIDE_INTERNAL_API
395         /**
396          * A string representation of the current edit represented by the iterator for debugging. You
397          * should not depend on the contents of the return string.
398          * @internal
399          */
400         UnicodeString& toString(UnicodeString& appendTo) const;
401 #endif  // U_HIDE_INTERNAL_API
402 
403     private:
404         friend class Edits;
405 
406         Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
407 
408         int32_t readLength(int32_t head);
409         void updateNextIndexes();
410         void updatePreviousIndexes();
411         UBool noNext();
412         UBool next(UBool onlyChanges, UErrorCode &errorCode);
413         UBool previous(UErrorCode &errorCode);
414         /** @return -1: error or i<0; 0: found; 1: i>=string length */
415         int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);
416 
417         const uint16_t *array;
418         int32_t index, length;
419         // 0 if we are not within compressed equal-length changes.
420         // Otherwise the number of remaining changes, including the current one.
421         int32_t remaining;
422         UBool onlyChanges_, coarse;
423 
424         int8_t dir;  // iteration direction: back(<0), initial(0), forward(>0)
425         UBool changed;
426         int32_t oldLength_, newLength_;
427         int32_t srcIndex, replIndex, destIndex;
428     };
429 
430     /**
431      * Returns an Iterator for coarse-grained change edits
432      * (adjacent change edits are treated as one).
433      * Can be used to perform simple string updates.
434      * Skips no-change edits.
435      * @return an Iterator that merges adjacent changes.
436      * @stable ICU 59
437      */
getCoarseChangesIterator()438     Iterator getCoarseChangesIterator() const {
439         return Iterator(array, length, true, true);
440     }
441 
442     /**
443      * Returns an Iterator for coarse-grained change and no-change edits
444      * (adjacent change edits are treated as one).
445      * Can be used to perform simple string updates.
446      * Adjacent change edits are treated as one edit.
447      * @return an Iterator that merges adjacent changes.
448      * @stable ICU 59
449      */
getCoarseIterator()450     Iterator getCoarseIterator() const {
451         return Iterator(array, length, false, true);
452     }
453 
454     /**
455      * Returns an Iterator for fine-grained change edits
456      * (full granularity of change edits is retained).
457      * Can be used for modifying styled text.
458      * Skips no-change edits.
459      * @return an Iterator that separates adjacent changes.
460      * @stable ICU 59
461      */
getFineChangesIterator()462     Iterator getFineChangesIterator() const {
463         return Iterator(array, length, true, false);
464     }
465 
466     /**
467      * Returns an Iterator for fine-grained change and no-change edits
468      * (full granularity of change edits is retained).
469      * Can be used for modifying styled text.
470      * @return an Iterator that separates adjacent changes.
471      * @stable ICU 59
472      */
getFineIterator()473     Iterator getFineIterator() const {
474         return Iterator(array, length, false, false);
475     }
476 
477     /**
478      * Merges the two input Edits and appends the result to this object.
479      *
480      * Consider two string transformations (for example, normalization and case mapping)
481      * where each records Edits in addition to writing an output string.<br>
482      * Edits ab reflect how substrings of input string a
483      * map to substrings of intermediate string b.<br>
484      * Edits bc reflect how substrings of intermediate string b
485      * map to substrings of output string c.<br>
486      * This function merges ab and bc such that the additional edits
487      * recorded in this object reflect how substrings of input string a
488      * map to substrings of output string c.
489      *
490      * If unrelated Edits are passed in where the output string of the first
491      * has a different length than the input string of the second,
492      * then a U_ILLEGAL_ARGUMENT_ERROR is reported.
493      *
494      * @param ab reflects how substrings of input string a
495      *     map to substrings of intermediate string b.
496      * @param bc reflects how substrings of intermediate string b
497      *     map to substrings of output string c.
498      * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
499      *                  or else the function returns immediately. Check for U_FAILURE()
500      *                  on output or use with function chaining. (See User Guide for details.)
501      * @return *this, with the merged edits appended
502      * @stable ICU 60
503      */
504     Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);
505 
506 private:
507     void releaseArray() U_NOEXCEPT;
508     Edits &copyArray(const Edits &other);
509     Edits &moveArray(Edits &src) U_NOEXCEPT;
510 
setLastUnit(int32_t last)511     void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }
lastUnit()512     int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
513 
514     void append(int32_t r);
515     UBool growArray();
516 
517     static const int32_t STACK_CAPACITY = 100;
518     uint16_t *array;
519     int32_t capacity;
520     int32_t length;
521     int32_t delta;
522     int32_t numChanges;
523     UErrorCode errorCode_;
524     uint16_t stackArray[STACK_CAPACITY];
525 };
526 
527 U_NAMESPACE_END
528 
529 #endif /* U_SHOW_CPLUSPLUS_API */
530 
531 #endif  // __EDITS_H__
532