1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // file: rbbi_cache.h
5 //
6 #ifndef RBBI_CACHE_H
7 #define RBBI_CACHE_H
8 
9 #include "unicode/utypes.h"
10 
11 #if !UCONFIG_NO_BREAK_ITERATION
12 
13 #include "unicode/rbbi.h"
14 #include "unicode/uobject.h"
15 
16 #include "uvectr32.h"
17 
18 U_NAMESPACE_BEGIN
19 
20 /* DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
21  *                 Dictionary boundaries are moved first to this cache, then from here
22  *                 to the main BreakCache, where they may inter-leave with non-dictionary
23  *                 boundaries. The public BreakIterator API always fetches directly
24  *                 from the main BreakCache, not from here.
25  *
26  *                 In common situations, the number of boundaries in a single dictionary run
27  *                 should be quite small, it will be terminated by punctuation, spaces,
28  *                 or any other non-dictionary characters. The main BreakCache may end
29  *                 up with boundaries from multiple dictionary based runs.
30  *
31  *                 The boundaries are stored in a simple ArrayList (vector), with the
32  *                 assumption that they will be accessed sequentially.
33  */
34 class RuleBasedBreakIterator::DictionaryCache: public UMemory {
35   public:
36      DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
37      ~DictionaryCache();
38 
39      void reset();
40 
41      UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
42      UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
43 
44     /**
45      * Populate the cache with the dictionary based boundaries within a region of text.
46      * @param startPos  The start position of a range of text
47      * @param endPos    The end position of a range of text
48      * @param firstRuleStatus The rule status index that applies to the break at startPos
49      * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
50      * @internal
51      */
52     void populateDictionary(int32_t startPos, int32_t endPos,
53                          int32_t firstRuleStatus, int32_t otherRuleStatus);
54 
55 
56 
57     RuleBasedBreakIterator *fBI;
58 
59     UVector32           fBreaks;                // A vector containing the boundaries.
60     int32_t             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
61                                                 //    or preceding(). Optimizes sequential access.
62     int32_t             fStart;                 // Text position of first boundary in cache.
63     int32_t             fLimit;                 // Last boundary in cache. Which is the limit of the
64                                                 //    text segment being handled by the dictionary.
65     int32_t             fFirstRuleStatusIndex;  // Rule status info for first boundary.
66     int32_t             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
67 };
68 
69 
70 /*
71  * class BreakCache
72  *
73  * Cache of break boundary positions and rule status values.
74  * Break iterator API functions, next(), previous(), etc., will use cached results
75  * when possible, and otherwise cache new results as they are obtained.
76  *
77  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
78  *
79  * The cache is implemented as a single circular buffer.
80  */
81 
82 /*
83  * size of the circular cache buffer.
84  */
85 
86 class RuleBasedBreakIterator::BreakCache: public UMemory {
87   public:
88                 BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
89     virtual     ~BreakCache();
90     void        reset(int32_t pos = 0, int32_t ruleStatus = 0);
next()91     void        next() {    if (fBufIdx == fEndBufIdx) {
92                                 nextOL();
93                             } else {
94                                 fBufIdx = modChunkSize(fBufIdx + 1);
95                                 fTextIdx = fBI->fPosition = fBoundaries[fBufIdx];
96                                 fBI->fRuleStatusIndex = fStatuses[fBufIdx];
97                             }
98                 }
99 
100 
101     void        nextOL();
102     void        previous(UErrorCode &status);
103 
104     // Move the iteration state to the position following the startPosition.
105     // Input position must be pinned to the input length.
106     void        following(int32_t startPosition, UErrorCode &status);
107 
108     void        preceding(int32_t startPosition, UErrorCode &status);
109 
110     /*
111      * Update the state of the public BreakIterator (fBI) to reflect the
112      * current state of the break iterator cache (this).
113      */
114     int32_t     current();
115 
116     /**
117      * Add boundaries to the cache near the specified position.
118      * The given position need not be a boundary itself.
119      * The input position must be within the range of the text, and
120      * on a code point boundary.
121      * If the requested position is a break boundary, leave the iteration
122      * position on it.
123      * If the requested position is not a boundary, leave the iteration
124      * position on the preceding boundary and include both the
125      * preceding and following boundaries in the cache.
126      * Additional boundaries, either preceding or following, may be added
127      * to the cache as a side effect.
128      *
129      * Return FALSE if the operation failed.
130      */
131     UBool populateNear(int32_t position, UErrorCode &status);
132 
133     /**
134      *  Add boundary(s) to the cache following the current last boundary.
135      *  Return FALSE if at the end of the text, and no more boundaries can be added.
136      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
137      */
138     UBool populateFollowing();
139 
140     /**
141      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
142      *  Leave the iteration position on the first added boundary.
143      *  Return false if no boundaries could be added (if at the start of the text.)
144      */
145     UBool populatePreceding(UErrorCode &status);
146 
147     enum UpdatePositionValues {
148         RetainCachePosition = 0,
149         UpdateCachePosition = 1
150     };
151 
152     /*
153      * Add the boundary following the current position.
154      * The current position can be left as it was, or changed to the newly added boundary,
155      * as specified by the update parameter.
156      */
157     void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
158 
159 
160     /*
161      * Add the boundary preceding the current position.
162      * The current position can be left as it was, or changed to the newly added boundary,
163      * as specified by the update parameter.
164      */
165     bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
166 
167     /**
168      *  Set the cache position to the specified position, or, if the position
169      *  falls between to cached boundaries, to the preceding boundary.
170      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
171      *  The startPosition must be on a code point boundary.
172      *
173      *  Return TRUE if successful, FALSE if the specified position is after
174      *  the last cached boundary or before the first.
175      */
176     UBool                   seek(int32_t startPosition);
177 
178     void dumpCache();
179 
180   private:
modChunkSize(int index)181     static inline int32_t   modChunkSize(int index) { return index & (CACHE_SIZE - 1); }
182 
183     static constexpr int32_t CACHE_SIZE = 128;
184     static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
185 
186     RuleBasedBreakIterator *fBI;
187     int32_t                 fStartBufIdx;
188     int32_t                 fEndBufIdx;    // inclusive
189 
190     int32_t                 fTextIdx;
191     int32_t                 fBufIdx;
192 
193     int32_t                 fBoundaries[CACHE_SIZE];
194     uint16_t                fStatuses[CACHE_SIZE];
195 
196     UVector32               fSideBuffer;
197 };
198 
199 U_NAMESPACE_END
200 
201 #endif // #if !UCONFIG_NO_BREAK_ITERATION
202 
203 #endif // RBBI_CACHE_H
204