1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2014-2016, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * dictionarydata.cpp
9 *
10 * created on: 2012may31
11 * created by: Markus W. Scherer & Maxime Serrano
12 */
13 
14 #include "dictionarydata.h"
15 #include "unicode/ucharstrie.h"
16 #include "unicode/bytestrie.h"
17 #include "unicode/udata.h"
18 #include "cmemory.h"
19 
20 #if !UCONFIG_NO_BREAK_ITERATION
21 
22 U_NAMESPACE_BEGIN
23 
24 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
25 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
26 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
27 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
28 
29 const int32_t  DictionaryData::TRANSFORM_NONE = 0;
30 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
31 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
32 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
33 
~DictionaryMatcher()34 DictionaryMatcher::~DictionaryMatcher() {
35 }
36 
~UCharsDictionaryMatcher()37 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
38     udata_close(file);
39 }
40 
getType() const41 int32_t UCharsDictionaryMatcher::getType() const {
42     return DictionaryData::TRIE_TYPE_UCHARS;
43 }
44 
matches(UText * text,int32_t maxLength,int32_t limit,int32_t * lengths,int32_t * cpLengths,int32_t * values,int32_t * prefix) const45 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
46                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
47                             int32_t *prefix) const {
48 
49     UCharsTrie uct(characters);
50     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
51     int32_t wordCount = 0;
52     int32_t codePointsMatched = 0;
53 
54     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
55         UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
56         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
57         codePointsMatched += 1;
58         if (USTRINGTRIE_HAS_VALUE(result)) {
59             if (wordCount < limit) {
60                 if (values != NULL) {
61                     values[wordCount] = uct.getValue();
62                 }
63                 if (lengths != NULL) {
64                     lengths[wordCount] = lengthMatched;
65                 }
66                 if (cpLengths != NULL) {
67                     cpLengths[wordCount] = codePointsMatched;
68                 }
69                 ++wordCount;
70             }
71             if (result == USTRINGTRIE_FINAL_VALUE) {
72                 break;
73             }
74         }
75         else if (result == USTRINGTRIE_NO_MATCH) {
76             break;
77         }
78         if (lengthMatched >= maxLength) {
79             break;
80         }
81     }
82 
83     if (prefix != NULL) {
84         *prefix = codePointsMatched;
85     }
86     return wordCount;
87 }
88 
~BytesDictionaryMatcher()89 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
90     udata_close(file);
91 }
92 
transform(UChar32 c) const93 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
94     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
95         if (c == 0x200D) {
96             return 0xFF;
97         } else if (c == 0x200C) {
98             return 0xFE;
99         }
100         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
101         if (delta < 0 || 0xFD < delta) {
102             return U_SENTINEL;
103         }
104         return (UChar32)delta;
105     }
106     return c;
107 }
108 
getType() const109 int32_t BytesDictionaryMatcher::getType() const {
110     return DictionaryData::TRIE_TYPE_BYTES;
111 }
112 
matches(UText * text,int32_t maxLength,int32_t limit,int32_t * lengths,int32_t * cpLengths,int32_t * values,int32_t * prefix) const113 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
114                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
115                             int32_t *prefix) const {
116     BytesTrie bt(characters);
117     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
118     int32_t wordCount = 0;
119     int32_t codePointsMatched = 0;
120 
121     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
122         UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
123         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
124         codePointsMatched += 1;
125         if (USTRINGTRIE_HAS_VALUE(result)) {
126             if (wordCount < limit) {
127                 if (values != NULL) {
128                     values[wordCount] = bt.getValue();
129                 }
130                 if (lengths != NULL) {
131                     lengths[wordCount] = lengthMatched;
132                 }
133                 if (cpLengths != NULL) {
134                     cpLengths[wordCount] = codePointsMatched;
135                 }
136                 ++wordCount;
137             }
138             if (result == USTRINGTRIE_FINAL_VALUE) {
139                 break;
140             }
141         }
142         else if (result == USTRINGTRIE_NO_MATCH) {
143             break;
144         }
145         if (lengthMatched >= maxLength) {
146             break;
147         }
148     }
149 
150     if (prefix != NULL) {
151         *prefix = codePointsMatched;
152     }
153     return wordCount;
154 }
155 
156 
157 U_NAMESPACE_END
158 
159 U_NAMESPACE_USE
160 
161 U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)162 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
163            void *outData, UErrorCode *pErrorCode) {
164     const UDataInfo *pInfo;
165     int32_t headerSize;
166     const uint8_t *inBytes;
167     uint8_t *outBytes;
168     const int32_t *inIndexes;
169     int32_t indexes[DictionaryData::IX_COUNT];
170     int32_t i, offset, size;
171 
172     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
173     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
174     pInfo = (const UDataInfo *)((const char *)inData + 4);
175     if (!(pInfo->dataFormat[0] == 0x44 &&
176           pInfo->dataFormat[1] == 0x69 &&
177           pInfo->dataFormat[2] == 0x63 &&
178           pInfo->dataFormat[3] == 0x74 &&
179           pInfo->formatVersion[0] == 1)) {
180         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
181                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
182         *pErrorCode = U_UNSUPPORTED_ERROR;
183         return 0;
184     }
185 
186     inBytes = (const uint8_t *)inData + headerSize;
187     outBytes = (uint8_t *)outData + headerSize;
188 
189     inIndexes = (const int32_t *)inBytes;
190     if (length >= 0) {
191         length -= headerSize;
192         if (length < (int32_t)(sizeof(indexes))) {
193             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
194             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195             return 0;
196         }
197     }
198 
199     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
200         indexes[i] = udata_readInt32(ds, inIndexes[i]);
201     }
202 
203     size = indexes[DictionaryData::IX_TOTAL_SIZE];
204 
205     if (length >= 0) {
206         if (length < size) {
207             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
208             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
209             return 0;
210         }
211 
212         if (inBytes != outBytes) {
213             uprv_memcpy(outBytes, inBytes, size);
214         }
215 
216         offset = 0;
217         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
218         offset = (int32_t)sizeof(indexes);
219         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
220         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
221 
222         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
223             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
224         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
225             // nothing to do
226         } else {
227             udata_printError(ds, "udict_swap(): unknown trie type!\n");
228             *pErrorCode = U_UNSUPPORTED_ERROR;
229             return 0;
230         }
231 
232         // these next two sections are empty in the current format,
233         // but may be used later.
234         offset = nextOffset;
235         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
236         offset = nextOffset;
237         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
238         offset = nextOffset;
239     }
240     return headerSize + size;
241 }
242 #endif
243