/*
*************************************************************************
*   © 2016 and later: Unicode, Inc. and others.
*   License & terms of use: http://www.unicode.org/copyright.html
*************************************************************************
*************************************************************************
*   Copyright (C) 2007, International Business Machines
*   Corporation and others.  All Rights Reserved.
*************************************************************************
*   file name:  trieset.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2007jan15
*   created by: Markus Scherer
*
*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
*   using a UTrie with 8-bit (byte) results per code point.
*   Modifies the trie index to make the BMP linear, and uses the original set
*   for supplementary code points.
*/
23 
24 #include "unicode/utypes.h"
25 #include "unicont.h"
26 
/*
 * Byte-wise view of a trie's data array, starting UTRIE_DATA_BLOCK_LENGTH
 * bytes past its beginning. TrieSet indexes the result directly with
 * code points 0..0xff.
 *
 * t: pointer to a structure with a data32 member.
 */
#define UTRIE_GET8_LATIN1(t) \
    ((const uint8_t *)(t)->data32+UTRIE_DATA_BLOCK_LENGTH)

/*
 * Looks up the byte value for a 16-bit unit:
 * the index entry selected by the unit's upper bits (unit>>UTRIE_SHIFT)
 * is scaled by UTRIE_INDEX_SHIFT to a data block offset, the unit's lower
 * bits (unit&UTRIE_MASK) select the byte inside that block.
 *
 * t:    pointer to a structure with index and data32 members.
 * unit: 16-bit value; note that it is evaluated twice.
 */
#define UTRIE_GET8_FROM_LEAD(t, unit) \
    ((const uint8_t *)(t)->data32)[ \
        ((int32_t)((t)->index[(unit)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
        ((unit)&UTRIE_MASK) \
    ]
34 
35 class TrieSet : public UObject, public UnicodeContainable {
36 public:
TrieSet(const UnicodeSet & set,UErrorCode & errorCode)37     TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
38             : trieData(NULL), latin1(NULL), restSet(set.clone()) {
39         if(U_FAILURE(errorCode)) {
40             return;
41         }
42         if(restSet==NULL) {
43             errorCode=U_MEMORY_ALLOCATION_ERROR;
44             return;
45         }
46 
47         UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE);
48         UChar32 start, end;
49 
50         UnicodeSetIterator iter(set);
51 
52         while(iter.nextRange() && !iter.isString()) {
53             start=iter.getCodepoint();
54             end=iter.getCodepointEnd();
55             if(start>0xffff) {
56                 break;
57             }
58             if(end>0xffff) {
59                 end=0xffff;
60             }
61             if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) {
62                 errorCode=U_INTERNAL_PROGRAM_ERROR;
63                 return;
64             }
65         }
66 
67         // Preflight the trie length.
68         int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode);
69         if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
70             return;
71         }
72 
73         trieData=(uint32_t *)uprv_malloc(length);
74         if(trieData==NULL) {
75             errorCode=U_MEMORY_ALLOCATION_ERROR;
76             return;
77         }
78 
79         errorCode=U_ZERO_ERROR;
80         utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode);
81         utrie_unserialize(&trie, trieData, length, &errorCode);  // TODO: Implement for 8-bit UTrie!
82 
83         if(U_SUCCESS(errorCode)) {
84             // Copy the indexes for surrogate code points into the BMP range
85             // for simple access across the entire BMP.
86             uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
87                         trie.index+UTRIE_BMP_INDEX_LENGTH,
88                         (0x800>>UTRIE_SHIFT)*2);
89             latin1=UTRIE_GET8_LATIN1(&trie);
90         }
91 
92         restSet.remove(0, 0xffff);
93     }
94 
~TrieSet()95     ~TrieSet() {
96         uprv_free(trieData);
97         delete restSet;
98     }
99 
contains(UChar32 c) const100     UBool contains(UChar32 c) const {
101         if((uint32_t)c<=0xff) {
102             return (UBool)latin1[c];
103         } else if((uint32_t)c<0xffff) {
104             return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
105         } else {
106             return restSet->contains(c);
107         }
108     }
109 
110 private:
111     uint32_t *trieData;
112     const uint8_t *latin1;
113     UTrie trie;
114     UnicodeSet *restSet;
115 };
116