1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2014, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collation.cpp
9 *
10 * created on: 2010oct27
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "collation.h"
19 #include "uassert.h"
20 
21 U_NAMESPACE_BEGIN
22 
23 // Some compilers don't care if constants are defined in the .cpp file.
24 // MS Visual C++ does not like it, but gcc requires it. clang does not care.
25 #ifndef _MSC_VER
26 const uint8_t Collation::LEVEL_SEPARATOR_BYTE;
27 const uint8_t Collation::MERGE_SEPARATOR_BYTE;
28 const uint32_t Collation::ONLY_TERTIARY_MASK;
29 const uint32_t Collation::CASE_AND_TERTIARY_MASK;
30 #endif
31 
32 uint32_t
incTwoBytePrimaryByOffset(uint32_t basePrimary,UBool isCompressible,int32_t offset)33 Collation::incTwoBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible, int32_t offset) {
34     // Extract the second byte, minus the minimum byte value,
35     // plus the offset, modulo the number of usable byte values, plus the minimum.
36     // Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary.
37     uint32_t primary;
38     if(isCompressible) {
39         offset += ((int32_t)(basePrimary >> 16) & 0xff) - 4;
40         primary = (uint32_t)((offset % 251) + 4) << 16;
41         offset /= 251;
42     } else {
43         offset += ((int32_t)(basePrimary >> 16) & 0xff) - 2;
44         primary = (uint32_t)((offset % 254) + 2) << 16;
45         offset /= 254;
46     }
47     // First byte, assume no further overflow.
48     return primary | ((basePrimary & 0xff000000) + (uint32_t)(offset << 24));
49 }
50 
51 uint32_t
incThreeBytePrimaryByOffset(uint32_t basePrimary,UBool isCompressible,int32_t offset)52 Collation::incThreeBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible, int32_t offset) {
53     // Extract the third byte, minus the minimum byte value,
54     // plus the offset, modulo the number of usable byte values, plus the minimum.
55     offset += ((int32_t)(basePrimary >> 8) & 0xff) - 2;
56     uint32_t primary = (uint32_t)((offset % 254) + 2) << 8;
57     offset /= 254;
58     // Same with the second byte,
59     // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary.
60     if(isCompressible) {
61         offset += ((int32_t)(basePrimary >> 16) & 0xff) - 4;
62         primary |= (uint32_t)((offset % 251) + 4) << 16;
63         offset /= 251;
64     } else {
65         offset += ((int32_t)(basePrimary >> 16) & 0xff) - 2;
66         primary |= (uint32_t)((offset % 254) + 2) << 16;
67         offset /= 254;
68     }
69     // First byte, assume no further overflow.
70     return primary | ((basePrimary & 0xff000000) + (uint32_t)(offset << 24));
71 }
72 
73 uint32_t
decTwoBytePrimaryByOneStep(uint32_t basePrimary,UBool isCompressible,int32_t step)74 Collation::decTwoBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step) {
75     // Extract the second byte, minus the minimum byte value,
76     // minus the step, modulo the number of usable byte values, plus the minimum.
77     // Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary.
78     // Assume no further underflow for the first byte.
79     U_ASSERT(0 < step && step <= 0x7f);
80     int32_t byte2 = ((int32_t)(basePrimary >> 16) & 0xff) - step;
81     if(isCompressible) {
82         if(byte2 < 4) {
83             byte2 += 251;
84             basePrimary -= 0x1000000;
85         }
86     } else {
87         if(byte2 < 2) {
88             byte2 += 254;
89             basePrimary -= 0x1000000;
90         }
91     }
92     return (basePrimary & 0xff000000) | ((uint32_t)byte2 << 16);
93 }
94 
95 uint32_t
decThreeBytePrimaryByOneStep(uint32_t basePrimary,UBool isCompressible,int32_t step)96 Collation::decThreeBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step) {
97     // Extract the third byte, minus the minimum byte value,
98     // minus the step, modulo the number of usable byte values, plus the minimum.
99     U_ASSERT(0 < step && step <= 0x7f);
100     int32_t byte3 = ((int32_t)(basePrimary >> 8) & 0xff) - step;
101     if(byte3 >= 2) {
102         return (basePrimary & 0xffff0000) | ((uint32_t)byte3 << 8);
103     }
104     byte3 += 254;
105     // Same with the second byte,
106     // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary.
107     int32_t byte2 = ((int32_t)(basePrimary >> 16) & 0xff) - 1;
108     if(isCompressible) {
109         if(byte2 < 4) {
110             byte2 = 0xfe;
111             basePrimary -= 0x1000000;
112         }
113     } else {
114         if(byte2 < 2) {
115             byte2 = 0xff;
116             basePrimary -= 0x1000000;
117         }
118     }
119     // First byte, assume no further underflow.
120     return (basePrimary & 0xff000000) | ((uint32_t)byte2 << 16) | ((uint32_t)byte3 << 8);
121 }
122 
123 uint32_t
getThreeBytePrimaryForOffsetData(UChar32 c,int64_t dataCE)124 Collation::getThreeBytePrimaryForOffsetData(UChar32 c, int64_t dataCE) {
125     uint32_t p = (uint32_t)(dataCE >> 32);  // three-byte primary pppppp00
126     int32_t lower32 = (int32_t)dataCE;  // base code point b & step s: bbbbbbss (bit 7: isCompressible)
127     int32_t offset = (c - (lower32 >> 8)) * (lower32 & 0x7f);  // delta * increment
128     UBool isCompressible = (lower32 & 0x80) != 0;
129     return Collation::incThreeBytePrimaryByOffset(p, isCompressible, offset);
130 }
131 
132 uint32_t
unassignedPrimaryFromCodePoint(UChar32 c)133 Collation::unassignedPrimaryFromCodePoint(UChar32 c) {
134     // Create a gap before U+0000. Use c=-1 for [first unassigned].
135     ++c;
136     // Fourth byte: 18 values, every 14th byte value (gap of 13).
137     uint32_t primary = 2 + (c % 18) * 14;
138     c /= 18;
139     // Third byte: 254 values.
140     primary |= (2 + (c % 254)) << 8;
141     c /= 254;
142     // Second byte: 251 values 04..FE excluding the primary compression bytes.
143     primary |= (4 + (c % 251)) << 16;
144     // One lead byte covers all code points (c < 0x1182B4 = 1*251*254*18).
145     return primary | (UNASSIGNED_IMPLICIT_BYTE << 24);
146 }
147 
148 U_NAMESPACE_END
149 
150 #endif  // !UCONFIG_NO_COLLATION
151