1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6 #ifndef nsBidiUtils_h__
7 #define nsBidiUtils_h__
8
9 #include "nsString.h"
10
extern "C" {

// Declared with C linkage and defined outside this header; the comment in
// HasRTLChars() below indicates the implementation is on the Rust side.
// HasRTLChars() passes a UTF-16 buffer pointer plus its length and returns
// the result as "the string contains right-to-left characters".
// NOTE(review): `len` is presumably a count of char16_t units, not bytes --
// confirm against the implementation.
bool encoding_mem_is_utf16_bidi(char16_t const* buffer, size_t len);
}
15
16 /**
17 * Read ftp://ftp.unicode.org/Public/UNIDATA/ReadMe-Latest.txt
18 * section BIDIRECTIONAL PROPERTIES
19 * for the detailed definition of the following categories
20 *
21 * The values here must match the equivalents in %bidicategorycode in
22 * mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl,
23 * and must also match the values used by ICU's UCharDirection.
24 */
25
// Bidirectional character categories. The explicit numeric values are part
// of the contract stated in the comment above (they must match the generator
// script and ICU's UCharDirection), so do not renumber or reorder them.
enum nsCharType {
  eCharType_LeftToRight = 0,
  eCharType_RightToLeft = 1,
  eCharType_EuropeanNumber = 2,
  eCharType_EuropeanNumberSeparator = 3,
  eCharType_EuropeanNumberTerminator = 4,
  eCharType_ArabicNumber = 5,
  eCharType_CommonNumberSeparator = 6,
  eCharType_BlockSeparator = 7,
  eCharType_SegmentSeparator = 8,
  eCharType_WhiteSpaceNeutral = 9,
  eCharType_OtherNeutral = 10,
  eCharType_LeftToRightEmbedding = 11,
  eCharType_LeftToRightOverride = 12,
  eCharType_RightToLeftArabic = 13,
  eCharType_RightToLeftEmbedding = 14,
  eCharType_RightToLeftOverride = 15,
  eCharType_PopDirectionalFormat = 16,
  eCharType_DirNonSpacingMark = 17,
  eCharType_BoundaryNeutral = 18,
  eCharType_FirstStrongIsolate = 19,
  eCharType_LeftToRightIsolate = 20,
  eCharType_RightToLeftIsolate = 21,
  eCharType_PopDirectionalIsolate = 22,
  // Not a category itself: it has no explicit value, so it is one past the
  // last category (23), i.e. the number of categories listed above.
  eCharType_CharTypeCount
};

/**
 * Typedef so the bidi character category enum can be named simply as
 * `nsCharType`.
 * NOTE(review): the previous wording here described "the language
 * directional property of a character set", which does not match the
 * per-character categories in the enum -- it looks stale; confirm.
 */
typedef enum nsCharType nsCharType;
57
/**
 * Find the direction of an embedding level or paragraph level set by
 * the Unicode Bidi Algorithm. (Even levels are left-to-right, odd
 * levels right-to-left.)
 *
 * Evaluates to true when the lowest bit of `level` is set.
 */
#define IS_LEVEL_RTL(level) (((level)&1) == 1)

/**
 * Check whether two bidi levels have the same parity and thus the same
 * directionality.
 *
 * Each argument is parenthesized before being XOR-ed so that argument
 * expressions using operators that bind more loosely than `^` (for example
 * `a | b` or `c ? x : y`) are combined as a whole rather than being split
 * by operator precedence.
 */
#define IS_SAME_DIRECTION(level1, level2) ((((level1) ^ (level2)) & 1) == 0)

/**
 * Convert from nsBidiLevel to nsBidiDirection.
 *
 * Yields NSBIDI_RTL for odd levels and NSBIDI_LTR for even ones; those two
 * names are not defined in this header and must be visible at the point of
 * use.
 */
#define DIRECTION_FROM_LEVEL(level) \
  ((IS_LEVEL_RTL(level)) ? NSBIDI_RTL : NSBIDI_LTR)
76
/**
 * Classification of bidirectional character types (nsCharType values) by
 * category. Both macros evaluate `val` more than once, so pass an
 * expression without side effects.
 */

// True for the two strong right-to-left categories.
#define CHARTYPE_IS_RTL(val) \
  ((eCharType_RightToLeft == (val)) || (eCharType_RightToLeftArabic == (val)))

// True for the European number separator and terminator categories, and for
// every value numerically above eCharType_ArabicNumber other than
// eCharType_RightToLeftArabic.
#define CHARTYPE_IS_WEAK(val)                       \
  ((eCharType_EuropeanNumberSeparator == (val)) ||  \
   (eCharType_EuropeanNumberTerminator == (val)) || \
   ((eCharType_ArabicNumber < (val)) &&             \
    (eCharType_RightToLeftArabic != (val))))
89
/**
 * Inspects a Unichar (a single char16_t), converting numbers to Arabic or
 * Hindi forms and returning them.
 *
 * Terminology used here follows the constants in this header: "Hindi" digits
 * are U+0660-U+0669 and "Arabic" digits are U+0030-U+0039.
 *
 * @param aChar is the character
 * @param aPrevCharArabic is true if the previous character in the string is
 *                        an Arabic char
 * @param aNumFlag specifies the conversion to perform:
 *        IBMBIDI_NUMERAL_NOMINAL:      don't do any conversion
 *        IBMBIDI_NUMERAL_HINDI:        convert to Hindi forms
 *                                      (Unicode 0660-0669)
 *        IBMBIDI_NUMERAL_ARABIC:       convert to Arabic forms
 *                                      (Unicode 0030-0039)
 *        IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to
 *                                      Hindi, otherwise to Arabic
 *        NOTE(review): this header also defines IBMBIDI_NUMERAL_REGULAR,
 *        IBMBIDI_NUMERAL_PERSIANCONTEXT and IBMBIDI_NUMERAL_PERSIAN; how
 *        they are handled is not described here -- check the implementation.
 * @return the converted Unichar
 */
char16_t HandleNumberInChar(char16_t aChar, bool aPrevCharArabic,
                            uint32_t aNumFlag);
108
/**
 * Scan a Unichar (char16_t) string, converting numbers to Arabic or Hindi
 * forms in place.
 *
 * Terminology used here follows the constants in this header: "Hindi" digits
 * are U+0660-U+0669 and "Arabic" digits are U+0030-U+0039.
 *
 * @param aBuffer is the string; it is modified in place
 * @param aSize is the size of aBuffer
 *        NOTE(review): presumably a count of char16_t units rather than
 *        bytes -- confirm against the implementation.
 * @param aNumFlag specifies the conversion to perform:
 *        IBMBIDI_NUMERAL_NOMINAL:      don't do any conversion
 *        IBMBIDI_NUMERAL_HINDI:        convert to Hindi forms
 *                                      (Unicode 0660-0669)
 *        IBMBIDI_NUMERAL_ARABIC:       convert to Arabic forms
 *                                      (Unicode 0030-0039)
 *        IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to
 *                                      Hindi, otherwise to Arabic
 *        NOTE(review): this header also defines IBMBIDI_NUMERAL_REGULAR,
 *        IBMBIDI_NUMERAL_PERSIANCONTEXT and IBMBIDI_NUMERAL_PERSIAN; how
 *        they are handled is not described here -- check the implementation.
 * @return an nsresult status code (the meaning of specific values is not
 *         documented in this header)
 */
nsresult HandleNumbers(char16_t* aBuffer, uint32_t aSize, uint32_t aNumFlag);
124
/**
 * Code points of the bidi control characters tested by IsBidiControl() and
 * IsBidiControlRTL().
 */
#define LRM_CHAR 0x200e
#define RLM_CHAR 0x200f

#define LRE_CHAR 0x202a
#define RLE_CHAR 0x202b
#define PDF_CHAR 0x202c
#define LRO_CHAR 0x202d
#define RLO_CHAR 0x202e

#define LRI_CHAR 0x2066
#define RLI_CHAR 0x2067
#define FSI_CHAR 0x2068
#define PDI_CHAR 0x2069

#define ALM_CHAR 0x061C

/**
 * Given a UTF-32 codepoint,
 * return true if the codepoint is a Bidi control character (LRM, RLM, ALM;
 * LRE, RLE, PDF, LRO, RLO; LRI, RLI, FSI, PDI).
 * Return false otherwise, including for any value whose upper bits are set
 * but whose low bits happen to coincide with one of these code points.
 */
inline bool IsBidiControl(uint32_t aChar) {
  // LRE..RLO and LRI..PDI are contiguous, so each group is one range check.
  // LRM and RLM are compared exactly: folding them together by masking off
  // the low bit with a 24-bit mask would also accept values that have bits
  // set above bit 23 (e.g. 0x0100200E).
  return (LRE_CHAR <= aChar && aChar <= RLO_CHAR) ||
         (LRI_CHAR <= aChar && aChar <= PDI_CHAR) || aChar == ALM_CHAR ||
         aChar == LRM_CHAR || aChar == RLM_CHAR;
}
151
152 /**
153 * Give a UTF-32 codepoint
154 * Return true if the codepoint is a Bidi control character that may result
155 * in RTL directionality and therefore needs to trigger bidi resolution;
156 * return false otherwise.
157 */
IsBidiControlRTL(uint32_t aChar)158 inline bool IsBidiControlRTL(uint32_t aChar) {
159 return aChar == RLM_CHAR || aChar == RLE_CHAR || aChar == RLO_CHAR ||
160 aChar == RLI_CHAR || aChar == ALM_CHAR;
161 }
162
163 /**
164 * Give a 16-bit (UTF-16) text buffer
165 * @return true if the string contains right-to-left characters
166 */
HasRTLChars(mozilla::Span<const char16_t> aBuffer)167 inline bool HasRTLChars(mozilla::Span<const char16_t> aBuffer) {
168 // Span ensures we never pass a nullptr to Rust--even if the
169 // length of the buffer is zero.
170 return encoding_mem_is_utf16_bidi(aBuffer.Elements(), aBuffer.Length());
171 }
172
// These values are shared with the Preferences dialog.
// ------------------
// If Pref values are to be changed
// in the XUL file of Prefs, the values
// must be changed here too.
// ------------------
//
// In each value list below, the entry whose trailing comment ends in '*' is
// the one used by IBMBIDI_DEFAULT_BIDI_OPTIONS.
//
// Preference names:
#define IBMBIDI_TEXTDIRECTION_STR "bidi.direction"
#define IBMBIDI_TEXTTYPE_STR "bidi.texttype"
#define IBMBIDI_NUMERAL_STR "bidi.numeral"

// ------------------
//  Text Direction
// ------------------
//  bidi.direction
#define IBMBIDI_TEXTDIRECTION_LTR 1  //  1 = directionLTRBidi *
#define IBMBIDI_TEXTDIRECTION_RTL 2  //  2 = directionRTLBidi
// ------------------
//  Text Type
// ------------------
//  bidi.texttype
#define IBMBIDI_TEXTTYPE_CHARSET 1  //  1 = charsettexttypeBidi *
#define IBMBIDI_TEXTTYPE_LOGICAL 2  //  2 = logicaltexttypeBidi
#define IBMBIDI_TEXTTYPE_VISUAL 3   //  3 = visualtexttypeBidi
// ------------------
//  Numeral Style
// ------------------
//  bidi.numeral
#define IBMBIDI_NUMERAL_NOMINAL 0         //  0 = nominalnumeralBidi *
#define IBMBIDI_NUMERAL_REGULAR 1         //  1 = regularcontextnumeralBidi
#define IBMBIDI_NUMERAL_HINDICONTEXT 2    //  2 = hindicontextnumeralBidi
#define IBMBIDI_NUMERAL_ARABIC 3          //  3 = arabicnumeralBidi
#define IBMBIDI_NUMERAL_HINDI 4           //  4 = hindinumeralBidi
#define IBMBIDI_NUMERAL_PERSIANCONTEXT 5  //  5 = persiancontextnumeralBidi
#define IBMBIDI_NUMERAL_PERSIAN 6         //  6 = persiannumeralBidi
208
// The bidi options are packed into one integer, 4 bits per field:
//   bits 0-3  : text direction (IBMBIDI_TEXTDIRECTION_*)
//   bits 4-7  : text type      (IBMBIDI_TEXTTYPE_*)
//   bits 8-11 : numeral style  (IBMBIDI_NUMERAL_*)

// Default packed value: LTR direction, charset text type, nominal numerals.
#define IBMBIDI_DEFAULT_BIDI_OPTIONS                                    \
  ((IBMBIDI_TEXTDIRECTION_LTR << 0) | (IBMBIDI_TEXTTYPE_CHARSET << 4) | \
   (IBMBIDI_NUMERAL_NOMINAL << 8))

// Field extractors: shift the field down to bit 0 and keep its 4 bits.
#define GET_BIDI_OPTION_DIRECTION(bo) \
  (((bo) >> 0) & 0x0000000F) /* 4 bits for DIRECTION */
#define GET_BIDI_OPTION_TEXTTYPE(bo) \
  (((bo) >> 4) & 0x0000000F) /* 4 bits for TEXTTYPE */
#define GET_BIDI_OPTION_NUMERAL(bo) \
  (((bo) >> 8) & 0x0000000F) /* 4 bits for NUMERAL */

// Field setters: clear the field in `bo`, then OR in the low 4 bits of the
// new value. `bo` must be an assignable lvalue and is evaluated twice.
// Each macro expands to a brace-enclosed block, not an expression, so it
// cannot be used where a value is expected, and writing
// `if (c) SET_...(bo, v); else ...` does not compile because the `;` after
// the closing brace ends the `if` statement.
// The clearing masks are 32-bit constants, so any bits of `bo` above bit 31
// would be cleared as well if `bo` were wider than 32 bits.
#define SET_BIDI_OPTION_DIRECTION(bo, dir) \
  { (bo) = ((bo)&0xFFFFFFF0) | (((dir)&0x0000000F) << 0); }
#define SET_BIDI_OPTION_TEXTTYPE(bo, tt) \
  { (bo) = ((bo)&0xFFFFFF0F) | (((tt)&0x0000000F) << 4); }
#define SET_BIDI_OPTION_NUMERAL(bo, num) \
  { (bo) = ((bo)&0xFFFFF0FF) | (((num)&0x0000000F) << 8); }
226
/* Constants related to the position of numerics in the codepage.
 * Each START/END pair is an inclusive range of ten code points. */
#define START_HINDI_DIGITS 0x0660
#define END_HINDI_DIGITS 0x0669
#define START_ARABIC_DIGITS 0x0030
#define END_ARABIC_DIGITS 0x0039
#define START_FARSI_DIGITS 0x06f0
#define END_FARSI_DIGITS 0x06f9

/* Range membership tests for the three digit sets above; `u` is evaluated
 * twice. */
#define IS_HINDI_DIGIT(u) \
  ((START_HINDI_DIGITS <= (u)) && (END_HINDI_DIGITS >= (u)))
#define IS_ARABIC_DIGIT(u) \
  ((START_ARABIC_DIGITS <= (u)) && (END_ARABIC_DIGITS >= (u)))
#define IS_FARSI_DIGIT(u) \
  ((START_FARSI_DIGITS <= (u)) && (END_FARSI_DIGITS >= (u)))
/**
 * Arabic numeric separator and numeric formatting characters:
 * U+0600;ARABIC NUMBER SIGN
 * U+0601;ARABIC SIGN SANAH
 * U+0602;ARABIC FOOTNOTE MARKER
 * U+0603;ARABIC SIGN SAFHA
 * U+066A;ARABIC PERCENT SIGN
 * U+066B;ARABIC DECIMAL SEPARATOR
 * U+066C;ARABIC THOUSANDS SEPARATOR
 * U+06DD;ARABIC END OF AYAH
 *
 * The lower bound of the first range is checked explicitly so the macro is
 * correct on its own: without it every value up to U+0603 (all of ASCII,
 * Hebrew, ...) would be reported as an Arabic separator unless the caller
 * had already established that the character is in the Arabic range.
 * `u` is evaluated more than once.
 */
#define IS_ARABIC_SEPARATOR(u)                                             \
  (((u) >= 0x0600 && (u) <= 0x0603) || ((u) >= 0x066A && (u) <= 0x066C) || \
   ((u) == 0x06DD))
254
// True when `u` is one of the listed combining-mark code points: the first
// seven tests fall inside the range IS_HEBREW_CHAR treats as Hebrew, the
// remaining ones inside the range IS_ARABIC_CHAR treats as Arabic.
// `u` is evaluated more than once.
#define IS_BIDI_DIACRITIC(u)            \
  ((0x0591 <= (u) && 0x05A1 >= (u)) || \
   (0x05A3 <= (u) && 0x05B9 >= (u)) || \
   (0x05BB <= (u) && 0x05BD >= (u)) || \
   (0x05BF == (u)) ||                  \
   (0x05C1 == (u)) ||                  \
   (0x05C2 == (u)) ||                  \
   (0x05C4 == (u)) ||                  \
   (0x064B <= (u) && 0x0652 >= (u)) || \
   (0x0670 == (u)) ||                  \
   (0x06D7 <= (u) && 0x06E4 >= (u)) || \
   (0x06E7 == (u)) ||                  \
   (0x06E8 == (u)) ||                  \
   (0x06EA <= (u) && 0x06ED >= (u)))
261
// Hebrew: U+0590-U+05FF, plus U+FB1D-U+FB4F.
// `c` is evaluated more than once.
#define IS_HEBREW_CHAR(c)               \
  (((c) >= 0x0590 && (c) <= 0x05FF) || \
   ((c) >= 0xfb1d && (c) <= 0xfb4f))

// Arabic: the parts of U+0600-U+08FF made up of U+0600-U+06FF,
// U+0750-U+077F and U+08A0-U+08FF.
// `c` is evaluated more than once.
#define IS_ARABIC_CHAR(c)               \
  (((c) >= 0x0600 && (c) <= 0x06ff) || \
   ((c) >= 0x0750 && (c) <= 0x077f) || \
   ((c) >= 0x08a0 && (c) <= 0x08FF))
// An Arabic character (per IS_ARABIC_CHAR) that is neither a Hindi digit,
// a Farsi digit, nor an Arabic separator. `c` is evaluated more than once.
#define IS_ARABIC_ALPHABETIC(c)                    \
  (IS_ARABIC_CHAR(c) && !IS_HINDI_DIGIT(c) && \
   !IS_FARSI_DIGIT(c) && !IS_ARABIC_SEPARATOR(c))
270
271 /**
272 * The codepoint ranges in the following macros are based on the blocks
273 * allocated, or planned to be allocated, to right-to-left characters in the
274 * BMP (Basic Multilingual Plane) and SMP (Supplementary Multilingual Plane)
275 * according to
276 * http://unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt and
277 * http://www.unicode.org/roadmaps/
278 */
279
// U+0590-U+08FF.
#define IS_IN_BMP_RTL_BLOCK(c) (((c) >= 0x590) && ((c) <= 0x8ff))
// U+FB1D-U+FDFF and U+FE70-U+FEFC.
#define IS_RTL_PRESENTATION_FORM(c)         \
  ((((c) >= 0xfb1d) && ((c) <= 0xfdff)) || \
   (((c) >= 0xfe70) && ((c) <= 0xfefc)))
// U+10800-U+10FFF and U+1E800-U+1EFFF.
#define IS_IN_SMP_RTL_BLOCK(c)                \
  ((((c) >= 0x10800) && ((c) <= 0x10fff)) || \
   (((c) >= 0x1e800) && ((c) <= 0x1eFFF)))
// The supplementary-plane RTL blocks can be recognized from the high
// surrogate alone, without looking at the low surrogate, so this
// per-code-unit check may be applied to potentially astral text without
// decoding surrogate pairs into code points. The flip side is that an
// unpaired high surrogate belonging to one of those blocks is counted as
// RTL even though it would not be RTL if it were replaced by the
// REPLACEMENT CHARACTER.
#define UTF16_CODE_UNIT_IS_BIDI(c)                                    \
  ((IS_IN_BMP_RTL_BLOCK(c)) || (IS_RTL_PRESENTATION_FORM(c)) ||      \
   0xD802 == (c) || 0xD803 == (c) || 0xD83A == (c) || 0xD83B == (c))
// Code-point form of the check: BMP block, presentation forms, or SMP blocks.
#define UTF32_CHAR_IS_BIDI(c)                                   \
  ((IS_IN_BMP_RTL_BLOCK(c)) || (IS_RTL_PRESENTATION_FORM(c)) || \
   (IS_IN_SMP_RTL_BLOCK(c)))
299 #endif /* nsBidiUtils_h__ */
300