1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3  * License, v. 2.0. If a copy of the MPL was not distributed with this
4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #ifndef nsBidiUtils_h__
7 #define nsBidiUtils_h__
8 
9 #include "nsString.h"
10 
extern "C" {

/**
 * Bidi check over a UTF-16 buffer. Only declared here (with C linkage); the
 * definition lives outside this header. HasRTLChars() in this file forwards
 * to it and reports the result as "contains right-to-left characters".
 *
 * @param buffer pointer to the first UTF-16 code unit
 * @param len    length passed alongside buffer
 */
bool encoding_mem_is_utf16_bidi(const char16_t* buffer, size_t len);

}  // extern "C"
15 
16 /**
17  *  Read ftp://ftp.unicode.org/Public/UNIDATA/ReadMe-Latest.txt
18  *  section BIDIRECTIONAL PROPERTIES
19  *  for the detailed definition of the following categories
20  *
21  *  The values here must match the equivalents in %bidicategorycode in
22  *  mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl,
23  *  and must also match the values used by ICU's UCharDirection.
24  */
25 
/**
 * Bidirectional character type of a character.
 *
 * Every enumerator is pinned to an explicit value because the numbering has
 * to stay in step with the external tables named in the comment preceding
 * this enum. The trailing abbreviations are the Unicode Bidi_Class short
 * names.
 */
typedef enum nsCharType {
  eCharType_LeftToRight = 0,              // L
  eCharType_RightToLeft = 1,              // R
  eCharType_EuropeanNumber = 2,           // EN
  eCharType_EuropeanNumberSeparator = 3,  // ES
  eCharType_EuropeanNumberTerminator = 4, // ET
  eCharType_ArabicNumber = 5,             // AN
  eCharType_CommonNumberSeparator = 6,    // CS
  eCharType_BlockSeparator = 7,           // B
  eCharType_SegmentSeparator = 8,         // S
  eCharType_WhiteSpaceNeutral = 9,        // WS
  eCharType_OtherNeutral = 10,            // ON
  eCharType_LeftToRightEmbedding = 11,    // LRE
  eCharType_LeftToRightOverride = 12,     // LRO
  eCharType_RightToLeftArabic = 13,       // AL
  eCharType_RightToLeftEmbedding = 14,    // RLE
  eCharType_RightToLeftOverride = 15,     // RLO
  eCharType_PopDirectionalFormat = 16,    // PDF
  eCharType_DirNonSpacingMark = 17,       // NSM
  eCharType_BoundaryNeutral = 18,         // BN
  eCharType_FirstStrongIsolate = 19,      // FSI
  eCharType_LeftToRightIsolate = 20,      // LRI
  eCharType_RightToLeftIsolate = 21,      // RLI
  eCharType_PopDirectionalIsolate = 22,   // PDI
  eCharType_CharTypeCount                 // number of types above; not a type
} nsCharType;
57 
/**
 * Direction of an embedding level or paragraph level set by the Unicode
 * Bidi Algorithm: even levels are left-to-right, odd levels are
 * right-to-left. Evaluates to true when the level is odd.
 */
#define IS_LEVEL_RTL(level) (((level) & 1) != 0)
64 
/**
 * Check whether two bidi levels have the same parity and thus the same
 * directionality.
 *
 * Both arguments are parenthesized in the expansion so that an argument
 * containing a lower-precedence operator (for example `a | b`) is treated
 * as a single operand of the XOR.
 */
#define IS_SAME_DIRECTION(level1, level2) ((((level1) ^ (level2)) & 1) == 0)
70 
/**
 * Convert from nsBidiLevel to nsBidiDirection: yields NSBIDI_RTL when
 * IS_LEVEL_RTL(level) holds and NSBIDI_LTR otherwise. NSBIDI_RTL and
 * NSBIDI_LTR are not defined in this header and must be visible wherever
 * the macro is used.
 */
#define DIRECTION_FROM_LEVEL(level) \
  (IS_LEVEL_RTL(level) ? NSBIDI_RTL : NSBIDI_LTR)
76 
/**
 * Definitions of bidirectional character types by category
 */
80 
// True for the two right-to-left character types: eCharType_RightToLeft
// and eCharType_RightToLeftArabic.
#define CHARTYPE_IS_RTL(val)         \
  ((val) == eCharType_RightToLeft || \
   (val) == eCharType_RightToLeftArabic)
83 
// True for eCharType_EuropeanNumberSeparator, for
// eCharType_EuropeanNumberTerminator, and for every type whose value is
// greater than eCharType_ArabicNumber except eCharType_RightToLeftArabic.
// NOTE: this is not the "weak" set of UAX #9. eCharType_EuropeanNumber and
// eCharType_ArabicNumber are not matched, while the separator, neutral and
// explicit embedding/override/isolate types are.
#define CHARTYPE_IS_WEAK(val)                    \
  ((val) == eCharType_EuropeanNumberSeparator || \
   (val) == eCharType_EuropeanNumberTerminator || \
   ((val) > eCharType_ArabicNumber &&            \
    (val) != eCharType_RightToLeftArabic))
89 
/**
 * Inspects a single Unichar and returns it, converting a number to its
 * Arabic or Hindi form as requested.
 *
 * @param aChar           the character to inspect
 * @param aPrevCharArabic true if the previous character in the string is an
 *                        Arabic char
 * @param aNumFlag        the conversion to perform:
 *          IBMBIDI_NUMERAL_NOMINAL:      don't do any conversion
 *          IBMBIDI_NUMERAL_HINDI:        convert to Hindi forms
 *                                        (Unicode 0660-0669)
 *          IBMBIDI_NUMERAL_ARABIC:       convert to Arabic forms
 *                                        (Unicode 0030-0039)
 *          IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to
 *                                        Hindi, otherwise to Arabic
 *        NOTE(review): this header also defines IBMBIDI_NUMERAL_REGULAR,
 *        IBMBIDI_NUMERAL_PERSIANCONTEXT and IBMBIDI_NUMERAL_PERSIAN; how
 *        they are handled is not visible from the declaration.
 * @return the converted Unichar
 */
char16_t HandleNumberInChar(char16_t aChar,
                            bool aPrevCharArabic,
                            uint32_t aNumFlag);
108 
109 /**
110  * Scan a Unichar string, converting numbers to Arabic or Hindi forms in
111  * place
112  * @param aBuffer is the string
113  * @param aSize is the size of aBuffer
114  * @param aNumFlag specifies the conversion to perform:
115  *        IBMBIDI_NUMERAL_NOMINAL:      don't do any conversion
116  *        IBMBIDI_NUMERAL_HINDI:        convert to Hindi forms
117  *                                        (Unicode 0660-0669)
118  *        IBMBIDI_NUMERAL_ARABIC:       convert to Arabic forms
119  *                                        (Unicode 0030-0039)
120  *        IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to
121  *                                      Hindi, otherwise to Arabic
122  */
123 nsresult HandleNumbers(char16_t* aBuffer, uint32_t aSize, uint32_t aNumFlag);
124 
/* Code points of the Unicode bidi control characters. */
#define LRM_CHAR 0x200e  // LEFT-TO-RIGHT MARK
#define RLM_CHAR 0x200f  // RIGHT-TO-LEFT MARK

#define LRE_CHAR 0x202a  // LEFT-TO-RIGHT EMBEDDING
#define RLE_CHAR 0x202b  // RIGHT-TO-LEFT EMBEDDING
#define PDF_CHAR 0x202c  // POP DIRECTIONAL FORMATTING
#define LRO_CHAR 0x202d  // LEFT-TO-RIGHT OVERRIDE
#define RLO_CHAR 0x202e  // RIGHT-TO-LEFT OVERRIDE

#define LRI_CHAR 0x2066  // LEFT-TO-RIGHT ISOLATE
#define RLI_CHAR 0x2067  // RIGHT-TO-LEFT ISOLATE
#define FSI_CHAR 0x2068  // FIRST STRONG ISOLATE
#define PDI_CHAR 0x2069  // POP DIRECTIONAL ISOLATE

#define ALM_CHAR 0x061C  // ARABIC LETTER MARK

/**
 * Given a UTF-32 codepoint, return true if the codepoint is a Bidi control
 * character (LRM, RLM, ALM; LRE, RLE, PDF, LRO, RLO; LRI, RLI, FSI, PDI).
 * Return false otherwise.
 *
 * LRM and RLM are compared for exact equality instead of through a bit
 * mask, so that values above 0xFFFFFF whose low bits happen to equal
 * U+200E/U+200F are not reported as bidi controls.
 *
 * @param aChar the codepoint to test
 * @return true if aChar is one of the twelve characters listed above
 */
inline bool IsBidiControl(uint32_t aChar) {
  return (LRE_CHAR <= aChar && aChar <= RLO_CHAR) ||
         (LRI_CHAR <= aChar && aChar <= PDI_CHAR) || aChar == ALM_CHAR ||
         aChar == LRM_CHAR || aChar == RLM_CHAR;
}
151 
152 /**
153  * Give a UTF-32 codepoint
154  * Return true if the codepoint is a Bidi control character that may result
155  * in RTL directionality and therefore needs to trigger bidi resolution;
156  * return false otherwise.
157  */
IsBidiControlRTL(uint32_t aChar)158 inline bool IsBidiControlRTL(uint32_t aChar) {
159   return aChar == RLM_CHAR || aChar == RLE_CHAR || aChar == RLO_CHAR ||
160          aChar == RLI_CHAR || aChar == ALM_CHAR;
161 }
162 
163 /**
164  * Give a 16-bit (UTF-16) text buffer
165  * @return true if the string contains right-to-left characters
166  */
HasRTLChars(mozilla::Span<const char16_t> aBuffer)167 inline bool HasRTLChars(mozilla::Span<const char16_t> aBuffer) {
168   // Span ensures we never pass a nullptr to Rust--even if the
169   // length of the buffer is zero.
170   return encoding_mem_is_utf16_bidi(aBuffer.Elements(), aBuffer.Length());
171 }
172 
// Bidi preference names and values.
//
// These values are shared with the Preferences dialog: if the pref values
// are changed in the XUL file of Prefs, they must be changed here too.
// A trailing '*' marks the value used by IBMBIDI_DEFAULT_BIDI_OPTIONS.

// Preference names
#define IBMBIDI_TEXTDIRECTION_STR "bidi.direction"
#define IBMBIDI_TEXTTYPE_STR "bidi.texttype"
#define IBMBIDI_NUMERAL_STR "bidi.numeral"

// Text direction (bidi.direction)
#define IBMBIDI_TEXTDIRECTION_LTR 1  // directionLTRBidi *
#define IBMBIDI_TEXTDIRECTION_RTL 2  // directionRTLBidi

// Text type (bidi.texttype)
#define IBMBIDI_TEXTTYPE_CHARSET 1  // charsettexttypeBidi *
#define IBMBIDI_TEXTTYPE_LOGICAL 2  // logicaltexttypeBidi
#define IBMBIDI_TEXTTYPE_VISUAL 3   // visualtexttypeBidi

// Numeral style (bidi.numeral)
#define IBMBIDI_NUMERAL_NOMINAL 0         // nominalnumeralBidi *
#define IBMBIDI_NUMERAL_REGULAR 1         // regularcontextnumeralBidi
#define IBMBIDI_NUMERAL_HINDICONTEXT 2    // hindicontextnumeralBidi
#define IBMBIDI_NUMERAL_ARABIC 3          // arabicnumeralBidi
#define IBMBIDI_NUMERAL_HINDI 4           // hindinumeralBidi
#define IBMBIDI_NUMERAL_PERSIANCONTEXT 5  // persiancontextnumeralBidi
#define IBMBIDI_NUMERAL_PERSIAN 6         // persiannumeralBidi
208 
// Default packed bidi options: LTR text direction in bits 0-3, charset text
// type in bits 4-7 and nominal numerals in bits 8-11.
#define IBMBIDI_DEFAULT_BIDI_OPTIONS   \
  ((IBMBIDI_TEXTDIRECTION_LTR) |       \
   (IBMBIDI_TEXTTYPE_CHARSET << 4) |   \
   (IBMBIDI_NUMERAL_NOMINAL << 8))
212 
// Extract one 4-bit field from a packed bidi options value:
//   bits 0-3  = DIRECTION, bits 4-7 = TEXTTYPE, bits 8-11 = NUMERAL.
#define GET_BIDI_OPTION_DIRECTION(bo) ((bo) & 0xF)
#define GET_BIDI_OPTION_TEXTTYPE(bo) (((bo) >> 4) & 0xF)
#define GET_BIDI_OPTION_NUMERAL(bo) (((bo) >> 8) & 0xF)
219 
// Overwrite one 4-bit field of the packed bidi options lvalue `bo` with the
// low four bits of the second argument, leaving the other fields alone.
// NOTE: each macro expands to a braced block, not an expression. It has no
// value, and `if (c) SET_BIDI_OPTION_x(bo, v); else ...` does not parse
// because the `;` ends the if statement.
#define SET_BIDI_OPTION_DIRECTION(bo, dir)      \
  {                                             \
    (bo) = ((bo) & 0xFFFFFFF0) | ((dir) & 0xF); \
  }
#define SET_BIDI_OPTION_TEXTTYPE(bo, tt)               \
  {                                                    \
    (bo) = ((bo) & 0xFFFFFF0F) | (((tt) & 0xF) << 4);  \
  }
#define SET_BIDI_OPTION_NUMERAL(bo, num)               \
  {                                                    \
    (bo) = ((bo) & 0xFFFFF0FF) | (((num) & 0xF) << 8); \
  }
226 
/*
 * Code point ranges of the three digit sets handled here, and membership
 * tests for each. In this file's naming, "Hindi" digits are U+0660-U+0669
 * (ARABIC-INDIC DIGIT ZERO..NINE), "Arabic" digits are the ASCII digits
 * U+0030-U+0039, and "Farsi" digits are U+06F0-U+06F9 (EXTENDED
 * ARABIC-INDIC DIGIT ZERO..NINE).
 */
#define START_HINDI_DIGITS 0x0660
#define END_HINDI_DIGITS 0x0669
#define START_ARABIC_DIGITS 0x0030
#define END_ARABIC_DIGITS 0x0039
#define START_FARSI_DIGITS 0x06f0
#define END_FARSI_DIGITS 0x06f9

#define IS_HINDI_DIGIT(u) \
  ((u) >= START_HINDI_DIGITS && (u) <= END_HINDI_DIGITS)
#define IS_ARABIC_DIGIT(u) \
  ((u) >= START_ARABIC_DIGITS && (u) <= END_ARABIC_DIGITS)
#define IS_FARSI_DIGIT(u) \
  ((u) >= START_FARSI_DIGITS && (u) <= END_FARSI_DIGITS)
/**
 * Arabic numeric separator and numeric formatting characters:
 *  U+0600;ARABIC NUMBER SIGN
 *  U+0601;ARABIC SIGN SANAH
 *  U+0602;ARABIC FOOTNOTE MARKER
 *  U+0603;ARABIC SIGN SAFHA
 *  U+066A;ARABIC PERCENT SIGN
 *  U+066B;ARABIC DECIMAL SEPARATOR
 *  U+066C;ARABIC THOUSANDS SEPARATOR
 *  U+06DD;ARABIC END OF AYAH
 *
 * The lower bound of the first range is checked explicitly, so that code
 * points below U+0600 (ASCII included) are not reported as separators when
 * the macro is used on its own.
 */
#define IS_ARABIC_SEPARATOR(u)         \
  (((u) >= 0x0600 && (u) <= 0x0603) || \
   ((u) >= 0x066A && (u) <= 0x066C) || \
   ((u) == 0x06DD))
254 
// True for the code points this file treats as bidi diacritics, listed as
// inclusive ranges: the first seven lines fall in the Hebrew block
// (U+0590-U+05FF), the remaining six in the Arabic block (U+0600-U+06FF).
#define IS_BIDI_DIACRITIC(u)            \
  (((u) >= 0x0591 && (u) <= 0x05A1) ||  \
   ((u) >= 0x05A3 && (u) <= 0x05B9) ||  \
   ((u) >= 0x05BB && (u) <= 0x05BD) ||  \
   (u) == 0x05BF ||                     \
   (u) == 0x05C1 ||                     \
   (u) == 0x05C2 ||                     \
   (u) == 0x05C4 ||                     \
   ((u) >= 0x064B && (u) <= 0x0652) ||  \
   (u) == 0x0670 ||                     \
   ((u) >= 0x06D7 && (u) <= 0x06E4) ||  \
   (u) == 0x06E7 ||                     \
   (u) == 0x06E8 ||                     \
   ((u) >= 0x06EA && (u) <= 0x06ED))
261 
// True for code points in U+0590-U+05FF or in U+FB1D-U+FB4F.
#define IS_HEBREW_CHAR(c)              \
  (((c) >= 0x0590 && (c) <= 0x05FF) || \
   ((c) >= 0xfb1d && (c) <= 0xfb4f))
// True for code points in U+0600-U+06FF, U+0750-U+077F or U+08A0-U+08FF.
// The gaps U+0700-U+074F and U+0780-U+089F are not matched.
#define IS_ARABIC_CHAR(c)              \
  (((c) >= 0x0600 && (c) <= 0x06ff) || \
   ((c) >= 0x0750 && (c) <= 0x077f) || \
   ((c) >= 0x08a0 && (c) <= 0x08FF))
// True for a code point accepted by IS_ARABIC_CHAR that is not a Hindi
// digit, a Farsi digit or an Arabic separator.
#define IS_ARABIC_ALPHABETIC(c)                   \
  (IS_ARABIC_CHAR(c) && !IS_HINDI_DIGIT(c) &&     \
   !IS_FARSI_DIGIT(c) && !IS_ARABIC_SEPARATOR(c))
270 
271   /**
272    * The codepoint ranges in the following macros are based on the blocks
273    *  allocated, or planned to be allocated, to right-to-left characters in the
274    *  BMP (Basic Multilingual Plane) and SMP (Supplementary Multilingual Plane)
275    *  according to
276    *  http://unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt and
277    *  http://www.unicode.org/roadmaps/
278    */
279 
// BMP range U+0590-U+08FF.
#define IS_IN_BMP_RTL_BLOCK(c) ((c) >= 0x590 && (c) <= 0x8ff)
// Presentation-form ranges U+FB1D-U+FDFF and U+FE70-U+FEFC.
#define IS_RTL_PRESENTATION_FORM(c)    \
  (((c) >= 0xfb1d && (c) <= 0xfdff) || \
   ((c) >= 0xfe70 && (c) <= 0xfefc))
// SMP ranges U+10800-U+10FFF and U+1E800-U+1EFFF.
#define IS_IN_SMP_RTL_BLOCK(c)           \
  (((c) >= 0x10800 && (c) <= 0x10fff) || \
   ((c) >= 0x1e800 && (c) <= 0x1eFFF))
// Per-code-unit RTL test for UTF-16 text.
//
// The supplementary-plane RTL blocks can be recognized from the high
// surrogate alone, without looking at the low surrogate, so it is correct
// to apply this check code unit by code unit to potentially astral text
// without decoding surrogate pairs into code points. The four surrogate
// values listed are the high surrogates of U+10800-U+10FFF (0xD802, 0xD803)
// and U+1E800-U+1EFFF (0xD83A, 0xD83B). One consequence: an unpaired high
// surrogate with one of these values counts as RTL, even though it would
// not be RTL if it were replaced by the REPLACEMENT CHARACTER.
#define UTF16_CODE_UNIT_IS_BIDI(c)                          \
  (IS_IN_BMP_RTL_BLOCK(c) || IS_RTL_PRESENTATION_FORM(c) || \
   (c) == 0xD802 || (c) == 0xD803 ||                        \
   (c) == 0xD83A || (c) == 0xD83B)
// Code point RTL test for UTF-32: true when the code point lies in the BMP
// RTL range, an RTL presentation-form range or an SMP RTL range.
#define UTF32_CHAR_IS_BIDI(c)                               \
  (IS_IN_BMP_RTL_BLOCK(c) || IS_RTL_PRESENTATION_FORM(c) || \
   IS_IN_SMP_RTL_BLOCK(c))
299 #endif /* nsBidiUtils_h__ */
300