1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "AccessibleWrap.h"
8 #include "nsString.h"
9 #include "nsMai.h"
10
11 /**
12 * ATK offsets are counted in unicode codepoints, while DOM offsets are counted
13 * in UTF-16 code units. That makes a difference for non-BMP characters,
14 * which need two UTF-16 code units to be represented (a pair of surrogates),
15 * while they are just one unicode character.
16 *
17 * To keep synchronization between ATK offsets (unicode codepoints) and DOM
18 * offsets (UTF-16 code units), after translation from UTF-16 to UTF-8 we add a
19 * BOM after each non-BMP character (which would otherwise use 2 UTF-16
20 * code units for only 1 unicode codepoint).
21 *
22 * BOMs (Byte Order Marks, U+FEFF, also known as ZERO WIDTH NO-BREAK SPACE, but
23 * that usage is deprecated) normally only appear at the beginning of unicode
24 * files, but their occurrence within text (notably after cut&paste) is not
25 * uncommon, and are thus considered as non-text.
26 *
 * Since the selection requested through ATK may not contain both surrogates
 * at the ends of the selection, we need to fetch one more UTF-16 code unit
 * on both sides, and get rid of them before returning the string to ATK.
 * The ATKStringConverterHelper class takes care of this; NewATKString should
 * be used to call it properly.
32 *
33 * In the end,
34 * - if the start is between the high and low surrogates, the UTF-8 result
35 * includes a BOM from it but not the character
36 * - if the end is between the high and low surrogates, the UTF-8 result
37 * includes the character but *not* the BOM
38 * - all non-BMP characters that are fully in the string are in the UTF-8 result
39 * as character followed by BOM
40 */
41 namespace mozilla {
42 namespace a11y {
43
44 namespace DOMtoATK {
45
46 /**
47 * Converts a string of accessible text into ATK gchar* string (by adding
48 * BOMs). This can be used when offsets do not need to be adjusted because
49 * ends of the string can not fall between surrogates.
50 */
51 gchar* Convert(const nsAString& aStr);
52
53 /**
54 * Add a BOM after each non-BMP character.
55 */
56 void AddBOMs(nsACString& aDest, const nsACString& aSource);
57
58 /**
59 * Replace all characters with asterisks (e.g. for password fields).
60 */
61 void ConvertTexttoAsterisks(nsAString& aString);
62
63 /**
64 * Parameterize conversion.
65 */
66 enum class AtkStringConvertFlags : uint32_t {
67 None = 0,
68 ConvertTextToAsterisks = 1 << 0,
69 };
70
MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(AtkStringConvertFlags)71 MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(AtkStringConvertFlags)
72
73 class ATKStringConverterHelper {
74 public:
75 ATKStringConverterHelper(void)
76 :
77 #ifdef DEBUG
78 mAdjusted(false),
79 #endif
80 mStartShifted(false),
81 mEndShifted(false) {
82 }
83
84 /**
85 * In order to properly get non-BMP values, offsets need to be changed
86 * to get one character more on each end, so that ConvertUTF16toUTF8 can
87 * convert surrogates even if the originally requested offsets fall between
88 * them.
89 */
90 void AdjustOffsets(gint* aStartOffset, gint* aEndOffset, gint count);
91
92 /**
93 * Converts a string of accessible text with adjusted offsets into ATK
94 * gchar* string (by adding BOMs). Note, AdjustOffsets has to be called
95 * before getting the text passed to this.
96 */
97 gchar* ConvertAdjusted(const nsAString& aStr);
98
99 private:
100 /**
101 * Remove the additional characters requested by PrepareUTF16toUTF8.
102 */
103 gchar* FinishUTF16toUTF8(nsCString& aStr);
104
105 #ifdef DEBUG
106 bool mAdjusted;
107 #endif
108 bool mStartShifted;
109 bool mEndShifted;
110 };
111
112 /**
113 * Get text from aAccessible, using ATKStringConverterHelper to properly
114 * introduce appropriate BOMs.
115 */
116 template <class Accessible>
NewATKString(Accessible * aAccessible,gint aStartOffset,gint aEndOffset,AtkStringConvertFlags aFlags)117 gchar* NewATKString(Accessible* aAccessible, gint aStartOffset, gint aEndOffset,
118 AtkStringConvertFlags aFlags) {
119 gint startOffset = aStartOffset, endOffset = aEndOffset;
120 ATKStringConverterHelper converter;
121 converter.AdjustOffsets(&startOffset, &endOffset,
122 gint(aAccessible->CharacterCount()));
123 nsAutoString str;
124 aAccessible->TextSubstring(startOffset, endOffset, str);
125
126 if (str.Length() == 0) {
127 // Bogus offsets, or empty string, either way we do not need conversion.
128 return g_strdup("");
129 }
130
131 if (aFlags & AtkStringConvertFlags::ConvertTextToAsterisks) {
132 ConvertTexttoAsterisks(str);
133 }
134 return converter.ConvertAdjusted(str);
135 }
136
137 /**
138 * Get a character from aAccessible, fetching more data as appropriate to
139 * properly get non-BMP characters or a BOM as appropriate.
140 */
141 template <class AccessibleCharAt>
ATKCharacter(AccessibleCharAt * aAccessible,gint aOffset)142 gunichar ATKCharacter(AccessibleCharAt* aAccessible, gint aOffset) {
143 // char16_t is unsigned short in Mozilla, gnuichar is guint32 in glib.
144 gunichar character = static_cast<gunichar>(aAccessible->CharAt(aOffset));
145
146 if (NS_IS_LOW_SURROGATE(character)) {
147 // Trailing surrogate, return BOM instead.
148 return 0xFEFF;
149 }
150
151 if (NS_IS_HIGH_SURROGATE(character)) {
152 // Heading surrogate, get the trailing surrogate and combine them.
153 gunichar characterLow =
154 static_cast<gunichar>(aAccessible->CharAt(aOffset + 1));
155
156 if (!NS_IS_LOW_SURROGATE(characterLow)) {
157 // It should have been a trailing surrogate... Flag the error.
158 return 0xFFFD;
159 }
160 return SURROGATE_TO_UCS4(character, characterLow);
161 }
162
163 return character;
164 }
165
166 } // namespace DOMtoATK
167
168 } // namespace a11y
169 } // namespace mozilla
170