1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "AccessibleWrap.h"
8 #include "nsString.h"
9 #include "nsMai.h"
10 
11 /**
12  * ATK offsets are counted in unicode codepoints, while DOM offsets are counted
13  * in UTF-16 code units.  That makes a difference for non-BMP characters,
14  * which need two UTF-16 code units to be represented (a pair of surrogates),
15  * while they are just one unicode character.
16  *
17  * To keep synchronization between ATK offsets (unicode codepoints) and DOM
18  * offsets (UTF-16 code units), after translation from UTF-16 to UTF-8 we add a
19  * BOM after each non-BMP character (which would otherwise use 2 UTF-16
20  * code units for only 1 unicode codepoint).
21  *
 * BOMs (Byte Order Marks, U+FEFF, also known as ZERO WIDTH NO-BREAK SPACE, but
 * that usage is deprecated) normally only appear at the beginning of unicode
 * files, but their occurrence within text (notably after cut&paste) is not
 * uncommon, and they are thus considered as non-text.
26  *
 * Since the selection requested through ATK may not contain both surrogates
 * at the ends of the selection, we need to fetch one more UTF-16 code unit
 * on each side, and get rid of it before returning the string to ATK. The
 * ATKStringConverterHelper class takes care of this; NewATKString should be
 * used to call it properly.
32  *
33  * In the end,
34  * - if the start is between the high and low surrogates, the UTF-8 result
35  * includes a BOM from it but not the character
36  * - if the end is between the high and low surrogates, the UTF-8 result
37  * includes the character but *not* the BOM
38  * - all non-BMP characters that are fully in the string are in the UTF-8 result
39  * as character followed by BOM
40  */
41 namespace mozilla {
42 namespace a11y {
43 
44 namespace DOMtoATK {
45 
/**
 * Converts a string of accessible text into an ATK gchar* string (by adding
 * BOMs). This can be used when offsets do not need to be adjusted because
 * the ends of the string cannot fall between surrogates.
 *
 * @param aStr the accessible text to convert.
 * @return the converted string.
 *         NOTE(review): presumably a newly allocated UTF-8 string that the
 *         caller must release with g_free() -- the definition is not in this
 *         header, confirm against it.
 */
gchar* Convert(const nsAString& aStr);
52 
/**
 * Add a BOM after each non-BMP character.
 *
 * @param aDest   receives the text with the BOMs added.
 * @param aSource the text to copy from.
 *                NOTE(review): both strings are presumably UTF-8, since the
 *                BOMs are described as being added after the UTF-16 to UTF-8
 *                translation -- confirm against the definition.
 */
void AddBOMs(nsACString& aDest, const nsACString& aSource);
57 
/**
 * Replace all characters with asterisks (e.g. for password fields).
 *
 * @param aString the text to mask; it is taken by non-const reference and
 *                rewritten in place, nothing is returned.
 */
void ConvertTexttoAsterisks(nsAString& aString);
62 
/**
 * Bit flags selecting optional processing steps of the text conversion
 * (see NewATKString).
 */
enum class AtkStringConvertFlags : uint32_t {
  // No optional processing.
  None = 0x0,
  // Bit 0: pass the text through ConvertTexttoAsterisks() before converting
  // it.
  ConvertTextToAsterisks = 0x1,
};
70 
// Provide the bitwise operators for AtkStringConvertFlags, so that flag values
// can be combined and tested (NewATKString below tests a flag with `&`).
MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(AtkStringConvertFlags)
72 
73 class ATKStringConverterHelper {
74  public:
75   ATKStringConverterHelper(void)
76       :
77 #ifdef DEBUG
78         mAdjusted(false),
79 #endif
80         mStartShifted(false),
81         mEndShifted(false) {
82   }
83 
84   /**
85    * In order to properly get non-BMP values, offsets need to be changed
86    * to get one character more on each end, so that ConvertUTF16toUTF8 can
87    * convert surrogates even if the originally requested offsets fall between
88    * them.
89    */
90   void AdjustOffsets(gint* aStartOffset, gint* aEndOffset, gint count);
91 
92   /**
93    * Converts a string of accessible text with adjusted offsets into ATK
94    * gchar* string (by adding BOMs).  Note, AdjustOffsets has to be called
95    * before getting the text passed to this.
96    */
97   gchar* ConvertAdjusted(const nsAString& aStr);
98 
99  private:
100   /**
101    * Remove the additional characters requested by PrepareUTF16toUTF8.
102    */
103   gchar* FinishUTF16toUTF8(nsCString& aStr);
104 
105 #ifdef DEBUG
106   bool mAdjusted;
107 #endif
108   bool mStartShifted;
109   bool mEndShifted;
110 };
111 
112 /**
113  * Get text from aAccessible, using ATKStringConverterHelper to properly
114  * introduce appropriate BOMs.
115  */
116 template <class Accessible>
NewATKString(Accessible * aAccessible,gint aStartOffset,gint aEndOffset,AtkStringConvertFlags aFlags)117 gchar* NewATKString(Accessible* aAccessible, gint aStartOffset, gint aEndOffset,
118                     AtkStringConvertFlags aFlags) {
119   gint startOffset = aStartOffset, endOffset = aEndOffset;
120   ATKStringConverterHelper converter;
121   converter.AdjustOffsets(&startOffset, &endOffset,
122                           gint(aAccessible->CharacterCount()));
123   nsAutoString str;
124   aAccessible->TextSubstring(startOffset, endOffset, str);
125 
126   if (str.Length() == 0) {
127     // Bogus offsets, or empty string, either way we do not need conversion.
128     return g_strdup("");
129   }
130 
131   if (aFlags & AtkStringConvertFlags::ConvertTextToAsterisks) {
132     ConvertTexttoAsterisks(str);
133   }
134   return converter.ConvertAdjusted(str);
135 }
136 
137 /**
138  * Get a character from aAccessible, fetching more data as appropriate to
139  * properly get non-BMP characters or a BOM as appropriate.
140  */
141 template <class AccessibleCharAt>
ATKCharacter(AccessibleCharAt * aAccessible,gint aOffset)142 gunichar ATKCharacter(AccessibleCharAt* aAccessible, gint aOffset) {
143   // char16_t is unsigned short in Mozilla, gnuichar is guint32 in glib.
144   gunichar character = static_cast<gunichar>(aAccessible->CharAt(aOffset));
145 
146   if (NS_IS_LOW_SURROGATE(character)) {
147     // Trailing surrogate, return BOM instead.
148     return 0xFEFF;
149   }
150 
151   if (NS_IS_HIGH_SURROGATE(character)) {
152     // Heading surrogate, get the trailing surrogate and combine them.
153     gunichar characterLow =
154         static_cast<gunichar>(aAccessible->CharAt(aOffset + 1));
155 
156     if (!NS_IS_LOW_SURROGATE(characterLow)) {
157       // It should have been a trailing surrogate... Flag the error.
158       return 0xFFFD;
159     }
160     return SURROGATE_TO_UCS4(character, characterLow);
161   }
162 
163   return character;
164 }
165 
166 }  // namespace DOMtoATK
167 
168 }  // namespace a11y
169 }  // namespace mozilla
170