1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/i18n/rtl.h"
6
7 #include <stddef.h>
8 #include <stdint.h>
9
10 #include <algorithm>
11
12 #include "base/command_line.h"
13 #include "base/files/file_path.h"
14 #include "base/i18n/base_i18n_switches.h"
15 #include "base/logging.h"
16 #include "base/stl_util.h"
17 #include "base/strings/string_split.h"
18 #include "base/strings/string_util.h"
19 #include "base/strings/sys_string_conversions.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "build/build_config.h"
22 #include "third_party/icu/source/common/unicode/locid.h"
23 #include "third_party/icu/source/common/unicode/uchar.h"
24 #include "third_party/icu/source/common/unicode/uscript.h"
25 #include "third_party/icu/source/i18n/unicode/coll.h"
26
27 #if defined(OS_IOS)
28 #include "base/debug/crash_logging.h"
29 #include "base/ios/ios_util.h"
30 #endif
31
32 namespace {
33
34 // Extract language, country and variant, but ignore keywords. For example,
35 // en-US, ca@valencia, ca-ES@valencia.
GetLocaleString(const icu::Locale & locale)36 std::string GetLocaleString(const icu::Locale& locale) {
37 const char* language = locale.getLanguage();
38 const char* country = locale.getCountry();
39 const char* variant = locale.getVariant();
40
41 std::string result =
42 (language != nullptr && *language != '\0') ? language : "und";
43
44 if (country != nullptr && *country != '\0') {
45 result += '-';
46 result += country;
47 }
48
49 if (variant != nullptr && *variant != '\0')
50 result += '@' + base::ToLowerASCII(variant);
51
52 return result;
53 }
54
55 // Returns LEFT_TO_RIGHT or RIGHT_TO_LEFT if |character| has strong
56 // directionality, returns UNKNOWN_DIRECTION if it doesn't. Please refer to
57 // http://unicode.org/reports/tr9/ for more information.
GetCharacterDirection(UChar32 character)58 base::i18n::TextDirection GetCharacterDirection(UChar32 character) {
59 static bool has_switch = base::CommandLine::ForCurrentProcess()->HasSwitch(
60 switches::kForceTextDirection);
61 if (has_switch) {
62 base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
63 std::string force_flag =
64 command_line->GetSwitchValueASCII(switches::kForceTextDirection);
65
66 if (force_flag == switches::kForceDirectionRTL)
67 return base::i18n::RIGHT_TO_LEFT;
68 if (force_flag == switches::kForceDirectionLTR)
69 return base::i18n::LEFT_TO_RIGHT;
70 }
71 // Now that we have the character, we use ICU in order to query for the
72 // appropriate Unicode BiDi character type.
73 int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
74 switch (property) {
75 case U_RIGHT_TO_LEFT:
76 case U_RIGHT_TO_LEFT_ARABIC:
77 case U_RIGHT_TO_LEFT_EMBEDDING:
78 case U_RIGHT_TO_LEFT_OVERRIDE:
79 return base::i18n::RIGHT_TO_LEFT;
80 case U_LEFT_TO_RIGHT:
81 case U_LEFT_TO_RIGHT_EMBEDDING:
82 case U_LEFT_TO_RIGHT_OVERRIDE:
83 return base::i18n::LEFT_TO_RIGHT;
84 }
85 return base::i18n::UNKNOWN_DIRECTION;
86 }
87
88 } // namespace
89
90 namespace base {
91 namespace i18n {
92
93 // Represents the locale-specific ICU text direction.
94 static TextDirection g_icu_text_direction = UNKNOWN_DIRECTION;
95
96 // Convert the ICU default locale to a string.
GetConfiguredLocale()97 std::string GetConfiguredLocale() {
98 return GetLocaleString(icu::Locale::getDefault());
99 }
100
101 // Convert the ICU canonicalized locale to a string.
GetCanonicalLocale(const std::string & locale)102 std::string GetCanonicalLocale(const std::string& locale) {
103 return GetLocaleString(icu::Locale::createCanonical(locale.c_str()));
104 }
105
106 // Convert Chrome locale name to ICU locale name
ICULocaleName(const std::string & locale_string)107 std::string ICULocaleName(const std::string& locale_string) {
108 // If not Spanish, just return it.
109 if (locale_string.substr(0, 2) != "es")
110 return locale_string;
111 // Expand es to es-ES.
112 if (LowerCaseEqualsASCII(locale_string, "es"))
113 return "es-ES";
114 // Map es-419 (Latin American Spanish) to es-FOO depending on the system
115 // locale. If it's es-RR other than es-ES, map to es-RR. Otherwise, map
116 // to es-MX (the most populous in Spanish-speaking Latin America).
117 if (LowerCaseEqualsASCII(locale_string, "es-419")) {
118 const icu::Locale& locale = icu::Locale::getDefault();
119 std::string language = locale.getLanguage();
120 const char* country = locale.getCountry();
121 if (LowerCaseEqualsASCII(language, "es") &&
122 !LowerCaseEqualsASCII(country, "es")) {
123 language += '-';
124 language += country;
125 return language;
126 }
127 return "es-MX";
128 }
129 // Currently, Chrome has only "es" and "es-419", but later we may have
130 // more specific "es-RR".
131 return locale_string;
132 }
133
SetICUDefaultLocale(const std::string & locale_string)134 void SetICUDefaultLocale(const std::string& locale_string) {
135 #if defined(OS_IOS)
136 static base::debug::CrashKeyString* crash_key_locale =
137 base::debug::AllocateCrashKeyString("icu_locale_input",
138 base::debug::CrashKeySize::Size256);
139 base::debug::SetCrashKeyString(crash_key_locale, locale_string);
140 #endif
141 icu::Locale locale(ICULocaleName(locale_string).c_str());
142 UErrorCode error_code = U_ZERO_ERROR;
143 const char* lang = locale.getLanguage();
144 if (lang != nullptr && *lang != '\0') {
145 icu::Locale::setDefault(locale, error_code);
146 } else {
147 LOG(ERROR) << "Failed to set the ICU default locale to " << locale_string
148 << ". Falling back to en-US.";
149 icu::Locale::setDefault(icu::Locale::getUS(), error_code);
150 }
151 g_icu_text_direction = UNKNOWN_DIRECTION;
152 }
153
IsRTL()154 bool IsRTL() {
155 return ICUIsRTL();
156 }
157
SetRTLForTesting(bool rtl)158 void SetRTLForTesting(bool rtl) {
159 SetICUDefaultLocale(rtl ? "he" : "en");
160 DCHECK_EQ(rtl, IsRTL());
161 }
162
ICUIsRTL()163 bool ICUIsRTL() {
164 if (g_icu_text_direction == UNKNOWN_DIRECTION) {
165 const icu::Locale& locale = icu::Locale::getDefault();
166 g_icu_text_direction = GetTextDirectionForLocaleInStartUp(locale.getName());
167 }
168 return g_icu_text_direction == RIGHT_TO_LEFT;
169 }
170
GetForcedTextDirection()171 TextDirection GetForcedTextDirection() {
172 // On iOS, check for RTL forcing.
173 #if defined(OS_IOS)
174 if (base::ios::IsInForcedRTL())
175 return base::i18n::RIGHT_TO_LEFT;
176 #endif
177
178 base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
179 if (command_line->HasSwitch(switches::kForceUIDirection)) {
180 std::string force_flag =
181 command_line->GetSwitchValueASCII(switches::kForceUIDirection);
182
183 if (force_flag == switches::kForceDirectionLTR)
184 return base::i18n::LEFT_TO_RIGHT;
185
186 if (force_flag == switches::kForceDirectionRTL)
187 return base::i18n::RIGHT_TO_LEFT;
188 }
189
190 return base::i18n::UNKNOWN_DIRECTION;
191 }
192
GetTextDirectionForLocaleInStartUp(const char * locale_name)193 TextDirection GetTextDirectionForLocaleInStartUp(const char* locale_name) {
194 // Check for direction forcing.
195 TextDirection forced_direction = GetForcedTextDirection();
196 if (forced_direction != UNKNOWN_DIRECTION)
197 return forced_direction;
198
199 // This list needs to be updated in alphabetical order if we add more RTL
200 // locales.
201 static const char kRTLLanguageCodes[][3] = {"ar", "fa", "he", "iw", "ur"};
202 std::vector<StringPiece> locale_split =
203 SplitStringPiece(locale_name, "-_", KEEP_WHITESPACE, SPLIT_WANT_ALL);
204 const StringPiece& language_code = locale_split[0];
205 if (std::binary_search(kRTLLanguageCodes,
206 kRTLLanguageCodes + base::size(kRTLLanguageCodes),
207 language_code))
208 return RIGHT_TO_LEFT;
209 return LEFT_TO_RIGHT;
210 }
211
GetTextDirectionForLocale(const char * locale_name)212 TextDirection GetTextDirectionForLocale(const char* locale_name) {
213 // Check for direction forcing.
214 TextDirection forced_direction = GetForcedTextDirection();
215 if (forced_direction != UNKNOWN_DIRECTION)
216 return forced_direction;
217
218 UErrorCode status = U_ZERO_ERROR;
219 ULayoutType layout_dir = uloc_getCharacterOrientation(locale_name, &status);
220 DCHECK(U_SUCCESS(status));
221 // Treat anything other than RTL as LTR.
222 return (layout_dir != ULOC_LAYOUT_RTL) ? LEFT_TO_RIGHT : RIGHT_TO_LEFT;
223 }
224
GetFirstStrongCharacterDirection(const string16 & text)225 TextDirection GetFirstStrongCharacterDirection(const string16& text) {
226 const UChar* string = text.c_str();
227 size_t length = text.length();
228 size_t position = 0;
229 while (position < length) {
230 UChar32 character;
231 size_t next_position = position;
232 U16_NEXT(string, next_position, length, character);
233 TextDirection direction = GetCharacterDirection(character);
234 if (direction != UNKNOWN_DIRECTION)
235 return direction;
236 position = next_position;
237 }
238 return LEFT_TO_RIGHT;
239 }
240
GetLastStrongCharacterDirection(const string16 & text)241 TextDirection GetLastStrongCharacterDirection(const string16& text) {
242 const UChar* string = text.c_str();
243 size_t position = text.length();
244 while (position > 0) {
245 UChar32 character;
246 size_t prev_position = position;
247 U16_PREV(string, 0, prev_position, character);
248 TextDirection direction = GetCharacterDirection(character);
249 if (direction != UNKNOWN_DIRECTION)
250 return direction;
251 position = prev_position;
252 }
253 return LEFT_TO_RIGHT;
254 }
255
GetStringDirection(const string16 & text)256 TextDirection GetStringDirection(const string16& text) {
257 const UChar* string = text.c_str();
258 size_t length = text.length();
259 size_t position = 0;
260
261 TextDirection result(UNKNOWN_DIRECTION);
262 while (position < length) {
263 UChar32 character;
264 size_t next_position = position;
265 U16_NEXT(string, next_position, length, character);
266 TextDirection direction = GetCharacterDirection(character);
267 if (direction != UNKNOWN_DIRECTION) {
268 if (result != UNKNOWN_DIRECTION && result != direction)
269 return UNKNOWN_DIRECTION;
270 result = direction;
271 }
272 position = next_position;
273 }
274
275 // Handle the case of a string not containing any strong directionality
276 // characters defaulting to LEFT_TO_RIGHT.
277 if (result == UNKNOWN_DIRECTION)
278 return LEFT_TO_RIGHT;
279
280 return result;
281 }
282
283 #if defined(OS_WIN)
AdjustStringForLocaleDirection(string16 * text)284 bool AdjustStringForLocaleDirection(string16* text) {
285 if (!IsRTL() || text->empty())
286 return false;
287
288 // Marking the string as LTR if the locale is RTL and the string does not
289 // contain strong RTL characters. Otherwise, mark the string as RTL.
290 bool has_rtl_chars = StringContainsStrongRTLChars(*text);
291 if (!has_rtl_chars)
292 WrapStringWithLTRFormatting(text);
293 else
294 WrapStringWithRTLFormatting(text);
295
296 return true;
297 }
298
UnadjustStringForLocaleDirection(string16 * text)299 bool UnadjustStringForLocaleDirection(string16* text) {
300 if (!IsRTL() || text->empty())
301 return false;
302
303 *text = StripWrappingBidiControlCharacters(*text);
304 return true;
305 }
306 #else
AdjustStringForLocaleDirection(string16 * text)307 bool AdjustStringForLocaleDirection(string16* text) {
308 // On OS X & GTK the directionality of a label is determined by the first
309 // strongly directional character.
310 // However, we want to make sure that in an LTR-language-UI all strings are
311 // left aligned and vice versa.
312 // A problem can arise if we display a string which starts with user input.
313 // User input may be of the opposite directionality to the UI. So the whole
314 // string will be displayed in the opposite directionality, e.g. if we want to
315 // display in an LTR UI [such as US English]:
316 //
317 // EMAN_NOISNETXE is now installed.
318 //
319 // Since EXTENSION_NAME begins with a strong RTL char, the label's
320 // directionality will be set to RTL and the string will be displayed visually
321 // as:
322 //
323 // .is now installed EMAN_NOISNETXE
324 //
325 // In order to solve this issue, we prepend an LRM to the string. An LRM is a
326 // strongly directional LTR char.
327 // We also append an LRM at the end, which ensures that we're in an LTR
328 // context.
329
330 // Unlike Windows, Linux and OS X can correctly display RTL glyphs out of the
331 // box so there is no issue with displaying zero-width bidi control characters
332 // on any system. Thus no need for the !IsRTL() check here.
333 if (text->empty())
334 return false;
335
336 bool ui_direction_is_rtl = IsRTL();
337
338 bool has_rtl_chars = StringContainsStrongRTLChars(*text);
339 if (!ui_direction_is_rtl && has_rtl_chars) {
340 WrapStringWithRTLFormatting(text);
341 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
342 kLeftToRightMark);
343 text->push_back(kLeftToRightMark);
344 } else if (ui_direction_is_rtl && has_rtl_chars) {
345 WrapStringWithRTLFormatting(text);
346 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
347 kRightToLeftMark);
348 text->push_back(kRightToLeftMark);
349 } else if (ui_direction_is_rtl) {
350 WrapStringWithLTRFormatting(text);
351 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
352 kRightToLeftMark);
353 text->push_back(kRightToLeftMark);
354 } else {
355 return false;
356 }
357
358 return true;
359 }
360
UnadjustStringForLocaleDirection(string16 * text)361 bool UnadjustStringForLocaleDirection(string16* text) {
362 if (text->empty())
363 return false;
364
365 size_t begin_index = 0;
366 char16 begin = text->at(begin_index);
367 if (begin == kLeftToRightMark ||
368 begin == kRightToLeftMark) {
369 ++begin_index;
370 }
371
372 size_t end_index = text->length() - 1;
373 char16 end = text->at(end_index);
374 if (end == kLeftToRightMark ||
375 end == kRightToLeftMark) {
376 --end_index;
377 }
378
379 string16 unmarked_text =
380 text->substr(begin_index, end_index - begin_index + 1);
381 *text = StripWrappingBidiControlCharacters(unmarked_text);
382 return true;
383 }
384
385 #endif // !OS_WIN
386
EnsureTerminatedDirectionalFormatting(string16 * text)387 void EnsureTerminatedDirectionalFormatting(string16* text) {
388 int count = 0;
389 for (auto c : *text) {
390 if (c == kLeftToRightEmbeddingMark || c == kRightToLeftEmbeddingMark ||
391 c == kLeftToRightOverride || c == kRightToLeftOverride) {
392 ++count;
393 } else if (c == kPopDirectionalFormatting && count > 0) {
394 --count;
395 }
396 }
397 for (int j = 0; j < count; j++)
398 text->push_back(kPopDirectionalFormatting);
399 }
400
SanitizeUserSuppliedString(string16 * text)401 void SanitizeUserSuppliedString(string16* text) {
402 EnsureTerminatedDirectionalFormatting(text);
403 AdjustStringForLocaleDirection(text);
404 }
405
StringContainsStrongRTLChars(const string16 & text)406 bool StringContainsStrongRTLChars(const string16& text) {
407 const UChar* string = text.c_str();
408 size_t length = text.length();
409 size_t position = 0;
410 while (position < length) {
411 UChar32 character;
412 size_t next_position = position;
413 U16_NEXT(string, next_position, length, character);
414
415 // Now that we have the character, we use ICU in order to query for the
416 // appropriate Unicode BiDi character type.
417 int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
418 if ((property == U_RIGHT_TO_LEFT) || (property == U_RIGHT_TO_LEFT_ARABIC))
419 return true;
420
421 position = next_position;
422 }
423
424 return false;
425 }
426
WrapStringWithLTRFormatting(string16 * text)427 void WrapStringWithLTRFormatting(string16* text) {
428 if (text->empty())
429 return;
430
431 // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
432 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
433 kLeftToRightEmbeddingMark);
434
435 // Inserting a PDF (Pop Directional Formatting) mark as the last character.
436 text->push_back(kPopDirectionalFormatting);
437 }
438
WrapStringWithRTLFormatting(string16 * text)439 void WrapStringWithRTLFormatting(string16* text) {
440 if (text->empty())
441 return;
442
443 // Inserting an RLE (Right-To-Left Embedding) mark as the first character.
444 text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
445 kRightToLeftEmbeddingMark);
446
447 // Inserting a PDF (Pop Directional Formatting) mark as the last character.
448 text->push_back(kPopDirectionalFormatting);
449 }
450
WrapPathWithLTRFormatting(const FilePath & path,string16 * rtl_safe_path)451 void WrapPathWithLTRFormatting(const FilePath& path,
452 string16* rtl_safe_path) {
453 // Wrap the overall path with LRE-PDF pair which essentialy marks the
454 // string as a Left-To-Right string.
455 // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
456 rtl_safe_path->push_back(kLeftToRightEmbeddingMark);
457 #if defined(OS_APPLE)
458 rtl_safe_path->append(UTF8ToUTF16(path.value()));
459 #elif defined(OS_WIN)
460 rtl_safe_path->append(AsString16(path.value()));
461 #else // defined(OS_POSIX) && !defined(OS_APPLE)
462 std::wstring wide_path = base::SysNativeMBToWide(path.value());
463 rtl_safe_path->append(WideToUTF16(wide_path));
464 #endif
465 // Inserting a PDF (Pop Directional Formatting) mark as the last character.
466 rtl_safe_path->push_back(kPopDirectionalFormatting);
467 }
468
// Returns |text| prepared for display as a Left-To-Right string. A copy
// wrapped with LTR formatting is returned when
//   * the UI is RTL (the string may be appended to an RTL string), or
//   * the first strong character of |text| is RTL (even in an LTR UI).
// Otherwise |text| is returned as is.
string16 GetDisplayStringInLTRDirectionality(const string16& text) {
  const bool needs_wrapping =
      IsRTL() || GetFirstStrongCharacterDirection(text) == RIGHT_TO_LEFT;
  if (!needs_wrapping)
    return text;

  string16 wrapped(text);
  WrapStringWithLTRFormatting(&wrapped);
  return wrapped;
}
479
// Returns a copy of |text| without a leading embedding/override mark (LRE,
// RLE, LRO or RLO) and without a trailing PDF (Pop Directional Formatting)
// mark. The two ends are examined independently, so a string carrying only one
// of them still has that one removed. At most one character is removed from
// each end.
string16 StripWrappingBidiControlCharacters(const string16& text) {
  if (text.empty())
    return text;

  const char16 head = text.front();
  const bool has_opening_mark =
      head == kLeftToRightEmbeddingMark || head == kRightToLeftEmbeddingMark ||
      head == kLeftToRightOverride || head == kRightToLeftOverride;

  // Half-open range [first, past_last) of the characters to keep. A single
  // character cannot be both an opening mark and a PDF, so |first| never
  // exceeds |past_last|.
  const size_t first = has_opening_mark ? 1 : 0;
  const size_t past_last = text.back() == kPopDirectionalFormatting
                               ? text.length() - 1
                               : text.length();
  return text.substr(first, past_last - first);
}
495
496 } // namespace i18n
497 } // namespace base
498