1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/rtl.h"
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 
10 #include <algorithm>
11 
12 #include "base/command_line.h"
13 #include "base/files/file_path.h"
14 #include "base/i18n/base_i18n_switches.h"
15 #include "base/logging.h"
16 #include "base/stl_util.h"
17 #include "base/strings/string_split.h"
18 #include "base/strings/string_util.h"
19 #include "base/strings/sys_string_conversions.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "build/build_config.h"
22 #include "third_party/icu/source/common/unicode/locid.h"
23 #include "third_party/icu/source/common/unicode/uchar.h"
24 #include "third_party/icu/source/common/unicode/uscript.h"
25 #include "third_party/icu/source/i18n/unicode/coll.h"
26 
27 #if defined(OS_IOS)
28 #include "base/debug/crash_logging.h"
29 #include "base/ios/ios_util.h"
30 #endif
31 
32 namespace {
33 
34 // Extract language, country and variant, but ignore keywords.  For example,
35 // en-US, ca@valencia, ca-ES@valencia.
GetLocaleString(const icu::Locale & locale)36 std::string GetLocaleString(const icu::Locale& locale) {
37   const char* language = locale.getLanguage();
38   const char* country = locale.getCountry();
39   const char* variant = locale.getVariant();
40 
41   std::string result =
42       (language != nullptr && *language != '\0') ? language : "und";
43 
44   if (country != nullptr && *country != '\0') {
45     result += '-';
46     result += country;
47   }
48 
49   if (variant != nullptr && *variant != '\0')
50     result += '@' + base::ToLowerASCII(variant);
51 
52   return result;
53 }
54 
55 // Returns LEFT_TO_RIGHT or RIGHT_TO_LEFT if |character| has strong
56 // directionality, returns UNKNOWN_DIRECTION if it doesn't. Please refer to
57 // http://unicode.org/reports/tr9/ for more information.
GetCharacterDirection(UChar32 character)58 base::i18n::TextDirection GetCharacterDirection(UChar32 character) {
59   static bool has_switch = base::CommandLine::ForCurrentProcess()->HasSwitch(
60       switches::kForceTextDirection);
61   if (has_switch) {
62     base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
63     std::string force_flag =
64         command_line->GetSwitchValueASCII(switches::kForceTextDirection);
65 
66     if (force_flag == switches::kForceDirectionRTL)
67       return base::i18n::RIGHT_TO_LEFT;
68     if (force_flag == switches::kForceDirectionLTR)
69       return base::i18n::LEFT_TO_RIGHT;
70   }
71   // Now that we have the character, we use ICU in order to query for the
72   // appropriate Unicode BiDi character type.
73   int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
74   switch (property) {
75     case U_RIGHT_TO_LEFT:
76     case U_RIGHT_TO_LEFT_ARABIC:
77     case U_RIGHT_TO_LEFT_EMBEDDING:
78     case U_RIGHT_TO_LEFT_OVERRIDE:
79       return base::i18n::RIGHT_TO_LEFT;
80     case U_LEFT_TO_RIGHT:
81     case U_LEFT_TO_RIGHT_EMBEDDING:
82     case U_LEFT_TO_RIGHT_OVERRIDE:
83       return base::i18n::LEFT_TO_RIGHT;
84   }
85   return base::i18n::UNKNOWN_DIRECTION;
86 }
87 
88 }  // namespace
89 
90 namespace base {
91 namespace i18n {
92 
93 // Represents the locale-specific ICU text direction.
94 static TextDirection g_icu_text_direction = UNKNOWN_DIRECTION;
95 
96 // Convert the ICU default locale to a string.
GetConfiguredLocale()97 std::string GetConfiguredLocale() {
98   return GetLocaleString(icu::Locale::getDefault());
99 }
100 
101 // Convert the ICU canonicalized locale to a string.
GetCanonicalLocale(const std::string & locale)102 std::string GetCanonicalLocale(const std::string& locale) {
103   return GetLocaleString(icu::Locale::createCanonical(locale.c_str()));
104 }
105 
106 // Convert Chrome locale name to ICU locale name
ICULocaleName(const std::string & locale_string)107 std::string ICULocaleName(const std::string& locale_string) {
108   // If not Spanish, just return it.
109   if (locale_string.substr(0, 2) != "es")
110     return locale_string;
111   // Expand es to es-ES.
112   if (LowerCaseEqualsASCII(locale_string, "es"))
113     return "es-ES";
114   // Map es-419 (Latin American Spanish) to es-FOO depending on the system
115   // locale.  If it's es-RR other than es-ES, map to es-RR. Otherwise, map
116   // to es-MX (the most populous in Spanish-speaking Latin America).
117   if (LowerCaseEqualsASCII(locale_string, "es-419")) {
118     const icu::Locale& locale = icu::Locale::getDefault();
119     std::string language = locale.getLanguage();
120     const char* country = locale.getCountry();
121     if (LowerCaseEqualsASCII(language, "es") &&
122       !LowerCaseEqualsASCII(country, "es")) {
123         language += '-';
124         language += country;
125         return language;
126     }
127     return "es-MX";
128   }
129   // Currently, Chrome has only "es" and "es-419", but later we may have
130   // more specific "es-RR".
131   return locale_string;
132 }
133 
SetICUDefaultLocale(const std::string & locale_string)134 void SetICUDefaultLocale(const std::string& locale_string) {
135 #if defined(OS_IOS)
136   static base::debug::CrashKeyString* crash_key_locale =
137       base::debug::AllocateCrashKeyString("icu_locale_input",
138                                           base::debug::CrashKeySize::Size256);
139   base::debug::SetCrashKeyString(crash_key_locale, locale_string);
140 #endif
141   icu::Locale locale(ICULocaleName(locale_string).c_str());
142   UErrorCode error_code = U_ZERO_ERROR;
143   const char* lang = locale.getLanguage();
144   if (lang != nullptr && *lang != '\0') {
145     icu::Locale::setDefault(locale, error_code);
146   } else {
147     LOG(ERROR) << "Failed to set the ICU default locale to " << locale_string
148                << ". Falling back to en-US.";
149     icu::Locale::setDefault(icu::Locale::getUS(), error_code);
150   }
151   g_icu_text_direction = UNKNOWN_DIRECTION;
152 }
153 
IsRTL()154 bool IsRTL() {
155   return ICUIsRTL();
156 }
157 
SetRTLForTesting(bool rtl)158 void SetRTLForTesting(bool rtl) {
159   SetICUDefaultLocale(rtl ? "he" : "en");
160   DCHECK_EQ(rtl, IsRTL());
161 }
162 
ICUIsRTL()163 bool ICUIsRTL() {
164   if (g_icu_text_direction == UNKNOWN_DIRECTION) {
165     const icu::Locale& locale = icu::Locale::getDefault();
166     g_icu_text_direction = GetTextDirectionForLocaleInStartUp(locale.getName());
167   }
168   return g_icu_text_direction == RIGHT_TO_LEFT;
169 }
170 
GetForcedTextDirection()171 TextDirection GetForcedTextDirection() {
172 // On iOS, check for RTL forcing.
173 #if defined(OS_IOS)
174   if (base::ios::IsInForcedRTL())
175     return base::i18n::RIGHT_TO_LEFT;
176 #endif
177 
178   base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
179   if (command_line->HasSwitch(switches::kForceUIDirection)) {
180     std::string force_flag =
181         command_line->GetSwitchValueASCII(switches::kForceUIDirection);
182 
183     if (force_flag == switches::kForceDirectionLTR)
184       return base::i18n::LEFT_TO_RIGHT;
185 
186     if (force_flag == switches::kForceDirectionRTL)
187       return base::i18n::RIGHT_TO_LEFT;
188   }
189 
190   return base::i18n::UNKNOWN_DIRECTION;
191 }
192 
GetTextDirectionForLocaleInStartUp(const char * locale_name)193 TextDirection GetTextDirectionForLocaleInStartUp(const char* locale_name) {
194   // Check for direction forcing.
195   TextDirection forced_direction = GetForcedTextDirection();
196   if (forced_direction != UNKNOWN_DIRECTION)
197     return forced_direction;
198 
199   // This list needs to be updated in alphabetical order if we add more RTL
200   // locales.
201   static const char kRTLLanguageCodes[][3] = {"ar", "fa", "he", "iw", "ur"};
202   std::vector<StringPiece> locale_split =
203       SplitStringPiece(locale_name, "-_", KEEP_WHITESPACE, SPLIT_WANT_ALL);
204   const StringPiece& language_code = locale_split[0];
205   if (std::binary_search(kRTLLanguageCodes,
206                          kRTLLanguageCodes + base::size(kRTLLanguageCodes),
207                          language_code))
208     return RIGHT_TO_LEFT;
209   return LEFT_TO_RIGHT;
210 }
211 
GetTextDirectionForLocale(const char * locale_name)212 TextDirection GetTextDirectionForLocale(const char* locale_name) {
213   // Check for direction forcing.
214   TextDirection forced_direction = GetForcedTextDirection();
215   if (forced_direction != UNKNOWN_DIRECTION)
216     return forced_direction;
217 
218   UErrorCode status = U_ZERO_ERROR;
219   ULayoutType layout_dir = uloc_getCharacterOrientation(locale_name, &status);
220   DCHECK(U_SUCCESS(status));
221   // Treat anything other than RTL as LTR.
222   return (layout_dir != ULOC_LAYOUT_RTL) ? LEFT_TO_RIGHT : RIGHT_TO_LEFT;
223 }
224 
GetFirstStrongCharacterDirection(const string16 & text)225 TextDirection GetFirstStrongCharacterDirection(const string16& text) {
226   const UChar* string = text.c_str();
227   size_t length = text.length();
228   size_t position = 0;
229   while (position < length) {
230     UChar32 character;
231     size_t next_position = position;
232     U16_NEXT(string, next_position, length, character);
233     TextDirection direction = GetCharacterDirection(character);
234     if (direction != UNKNOWN_DIRECTION)
235       return direction;
236     position = next_position;
237   }
238   return LEFT_TO_RIGHT;
239 }
240 
GetLastStrongCharacterDirection(const string16 & text)241 TextDirection GetLastStrongCharacterDirection(const string16& text) {
242   const UChar* string = text.c_str();
243   size_t position = text.length();
244   while (position > 0) {
245     UChar32 character;
246     size_t prev_position = position;
247     U16_PREV(string, 0, prev_position, character);
248     TextDirection direction = GetCharacterDirection(character);
249     if (direction != UNKNOWN_DIRECTION)
250       return direction;
251     position = prev_position;
252   }
253   return LEFT_TO_RIGHT;
254 }
255 
GetStringDirection(const string16 & text)256 TextDirection GetStringDirection(const string16& text) {
257   const UChar* string = text.c_str();
258   size_t length = text.length();
259   size_t position = 0;
260 
261   TextDirection result(UNKNOWN_DIRECTION);
262   while (position < length) {
263     UChar32 character;
264     size_t next_position = position;
265     U16_NEXT(string, next_position, length, character);
266     TextDirection direction = GetCharacterDirection(character);
267     if (direction != UNKNOWN_DIRECTION) {
268       if (result != UNKNOWN_DIRECTION && result != direction)
269         return UNKNOWN_DIRECTION;
270       result = direction;
271     }
272     position = next_position;
273   }
274 
275   // Handle the case of a string not containing any strong directionality
276   // characters defaulting to LEFT_TO_RIGHT.
277   if (result == UNKNOWN_DIRECTION)
278     return LEFT_TO_RIGHT;
279 
280   return result;
281 }
282 
283 #if defined(OS_WIN)
AdjustStringForLocaleDirection(string16 * text)284 bool AdjustStringForLocaleDirection(string16* text) {
285   if (!IsRTL() || text->empty())
286     return false;
287 
288   // Marking the string as LTR if the locale is RTL and the string does not
289   // contain strong RTL characters. Otherwise, mark the string as RTL.
290   bool has_rtl_chars = StringContainsStrongRTLChars(*text);
291   if (!has_rtl_chars)
292     WrapStringWithLTRFormatting(text);
293   else
294     WrapStringWithRTLFormatting(text);
295 
296   return true;
297 }
298 
UnadjustStringForLocaleDirection(string16 * text)299 bool UnadjustStringForLocaleDirection(string16* text) {
300   if (!IsRTL() || text->empty())
301     return false;
302 
303   *text = StripWrappingBidiControlCharacters(*text);
304   return true;
305 }
306 #else
AdjustStringForLocaleDirection(string16 * text)307 bool AdjustStringForLocaleDirection(string16* text) {
308   // On OS X & GTK the directionality of a label is determined by the first
309   // strongly directional character.
310   // However, we want to make sure that in an LTR-language-UI all strings are
311   // left aligned and vice versa.
312   // A problem can arise if we display a string which starts with user input.
313   // User input may be of the opposite directionality to the UI. So the whole
314   // string will be displayed in the opposite directionality, e.g. if we want to
315   // display in an LTR UI [such as US English]:
316   //
317   // EMAN_NOISNETXE is now installed.
318   //
319   // Since EXTENSION_NAME begins with a strong RTL char, the label's
320   // directionality will be set to RTL and the string will be displayed visually
321   // as:
322   //
323   // .is now installed EMAN_NOISNETXE
324   //
325   // In order to solve this issue, we prepend an LRM to the string. An LRM is a
326   // strongly directional LTR char.
327   // We also append an LRM at the end, which ensures that we're in an LTR
328   // context.
329 
330   // Unlike Windows, Linux and OS X can correctly display RTL glyphs out of the
331   // box so there is no issue with displaying zero-width bidi control characters
332   // on any system.  Thus no need for the !IsRTL() check here.
333   if (text->empty())
334     return false;
335 
336   bool ui_direction_is_rtl = IsRTL();
337 
338   bool has_rtl_chars = StringContainsStrongRTLChars(*text);
339   if (!ui_direction_is_rtl && has_rtl_chars) {
340     WrapStringWithRTLFormatting(text);
341     text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
342                  kLeftToRightMark);
343     text->push_back(kLeftToRightMark);
344   } else if (ui_direction_is_rtl && has_rtl_chars) {
345     WrapStringWithRTLFormatting(text);
346     text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
347                  kRightToLeftMark);
348     text->push_back(kRightToLeftMark);
349   } else if (ui_direction_is_rtl) {
350     WrapStringWithLTRFormatting(text);
351     text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
352                  kRightToLeftMark);
353     text->push_back(kRightToLeftMark);
354   } else {
355     return false;
356   }
357 
358   return true;
359 }
360 
UnadjustStringForLocaleDirection(string16 * text)361 bool UnadjustStringForLocaleDirection(string16* text) {
362   if (text->empty())
363     return false;
364 
365   size_t begin_index = 0;
366   char16 begin = text->at(begin_index);
367   if (begin == kLeftToRightMark ||
368       begin == kRightToLeftMark) {
369     ++begin_index;
370   }
371 
372   size_t end_index = text->length() - 1;
373   char16 end = text->at(end_index);
374   if (end == kLeftToRightMark ||
375       end == kRightToLeftMark) {
376     --end_index;
377   }
378 
379   string16 unmarked_text =
380       text->substr(begin_index, end_index - begin_index + 1);
381   *text = StripWrappingBidiControlCharacters(unmarked_text);
382   return true;
383 }
384 
385 #endif  // !OS_WIN
386 
EnsureTerminatedDirectionalFormatting(string16 * text)387 void EnsureTerminatedDirectionalFormatting(string16* text) {
388   int count = 0;
389   for (auto c : *text) {
390     if (c == kLeftToRightEmbeddingMark || c == kRightToLeftEmbeddingMark ||
391         c == kLeftToRightOverride || c == kRightToLeftOverride) {
392       ++count;
393     } else if (c == kPopDirectionalFormatting && count > 0) {
394       --count;
395     }
396   }
397   for (int j = 0; j < count; j++)
398     text->push_back(kPopDirectionalFormatting);
399 }
400 
SanitizeUserSuppliedString(string16 * text)401 void SanitizeUserSuppliedString(string16* text) {
402   EnsureTerminatedDirectionalFormatting(text);
403   AdjustStringForLocaleDirection(text);
404 }
405 
StringContainsStrongRTLChars(const string16 & text)406 bool StringContainsStrongRTLChars(const string16& text) {
407   const UChar* string = text.c_str();
408   size_t length = text.length();
409   size_t position = 0;
410   while (position < length) {
411     UChar32 character;
412     size_t next_position = position;
413     U16_NEXT(string, next_position, length, character);
414 
415     // Now that we have the character, we use ICU in order to query for the
416     // appropriate Unicode BiDi character type.
417     int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
418     if ((property == U_RIGHT_TO_LEFT) || (property == U_RIGHT_TO_LEFT_ARABIC))
419       return true;
420 
421     position = next_position;
422   }
423 
424   return false;
425 }
426 
WrapStringWithLTRFormatting(string16 * text)427 void WrapStringWithLTRFormatting(string16* text) {
428   if (text->empty())
429     return;
430 
431   // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
432   text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
433                kLeftToRightEmbeddingMark);
434 
435   // Inserting a PDF (Pop Directional Formatting) mark as the last character.
436   text->push_back(kPopDirectionalFormatting);
437 }
438 
WrapStringWithRTLFormatting(string16 * text)439 void WrapStringWithRTLFormatting(string16* text) {
440   if (text->empty())
441     return;
442 
443   // Inserting an RLE (Right-To-Left Embedding) mark as the first character.
444   text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
445                kRightToLeftEmbeddingMark);
446 
447   // Inserting a PDF (Pop Directional Formatting) mark as the last character.
448   text->push_back(kPopDirectionalFormatting);
449 }
450 
WrapPathWithLTRFormatting(const FilePath & path,string16 * rtl_safe_path)451 void WrapPathWithLTRFormatting(const FilePath& path,
452                                string16* rtl_safe_path) {
453   // Wrap the overall path with LRE-PDF pair which essentialy marks the
454   // string as a Left-To-Right string.
455   // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
456   rtl_safe_path->push_back(kLeftToRightEmbeddingMark);
457 #if defined(OS_APPLE)
458   rtl_safe_path->append(UTF8ToUTF16(path.value()));
459 #elif defined(OS_WIN)
460   rtl_safe_path->append(AsString16(path.value()));
461 #else  // defined(OS_POSIX) && !defined(OS_APPLE)
462   std::wstring wide_path = base::SysNativeMBToWide(path.value());
463   rtl_safe_path->append(WideToUTF16(wide_path));
464 #endif
465   // Inserting a PDF (Pop Directional Formatting) mark as the last character.
466   rtl_safe_path->push_back(kPopDirectionalFormatting);
467 }
468 
// Returns |text| wrapped with LTR formatting when the UI is RTL (the string
// may be appended to an RTL string) or when the first strong character of
// |text| is RTL. Otherwise returns |text| unchanged.
string16 GetDisplayStringInLTRDirectionality(const string16& text) {
  const bool needs_wrapping =
      IsRTL() || GetFirstStrongCharacterDirection(text) == RIGHT_TO_LEFT;
  if (!needs_wrapping)
    return text;

  string16 wrapped(text);
  WrapStringWithLTRFormatting(&wrapped);
  return wrapped;
}
479 
// Returns a copy of |text| without a leading LRE/RLE/LRO/RLO character (if
// the first character is one) and without a trailing PDF character (if the
// last character is one). An empty |text| is returned as is.
string16 StripWrappingBidiControlCharacters(const string16& text) {
  if (text.empty())
    return text;

  const char16 head = text.front();
  const bool has_leading_control =
      head == kLeftToRightEmbeddingMark || head == kRightToLeftEmbeddingMark ||
      head == kLeftToRightOverride || head == kRightToLeftOverride;
  const size_t first = has_leading_control ? 1 : 0;

  size_t last = text.length() - 1;
  if (text.back() == kPopDirectionalFormatting)
    --last;

  // Note: for a one-character string that is itself a control character the
  // unsigned arithmetic wraps and the length below evaluates to zero.
  return text.substr(first, last - first + 1);
}
495 
496 }  // namespace i18n
497 }  // namespace base
498