1 /*
2  * Copyright 2010-2014 OpenXcom Developers.
3  *
4  * This file is part of OpenXcom.
5  *
6  * OpenXcom is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * OpenXcom is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with OpenXcom.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 #include "Language.h"
20 #include <algorithm>
21 #include <assert.h>
22 #include <locale>
23 #include <fstream>
24 #include <cassert>
25 #include "CrossPlatform.h"
26 #include "Logger.h"
27 #include "Exception.h"
28 #include "Options.h"
29 #include "LanguagePlurality.h"
30 #include "../Ruleset/ExtraStrings.h"
31 #include "../Interface/TextList.h"
32 #ifdef _WIN32
33 #ifndef NOMINMAX
34 #define NOMINMAX
35 #endif
36 #define WIN32_LEAN_AND_MEAN
37 #include <windows.h>
38 #endif
39 
40 namespace OpenXcom
41 {
42 
43 std::map<std::string, std::wstring> Language::_names;
44 std::vector<std::string> Language::_rtl, Language::_cjk;
45 
46 /**
47  * Initializes an empty language file.
48  */
Language()49 Language::Language() : _id(""), _strings(), _handler(0), _direction(DIRECTION_LTR), _wrap(WRAP_WORDS)
50 {
51 	// maps don't have initializers :(
52 	if (_names.empty())
53 	{
54 		_names["en-US"] = utf8ToWstr("English (US)");
55 		_names["en-GB"] = utf8ToWstr("English (UK)");
56 		_names["bg-BG"] = utf8ToWstr("Български");
57 		_names["cs-CZ"] = utf8ToWstr("Česky");
58 		_names["da"] = utf8ToWstr("Dansk");
59 		_names["de"] = utf8ToWstr("Deutsch");
60 		_names["es"] = utf8ToWstr("Español (ES)");
61 		_names["es-419"] = utf8ToWstr("Español (AL)");
62 		_names["fr"] = utf8ToWstr("Français");
63 		_names["fi"] = utf8ToWstr("Suomi");
64 		_names["grk"] = utf8ToWstr("Ελληνικά");
65 		_names["hu-HU"] = utf8ToWstr("Magyar");
66 		_names["it"] = utf8ToWstr("Italiano");
67 		_names["ja-JP"] = utf8ToWstr("日本語");
68 		_names["ko"] = utf8ToWstr("한국어");
69 		_names["nl"] = utf8ToWstr("Nederlands");
70 		_names["no"] = utf8ToWstr("Norsk");
71 		_names["pl-PL"] = utf8ToWstr("Polski");
72 		_names["pt-BR"] = utf8ToWstr("Português (BR)");
73 		_names["pt-PT"] = utf8ToWstr("Português (PT)");
74 		_names["ro"] = utf8ToWstr("Română");
75 		_names["ru"] = utf8ToWstr("Русский");
76 		_names["sk-SK"] = utf8ToWstr("Slovenčina");
77 		_names["sv"] = utf8ToWstr("Svenska");
78 		_names["tr-TR"] = utf8ToWstr("Türkçe");
79 		_names["uk"] = utf8ToWstr("Українська");
80 		_names["zh-CN"] = utf8ToWstr("中文");
81 		_names["zh-TW"] = utf8ToWstr("文言");
82 	}
83 	if (_rtl.empty())
84 	{
85 		_rtl.push_back("he");
86 	}
87 	if (_cjk.empty())
88 	{
89 		_cjk.push_back("ja-JP");
90 		//_cjk.push_back("ko");  has spacing between words
91 		_cjk.push_back("zh-CN");
92 		_cjk.push_back("zh-TW");
93 	}
94 }
95 
96 /**
97  *
98  */
~Language()99 Language::~Language()
100 {
101 	delete _handler;
102 }
103 
104 /**
105  * Takes a wide-character string and converts it
106  * to a 8-bit string encoded in UTF-8.
107  * @note Adapted from http://stackoverflow.com/questions/148403/utf8-to-from-wide-char-conversion-in-stl
108  * @param src Wide-character string.
109  * @return UTF-8 string.
110  */
wstrToUtf8(const std::wstring & src)111 std::string Language::wstrToUtf8(const std::wstring& src)
112 {
113 	if (src.empty())
114 		return "";
115 #ifdef _WIN32
116 	int size = WideCharToMultiByte(CP_UTF8, 0, &src[0], (int)src.size(), NULL, 0, NULL, NULL);
117     std::string str(size, 0);
118 	WideCharToMultiByte(CP_UTF8, 0, &src[0], (int)src.size(), &str[0], size, NULL, NULL);
119 	return str;
120 #else
121 	std::string out;
122     unsigned int codepoint = 0;
123     for (std::wstring::const_iterator i = src.begin(); i != src.end(); ++i)
124     {
125 		wchar_t ch = *i;
126         if (ch >= 0xd800 && ch <= 0xdbff)
127             codepoint = ((ch - 0xd800) << 10) + 0x10000;
128         else
129         {
130             if (ch >= 0xdc00 && ch <= 0xdfff)
131                 codepoint |= ch - 0xdc00;
132             else
133                 codepoint = ch;
134 
135             if (codepoint <= 0x7f)
136                 out.append(1, static_cast<char>(codepoint));
137             else if (codepoint <= 0x7ff)
138             {
139                 out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
140                 out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
141             }
142             else if (codepoint <= 0xffff)
143             {
144                 out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
145                 out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
146                 out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
147             }
148             else
149             {
150                 out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
151                 out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
152                 out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
153                 out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
154             }
155             codepoint = 0;
156         }
157     }
158     return out;
159 #endif
160 }
161 
162 /**
163  * Takes a wide-character string and converts it to an
164  * 8-bit string encoded in the current system codepage.
165  * @param src Wide-character string.
166  * @return Codepage string.
167  */
wstrToCp(const std::wstring & src)168 std::string Language::wstrToCp(const std::wstring& src)
169 {
170 	if (src.empty())
171 		return "";
172 #ifdef _WIN32
173 	int size = WideCharToMultiByte(CP_ACP, 0, &src[0], (int)src.size(), NULL, 0, NULL, NULL);
174 	std::string str(size, 0);
175 	WideCharToMultiByte(CP_ACP, 0, &src[0], (int)src.size(), &str[0], size, NULL, NULL);
176 	return str;
177 #else
178 	const int MAX = 500;
179 	char buffer[MAX];
180 	setlocale(LC_ALL, "");
181 	wcstombs(buffer, src.c_str(), MAX);
182 	setlocale(LC_ALL, "C");
183 	std::string str(buffer);
184 	return str;
185 #endif
186 }
187 
188 /**
189  * Takes a wide-character string and converts it to an
190  * 8-bit string with the filesystem encoding.
191  * @param src Wide-character string.
192  * @return Filesystem string.
193  */
wstrToFs(const std::wstring & src)194 std::string Language::wstrToFs(const std::wstring& src)
195 {
196 #ifdef _WIN32
197 	return Language::wstrToCp(src);
198 #else
199 	return Language::wstrToUtf8(src);
200 #endif
201 }
202 
203 /**
204  * Takes an 8-bit string encoded in UTF-8 and converts it
205  * to a wide-character string.
206  * @note Adapted from http://stackoverflow.com/questions/148403/utf8-to-from-wide-char-conversion-in-stl
207  * @param src UTF-8 string.
208  * @return Wide-character string.
209  */
utf8ToWstr(const std::string & src)210 std::wstring Language::utf8ToWstr(const std::string& src)
211 {
212 	if (src.empty())
213 		return L"";
214 #ifdef _WIN32
215 	int size = MultiByteToWideChar(CP_UTF8, 0, &src[0], (int)src.size(), NULL, 0);
216     std::wstring wstr(size, 0);
217     MultiByteToWideChar(CP_UTF8, 0, &src[0], (int)src.size(), &wstr[0], size);
218 	return wstr;
219 #else
220 	std::wstring out;
221     unsigned int codepoint = 0;
222     int following = 0;
223     for (std::string::const_iterator i = src.begin(); i != src.end(); ++i)
224     {
225         unsigned char ch = *i;
226         if (ch <= 0x7f)
227         {
228             codepoint = ch;
229             following = 0;
230         }
231         else if (ch <= 0xbf)
232         {
233             if (following > 0)
234             {
235                 codepoint = (codepoint << 6) | (ch & 0x3f);
236                 --following;
237             }
238         }
239         else if (ch <= 0xdf)
240         {
241             codepoint = ch & 0x1f;
242             following = 1;
243         }
244         else if (ch <= 0xef)
245         {
246             codepoint = ch & 0x0f;
247             following = 2;
248         }
249         else
250         {
251             codepoint = ch & 0x07;
252             following = 3;
253         }
254         if (following == 0)
255         {
256             if (codepoint > 0xffff)
257             {
258                 out.append(1, static_cast<wchar_t>(0xd800 + (codepoint >> 10)));
259                 out.append(1, static_cast<wchar_t>(0xdc00 + (codepoint & 0x03ff)));
260             }
261             else
262                 out.append(1, static_cast<wchar_t>(codepoint));
263             codepoint = 0;
264         }
265     }
266     return out;
267 #endif
268 }
269 
270 /**
271  * Takes an 8-bit string encoded in the current system codepage
272  * and converts it to a wide-character string.
273  * @param src Codepage string.
274  * @return Wide-character string.
275  */
cpToWstr(const std::string & src)276 std::wstring Language::cpToWstr(const std::string& src)
277 {
278 	if (src.empty())
279 		return L"";
280 #ifdef _WIN32
281 	int size = MultiByteToWideChar(CP_ACP, 0, &src[0], (int)src.size(), NULL, 0);
282     std::wstring wstr(size, 0);
283     MultiByteToWideChar(CP_ACP, 0, &src[0], (int)src.size(), &wstr[0], size);
284 	return wstr;
285 #else
286 	const int MAX = 500;
287 	wchar_t buffer[MAX + 1];
288 	setlocale(LC_ALL, "");
289 	size_t len = mbstowcs(buffer, src.c_str(), MAX);
290 	setlocale(LC_ALL, "C");
291 	if (len == (size_t)-1)
292 		return L"?";
293 	return std::wstring(buffer, len);
294 #endif
295 }
296 
297 /**
298  * Takes an 8-bit string with the filesystem encoding
299  * and converts it to a wide-character string.
300  * @param src Filesystem string.
301  * @return Wide-character string.
302  */
fsToWstr(const std::string & src)303 std::wstring Language::fsToWstr(const std::string& src)
304 {
305 #ifdef _WIN32
306 	return Language::cpToWstr(src);
307 #else
308 	return Language::utf8ToWstr(src);
309 #endif
310 }
311 
312 /**
313  * Replaces every instance of a substring.
314  * @param str The string to modify.
315  * @param find The substring to find.
316  * @param replace The substring to replace it with.
317  */
replace(std::string & str,const std::string & find,const std::string & replace)318 void Language::replace(std::string &str, const std::string &find, const std::string &replace)
319 {
320 	for (size_t i = str.find(find); i != std::string::npos; i = str.find(find, i + replace.length()))
321 	{
322 		str.replace(i, find.length(), replace);
323 	}
324 }
325 
326 /**
327  * Replaces every instance of a substring.
328  * @param str The string to modify.
329  * @param find The substring to find.
330  * @param replace The substring to replace it with.
331  */
replace(std::wstring & str,const std::wstring & find,const std::wstring & replace)332 void Language::replace(std::wstring &str, const std::wstring &find, const std::wstring &replace)
333 {
334 	for (size_t i = str.find(find); i != std::wstring::npos; i = str.find(find, i + replace.length()))
335 	{
336 		str.replace(i, find.length(), replace);
337 	}
338 }
339 
340 /**
341  * Gets all the languages found in the
342  * Data folder and returns their properties.
343  * @param files List of language filenames.
344  * @param names List of language human-readable names.
345  */
getList(std::vector<std::string> & files,std::vector<std::wstring> & names)346 void Language::getList(std::vector<std::string> &files, std::vector<std::wstring> &names)
347 {
348 	files = CrossPlatform::getFolderContents(CrossPlatform::getDataFolder("Language/"), "yml");
349 	names.clear();
350 
351 	for (std::vector<std::string>::iterator i = files.begin(); i != files.end(); ++i)
352 	{
353 		*i = CrossPlatform::noExt(*i);
354 		std::wstring name;
355 		std::map<std::string, std::wstring>::iterator lang = _names.find(*i);
356 		if (lang != _names.end())
357 		{
358 			name = lang->second;
359 		}
360 		else
361 		{
362 			name = Language::fsToWstr(*i);
363 		}
364 		names.push_back(name);
365 	}
366 }
367 
368 /**
369  * Loads a language file in Ruby-on-Rails YAML format.
370  * Not that this has anything to do with Ruby, but since it's a
371  * widely-supported format and we already have YAML, it was convenient.
372  * @param filename Filename of the YAML file.
373  * @param extras Pointer to extra strings from ruleset.
374  */
load(const std::string & filename,ExtraStrings * extras)375 void Language::load(const std::string &filename, ExtraStrings *extras)
376 {
377 	_strings.clear();
378 
379 	YAML::Node doc = YAML::LoadFile(filename);
380 	_id = doc.begin()->first.as<std::string>();
381 	YAML::Node lang = doc.begin()->second;
382 	for (YAML::const_iterator i = lang.begin(); i != lang.end(); ++i)
383 	{
384 		// Regular strings
385 		if (i->second.IsScalar())
386 		{
387 			_strings[i->first.as<std::string>()] = loadString(i->second.as<std::string>());
388 		}
389 		// Strings with plurality
390 		else if (i->second.IsMap())
391 		{
392 			for (YAML::const_iterator j = i->second.begin(); j != i->second.end(); ++j)
393 			{
394 				std::string s = i->first.as<std::string>() + "_" + j->first.as<std::string>();
395 				_strings[s] = loadString(j->second.as<std::string>());
396 			}
397 		}
398 	}
399 	if (extras)
400 	{
401 		for (std::map<std::string, std::string>::const_iterator i = extras->getStrings()->begin(); i != extras->getStrings()->end(); ++i)
402 		{
403 			_strings[i->first] = loadString(i->second);
404 		}
405 	}
406 	delete _handler;
407 	_handler = LanguagePlurality::create(_id);
408 	if (std::find(_rtl.begin(), _rtl.end(), _id) == _rtl.end())
409 	{
410 		_direction = DIRECTION_LTR;
411 	}
412 	else
413 	{
414 		_direction = DIRECTION_RTL;
415 	}
416 	if (std::find(_cjk.begin(), _cjk.end(), _id) == _cjk.end())
417 	{
418 		_wrap = WRAP_WORDS;
419 	}
420 	else
421 	{
422 		_wrap = WRAP_LETTERS;
423 	}
424 }
425 
426 /**
427 * Replaces all special string markers with the approriate characters
428 * and converts the string encoding.
429 * @param string Original UTF-8 string.
430 * @return New widechar string.
431 */
loadString(const std::string & string) const432 std::wstring Language::loadString(const std::string &string) const
433 {
434 	std::string s = string;
435 	replace(s, "{NEWLINE}", "\n");
436 	replace(s, "{SMALLLINE}", "\x02");
437 	replace(s, "{ALT}", "\x01");
438 	return utf8ToWstr(s);
439 }
440 
441 /**
442  * Returns the language's locale.
443  * @return IANA language tag.
444  */
getId() const445 std::string Language::getId() const
446 {
447 	return _id;
448 }
449 
450 /**
451  * Returns the language's name in its native language.
452  * @return Language name.
453  */
getName() const454 std::wstring Language::getName() const
455 {
456 	return _names[_id];
457 }
458 
459 /**
460  * Returns the localized text with the specified ID.
461  * If it's not found, just returns the ID.
462  * @param id ID of the string.
463  * @return String with the requested ID.
464  */
getString(const std::string & id) const465 const LocalizedText &Language::getString(const std::string &id) const
466 {
467 	static LocalizedText hack(L"");
468 	if (id.empty())
469 		return hack;
470 	std::map<std::string, LocalizedText>::const_iterator s = _strings.find(id);
471 	if (s == _strings.end())
472 	{
473 		Log(LOG_WARNING) << id << " not found in " << Options::language;
474 		hack = LocalizedText(utf8ToWstr(id));
475 		return hack;
476 	}
477 	else
478 	{
479 		return s->second;
480 	}
481 }
482 
483 /**
484  * Returns the localized text with the specified ID, in the proper form for @a n.
485  * The substitution of @a n has already happened in the returned LocalizedText.
486  * If it's not found, just returns the ID.
487  * @param id ID of the string.
488  * @param n Number to use to decide the proper form.
489  * @return String with the requested ID.
490  */
getString(const std::string & id,unsigned n) const491 LocalizedText Language::getString(const std::string &id, unsigned n) const
492 {
493 	assert(!id.empty());
494 	std::map<std::string, LocalizedText>::const_iterator s = _strings.end();
495 	if (0 == n)
496 	{
497 		// Try specialized form.
498 		s = _strings.find(id + "_zero");
499 	}
500 	if (s == _strings.end())
501 	{
502 		// Try proper form by language
503 		s = _strings.find(id + _handler->getSuffix(n));
504 	}
505 	if (s == _strings.end())
506 	{
507 		Log(LOG_WARNING) << id << " not found in " << Options::language;
508 		return LocalizedText(utf8ToWstr(id));
509 	}
510 	std::wostringstream ss;
511 	ss << n;
512 	std::wstring marker(L"{N}"), val(ss.str()), txt(s->second);
513 	replace(txt, marker, val);
514 	return txt;
515 }
516 
517 /**
518  * Returns the localized text with the specified ID, in the proper form for the gender.
519  * If it's not found, just returns the ID.
520  * @param id ID of the string.
521  * @param gender Current soldier gender.
522  * @return String with the requested ID.
523  */
getString(const std::string & id,SoldierGender gender) const524 const LocalizedText &Language::getString(const std::string &id, SoldierGender gender) const
525 {
526 	std::string genderId;
527 	if (gender == GENDER_MALE)
528 	{
529 		genderId = id + "_MALE";
530 	}
531 	else
532 	{
533 		genderId = id + "_FEMALE";
534 	}
535 	return getString(genderId);
536 }
537 
538 /**
539  * Outputs all the language IDs and strings
540  * to an HTML table.
541  * @param filename HTML file.
542  */
toHtml(const std::string & filename) const543 void Language::toHtml(const std::string &filename) const
544 {
545 	std::ofstream htmlFile (filename.c_str(), std::ios::out);
546 	htmlFile << "<table border=\"1\" width=\"100%\">" << std::endl;
547 	htmlFile << "<tr><th>ID String</th><th>English String</th></tr>" << std::endl;
548 	for (std::map<std::string, LocalizedText>::const_iterator i = _strings.begin(); i != _strings.end(); ++i)
549 	{
550 		htmlFile << "<tr><td>" << i->first << "</td><td>";
551 		std::string s = wstrToUtf8(i->second);
552 		for (std::string::const_iterator j = s.begin(); j != s.end(); ++j)
553 		{
554 			if (*j == 2 || *j == '\n')
555 			{
556 				htmlFile << "<br />";
557 			}
558 			else
559 			{
560 				htmlFile << *j;
561 			}
562 		}
563 		htmlFile << "</td></tr>" << std::endl;
564 	}
565 	htmlFile << "</table>" << std::endl;
566 	htmlFile.close();
567 }
568 
569 /**
570  * Returns the direction to use for rendering
571  * text in this language.
572  * @return Text direction.
573  */
getTextDirection() const574 TextDirection Language::getTextDirection() const
575 {
576 	return _direction;
577 }
578 
579 /**
580  * Returns the wrapping rules to use for rendering
581  * text in this language.
582  * @return Text wrapping.
583  */
getTextWrapping() const584 TextWrapping Language::getTextWrapping() const
585 {
586 	return _wrap;
587 }
588 
589 }
590 
591 /** @page LanguageFiles Format of the language files.
592 
593 Language files are formatted as YAML (.yml) containing UTF-8 (no BOM) text.
594 The first line in a language file is the language's identifier.
The rest of the file consists of key-value pairs. The key of each pair
is the ID string (dictionary key), and the value is the localized
text for that key, in quotes.
598 
599 The localized text may contain the following special markers:
600 <table>
601 <tr>
602  <td><tt>{</tt><i>0, 1, 2, ...</i> <tt>}</tt></td>
603  <td>These markers will be replaced by programmer-supplied values before the
604  message is displayed.</td></tr>
605 <tr>
606  <td><tt>{ALT}</tt></td>
607  <td>The rest of the text will be in an alternate color. Using this again will
608  switch back to the primary color.</td></tr>
609 <tr>
610  <td><tt>{NEWLINE}</tt></td>
611  <td>It will be replaced with a line break in the game.</td></tr>
612 <tr>
613  <td><tt>{SMALLLINE}</tt></td>
614  <td>The rest of the text will be in a small font.</td></tr>
615 </table>
616 
There is an additional marker that should only appear in texts that
depend on a number: <tt>{N}</tt> will be replaced by the actual
number used. The keys for texts that depend on numbers also have special
suffixes that depend on the language. For all languages, a suffix of
621 <tt>_zero</tt> is tried if the number is zero, before trying the actual key
622 according to the language rules. The rest of the suffixes depend on the language,
623 as described <a href="http://unicode.org/repos/cldr-tmp/trunk/diff/supplemental/language_plural_rules.html">here</a>.
624 
625 So, you would write (for English):
626 <pre>
627 STR_ENEMIES:
628   zero:  "There are no enemies left."
629   one:   "There is a single enemy left."
630   other: "There are {N} enemies left."
631 </pre>
632 
633 */
634