1<?php
2// (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
3//
4// All Rights Reserved. See copyright.txt for details and a complete list of authors.
5// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
6// $Id$
7
8/*
9 * Created on Jan 27, 2009
10 *
11 */
12
/**
 * Machine translation provider that sends text to the Google Translate v2 REST service.
 *
 * Before a request is made, every piece of markup found in the text is replaced by a
 * placeholder of the form "id<N>" wrapped in a <span class='notranslate'> element. Once the
 * translation comes back, the spans are stripped and the placeholders are turned back into
 * the original markup. The placeholder table is kept on the instance and keeps growing
 * across successive calls to translateText().
 */
class Multilingual_MachineTranslation_GoogleTranslateWrapper implements Multilingual_MachineTranslation_Interface
{
	const SERVICE_URL = "https://www.googleapis.com/language/translate/v2";

	//wiki markup (keep this regex in case we decide to translate wiki markup and not html)
	//	const WIKI_MARKUP = "/<[^>]*>| ?[\`\!\@\#\$\%\^\&\*\[\]\:\;\"\'\<\,\>\/\|\\\=\-\+\_\(\)]{2,} ?|\(\([\s\S]*?\)\)|\~[a-z]{2,3}\~[\s\S]*?\~\/[a-z]{2,3}\~|\~hs\~|\~\~[\s\S]*?\:|\~\~|[[^\|]*?\||\[[^|\]]*\]|\{\*[^\}\*]*?\*\}|\{[^\}]*?\}|^;|!/m";
	// NOTE(review): the alternative that starts with an unescaped "[" (just before the
	// "[...]" link alternative) opens a character class; an escaped "\[" may have been
	// intended there. Left untouched -- confirm before changing the pattern.
	const WIKI_MARKUP = "/<[^>]*>| ?[\`\!\@\#\$\%\^\&\*\[\]\:\;\"\'\<\,\>\/\|\\\=\-\+\_\(\)]{2,} ?|\(\([\s\S]*?\)\)|\~\/?[a-z]{2,3}\~|\~hs\~|\~\~[\s\S]*?\:|\~\~|[[^\|]*?\||\[[^|\]]*\]|\{\*[^\}\*]*?\*\}|\{[^\}]*?\}|^;|!/m";

	//Google doesn't return parens upon translation
	//Include spaces in markup (Google adds some, and they will be stripped later. Want to preserve the original ones)
	const HTML_MARKUP = "/ ?<[^>]*> ?| ?\(|\) ?/";
	// Group 1: heading tag plus any tags nested directly after it; group 3: the heading text.
	const TITLE_TAG = "/(<[Hh][\d][^>]*>(<[^>]*>)*)([^<]*)/";
	// Replacement template that wraps a run of placeholders in a notranslate span.
	const NO_TRANSLATE_STRING = "<span class='notranslate'>\$0</span>";
	// Matches a notranslate span together with the single spaces that may surround it.
	const NO_TRANSLATE_PATTERN = "/ <span class='notranslate'>(.*)<\/span> |^<span class='notranslate'>(.*)<\/span> | <span class='notranslate'>(.*)<\/span>\$|^<span class='notranslate'>(.*)<\/span>\$|<span class='notranslate'>(.*)<\/span>/Um";

	// Value sent as the "key" request parameter.
	private $key;
	// Source language code; the "source" parameter is omitted when this equals DETECT_LANGUAGE.
	private $sourceLang;
	// Value sent as the "target" request parameter.
	private $targetLang;
	// Regex (HTML_MARKUP or WIKI_MARKUP) used to find the text that must not be translated.
	private $markup;
	// False when the wrapper was constructed for wiki syntax instead of HTML.
	private $translatingHTML = true;
	// Placeholder table: numeric id => original markup string.
	private $arrayOfUntranslatableStringsAndTheirIDs = [];
	// Last id handed out; the first placeholder is therefore "id170".
	private $currentID = 169;

	/**
	 * @param string $key        sent to the service as the "key" parameter
	 * @param string $sourceLang source language code, or
	 *                           Multilingual_MachineTranslation::DETECT_LANGUAGE to send no "source"
	 * @param string $targetLang sent to the service as the "target" parameter
	 * @param bool   $html       true to protect HTML markup, false to protect wiki markup
	 */
	public function __construct($key, $sourceLang, $targetLang, $html = true)
	{
		$this->key = $key;
		$this->sourceLang = $sourceLang;
		$this->targetLang = $targetLang;
		if ($html) {
			$this->markup = self::HTML_MARKUP;
		} else {
			$this->translatingHTML = false;
			$this->markup = self::WIKI_MARKUP;
		}
	}


	/**
	 * Lists the languages offered by this wrapper.
	 *
	 * @return array language code => English language name
	 */
	public function getSupportedLanguages()
	{
		return [
			'sq' => 'Albanian',
			'ar' => 'Arabic',
			'bg' => 'Bulgarian',
			'ca' => 'Catalan',
			'zh' => 'Chinese',
			'hr' => 'Croatian',
			'cs' => 'Czech',
			'da' => 'Danish',
			'nl' => 'Dutch',
			'en' => 'English',
			'et' => 'Estonian',
			'fil' => 'Filipino',
			'fi' => 'Finnish',
			'fr' => 'French',
			'gl' => 'Galician',
			'de' => 'German',
			'el' => 'Greek',
			'he' => 'Hebrew',
			'hi' => 'Hindi',
			'hu' => 'Hungarian',
			'id' => 'Indonesian',
			'it' => 'Italian',
			'ja' => 'Japanese',
			'ko' => 'Korean',
			'lv' => 'Latvian',
			'lt' => 'Lithuanian',
			'mt' => 'Maltese',
			'no' => 'Norwegian',
			'fa' => 'Persian',
			'pl' => 'Polish',
			'pt' => 'Portuguese',
			'ro' => 'Romanian',
			'ru' => 'Russian',
			'sr' => 'Serbian',
			'sk' => 'Slovak',
			'sl' => 'Slovenian',
			'es' => 'Spanish',
			'sv' => 'Swedish',
			'th' => 'Thai',
			'tr' => 'Turkish',
			'uk' => 'Ukrainian',
			'vi' => 'Vietnamese'
		];
	}


	/**
	 * Translates a text while keeping its markup intact.
	 *
	 * The markup is replaced by placeholders, the text is sent to the service (in one
	 * request when its URL-encoded length is below 1800 bytes, otherwise chunk by chunk),
	 * and the placeholders are restored in the concatenated result.
	 *
	 * @param string $text text to translate
	 * @return string translated text, trimmed
	 */
	public function translateText($text)
	{
		$text = $this->escape_untranslatable_text($text);

		if (strlen(urlencode($text)) < 1800) {
			$chunks = [$text];
		} else {
			$chunks = $this->splitInLogicalChunksOf450CharsMax($text);
		}

		$result = "";
		foreach ($chunks as $textToTranslate) {
			// Chunks are joined with a space; the trailing one is removed by trim() below.
			$result .= $this->getTranslationFromGoogle($textToTranslate) . " ";
		}

		$result = $this->remove_notranslateTags_and_reverse_to_original_markup($result);
		return trim($result);
	}


	/**
	 * Translates a text one sentence per request and concatenates the results.
	 *
	 * Not called from anywhere inside this class.
	 *
	 * @param string $text text to translate
	 * @return string concatenated translations
	 */
	private function translateSentenceBySentence($text)
	{
		$segmentor = new Multilingual_Aligner_SentenceSegmentor();
		$sentences = $segmentor->segment($text);
		$result = "";
		foreach ($sentences as $textToTranslate) {
			$result .= $this->getTranslationFromGoogle($textToTranslate);
		}

		return $result;
	}

	/**
	 * Performs one request against SERVICE_URL and returns the translated text.
	 *
	 * @param string $text text sent as the "q" parameter
	 * @return string all "translatedText" entries of the response joined together, or an
	 *                empty string when the response carries no translations
	 */
	private function getTranslationFromGoogle($text)
	{
		require_once 'lib/ointegratelib.php';
		$ointegrate = new OIntegrate();
		$params = [
			'key' => $this->key,
			'target' => $this->targetLang,
			'q' => $text,
			'format' => ($this->markup === self::HTML_MARKUP) ? 'html' : 'text',
		];

		if ($this->sourceLang != Multilingual_MachineTranslation::DETECT_LANGUAGE) {
			$params['source'] = $this->sourceLang;
		}

		$url = self::SERVICE_URL . '?' . http_build_query($params, '', '&');

		$response = $ointegrate->performRequest($url);

		// A failed or unexpected response has no translation list; handing null to
		// array_map() would be a TypeError on PHP 8, so bail out with an empty result.
		$translations = isset($response->data['data']['translations'])
			? $response->data['data']['translations']
			: null;
		if (! is_array($translations)) {
			return '';
		}

		$translated = '';
		foreach ($translations as $entry) {
			if (isset($entry['translatedText'])) {
				$translated .= $entry['translatedText'];
			}
		}

		return $translated;
	}

	/**
	 * Groups the sentences of a text into chunks for separate requests.
	 *
	 * Sentences are appended to the current chunk for as long as the chunk's URL-encoded
	 * length is below 450 bytes when the next sentence arrives. The check is made before
	 * appending, so a finished chunk can exceed 450 bytes by up to one sentence.
	 *
	 * @param string $text text to split
	 * @return array list of chunks, in original order
	 */
	private function splitInLogicalChunksOf450CharsMax($text)
	{
		$segmentor = new Multilingual_Aligner_SentenceSegmentor();
		$sentences = $segmentor->segment($text);

		// Nothing segmented: fall back to the whole text instead of reading a missing offset.
		if (empty($sentences)) {
			return [$text];
		}

		$sentences = array_values($sentences);
		$chunks = [];
		$chunk = array_shift($sentences);
		foreach ($sentences as $sentence) {
			if (strlen(urlencode($chunk)) < 450) {
				$chunk = $chunk . $sentence;
			} else {
				$chunks[] = $chunk;
				$chunk = $sentence;
			}
		}
		$chunks[] = $chunk;
		return $chunks;
	}

	/**
	 * Lower-cases heading text.
	 *
	 * NOTE(review): assumes the text is UTF-8 encoded -- confirm. Falls back to the
	 * byte-wise strtolower() when the mbstring extension is not loaded.
	 *
	 * @param string $title heading text
	 * @return string lower-cased heading text
	 */
	private function lowercaseTitleText($title)
	{
		if (function_exists('mb_strtolower')) {
			return mb_strtolower($title, 'UTF-8');
		}
		return strtolower($title);
	}

	/*
	 * Google Translate works best when wiki or html markup is first replaced with
	 * a unique id (here something like this is used: id169) and then those ids
	 * surrounded by Google's notranslate span tag. Upon translations span tags are
	 * removed and ids reversed to the original markup.
	 */
	/**
	 * Replaces markup with placeholders wrapped in notranslate spans.
	 *
	 * @param string $text original text
	 * @return string text ready to be sent to the translation service
	 */
	private function escape_untranslatable_text($text)
	{
		//Title is all between <hx> tags. Put it in lower case, so Google doesn't
		//take the capitalized words as proper names
		// Done in a single pass so that one heading being a prefix of another cannot
		// leave the longer one partly untouched.
		$self = $this;
		$lowered = preg_replace_callback(
			self::TITLE_TAG,
			function ($titleMatch) use ($self) {
				return $titleMatch[1] . $self->lowercaseTitleTextForCallback($titleMatch[3]);
			},
			$text
		);
		if ($lowered !== null) {
			$text = $lowered;
		}

		if (! $this->translatingHTML) {
			$text = nl2br($text);
		}

		// Register every markup string that has no placeholder id yet.
		if (preg_match_all($this->markup, $text, $matches)) {
			foreach ($matches[0] as $matched_markup) {
				$known = array_search($matched_markup, $this->arrayOfUntranslatableStringsAndTheirIDs, true);
				if ($known === false) {
					$this->currentID = (int)$this->currentID + 1;
					$this->arrayOfUntranslatableStringsAndTheirIDs[$this->currentID] = $matched_markup;
				}
			}
		}

		// Substitute in registration order, including markup registered by earlier calls.
		foreach ($this->arrayOfUntranslatableStringsAndTheirIDs as $id => $markup) {
			$placeholder = "id" . $id;

			//adding dot after </ul> to have it segmented properly. otherwise when the html contains only lists,
			//sentence segmentor can't find where to segment the text
			if ($markup === "</ul>") {
				$placeholder .= ".";
			}
			$text = str_replace($markup, $placeholder, $text);
		}

		$text = preg_replace("/(id[\d]+\.?(id[\d]+)*)/", self::NO_TRANSLATE_STRING, $text);
		return $text;
	}

	/**
	 * Bridge that lets the heading callback reach lowercaseTitleText().
	 *
	 * Public only so the closure in escape_untranslatable_text() can call it through an
	 * explicitly captured instance; not intended for outside callers.
	 *
	 * @param string $title heading text
	 * @return string lower-cased heading text
	 */
	public function lowercaseTitleTextForCallback($title)
	{
		return $this->lowercaseTitleText($title);
	}


	/**
	 * Strips the notranslate spans and puts the original markup back.
	 *
	 * @param string $text translated text still containing spans and placeholders
	 * @return string translated text with the original markup restored
	 */
	private function remove_notranslateTags_and_reverse_to_original_markup($text)
	{
		//Google adds spaces before and after notranslate span

		if (preg_match_all(self::NO_TRANSLATE_PATTERN, $text, $matches)) {
			// Group 0 holds the complete matches; groups 1..n hold the span content for
			// each alternative of the pattern. Replace every complete match (span plus
			// surrounding spaces) by the content of whichever group captured it.
			$groupCount = count($matches);
			for ($group = 1; $group < $groupCount; $group++) {
				foreach ($matches[0] as $index => $found) {
					if (! empty($matches[$group][$index])) {
						$text = str_replace($found, $matches[$group][$index], $text);
					}
				}
			}
		}

		// Restore all placeholders in one literal pass. strtr() tries the longest keys
		// first, so "id170" can never be substituted inside "id1700", and the markup is
		// inserted verbatim ("$1" or "\1" inside it are not treated as backreferences).
		$placeholders = [];
		foreach ($this->arrayOfUntranslatableStringsAndTheirIDs as $id => $markup) {
			$placeholders["id" . $id] = $markup;
		}
		// strtr() returns false for an empty map before PHP 8.0, hence the guard.
		if (! empty($placeholders)) {
			$text = strtr($text, $placeholders);
		}

		//trimming leading spaces in each line (wiki syntax doesn't work unless)
		if (! $this->translatingHTML) {
			$textArray = preg_split('/\<br(\s*)?\/?\>/i', $text);
			$textArray = array_map('trim', $textArray);
			$text = implode("\n", $textArray);
		}
		return $text;
	}
}
259