<?php
// (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
//
// All Rights Reserved. See copyright.txt for details and a complete list of authors.
// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
// $Id$

/*
 * Created on Jan 27, 2009
 *
 */

/**
 * Thin client for the Google Translate v2 REST endpoint.
 *
 * Before text is sent, every piece of markup (HTML tags and parentheses, or
 * wiki syntax) is swapped for a placeholder of the form "id<N>" wrapped in a
 * <span class='notranslate'> element. After translation the spans are removed
 * and the placeholders are turned back into the original markup.
 *
 * The placeholder table is kept on the instance and grows across calls to
 * translateText().
 */
class Multilingual_MachineTranslation_GoogleTranslateWrapper implements Multilingual_MachineTranslation_Interface
{
    const SERVICE_URL = "https://www.googleapis.com/language/translate/v2";

    //wiki markup (keep this regex in case we decide to translate wiki markup and not html)
    // const WIKI_MARKUP = "/<[^>]*>| ?[\`\!\@\#\$\%\^\&\*\[\]\:\;\"\'\<\,\>\/\|\\\=\-\+\_\(\)]{2,} ?|\(\([\s\S]*?\)\)|\~[a-z]{2,3}\~[\s\S]*?\~\/[a-z]{2,3}\~|\~hs\~|\~\~[\s\S]*?\:|\~\~|[[^\|]*?\||\[[^|\]]*\]|\{\*[^\}\*]*?\*\}|\{[^\}]*?\}|^;|!/m";
    // NOTE(review): the alternative "[[^\|]*?\|" below is a character class
    // ("[", "^", "|"), not "a literal [ followed by non-pipe characters" --
    // presumably "\[[^\|]*?\|" was intended. Left untouched; please confirm.
    const WIKI_MARKUP = "/<[^>]*>| ?[\`\!\@\#\$\%\^\&\*\[\]\:\;\"\'\<\,\>\/\|\\\=\-\+\_\(\)]{2,} ?|\(\([\s\S]*?\)\)|\~\/?[a-z]{2,3}\~|\~hs\~|\~\~[\s\S]*?\:|\~\~|[[^\|]*?\||\[[^|\]]*\]|\{\*[^\}\*]*?\*\}|\{[^\}]*?\}|^;|!/m";

    //Google doesn't return parens upon translation
    //Include spaces in markup (Google adds some, and they will be stripped later. Want to preserve the original ones)
    const HTML_MARKUP = "/ ?<[^>]*> ?| ?\(|\) ?/";
    // Group 1: heading open tag plus any tags directly following it; group 3: the heading text.
    const TITLE_TAG = "/(<[Hh][\d][^>]*>(<[^>]*>)*)([^<]*)/";
    const NO_TRANSLATE_STRING = "<span class='notranslate'>\$0</span>";
    const NO_TRANSLATE_PATTERN = "/ <span class='notranslate'>(.*)<\/span> |^<span class='notranslate'>(.*)<\/span> | <span class='notranslate'>(.*)<\/span>\$|^<span class='notranslate'>(.*)<\/span>\$|<span class='notranslate'>(.*)<\/span>/Um";

    /** API key sent as the "key" request parameter. */
    private $key;
    /** Source language code, or Multilingual_MachineTranslation::DETECT_LANGUAGE. */
    private $sourceLang;
    /** Target language code sent as the "target" request parameter. */
    private $targetLang;
    /** Regex (HTML_MARKUP or WIKI_MARKUP) selecting the text that must not be translated. */
    private $markup;
    /** True when the input is HTML, false when it is wiki syntax. */
    private $translatingHTML = true;
    /** Placeholder table: numeric id => original markup string. */
    private $arrayOfUntranslatableStringsAndTheirIDs = [];
    /** Last numeric id handed out; the first placeholder is therefore "id170". */
    private $currentID = 169;

    /**
     * @param string $key        API key.
     * @param string $sourceLang Source language code, or the DETECT_LANGUAGE marker.
     * @param string $targetLang Target language code.
     * @param bool   $html       True to treat input as HTML, false to treat it as wiki syntax.
     */
    public function __construct($key, $sourceLang, $targetLang, $html = true)
    {
        $this->key = $key;
        $this->sourceLang = $sourceLang;
        $this->targetLang = $targetLang;
        if ($html) {
            $this->markup = self::HTML_MARKUP;
        } else {
            $this->translatingHTML = false;
            $this->markup = self::WIKI_MARKUP;
        }
    }


    /**
     * Lists the languages offered by this wrapper.
     *
     * @return array Map of language code => English language name.
     */
    public function getSupportedLanguages()
    {
        return [
            'sq' => 'Albanian',
            'ar' => 'Arabic',
            'bg' => 'Bulgarian',
            'ca' => 'Catalan',
            'zh' => 'Chinese',
            'hr' => 'Croatian',
            'cs' => 'Czech',
            'da' => 'Danish',
            'nl' => 'Dutch',
            'en' => 'English',
            'et' => 'Estonian',
            'fil' => 'Filipino',
            'fi' => 'Finnish',
            'fr' => 'French',
            'gl' => 'Galician',
            'de' => 'German',
            'el' => 'Greek',
            'he' => 'Hebrew',
            'hi' => 'Hindi',
            'hu' => 'Hungarian',
            'id' => 'Indonesian',
            'it' => 'Italian',
            'ja' => 'Japanese',
            'ko' => 'Korean',
            'lv' => 'Latvian',
            'lt' => 'Lithuanian',
            'mt' => 'Maltese',
            'no' => 'Norwegian',
            'fa' => 'Persian',
            'pl' => 'Polish',
            'pt' => 'Portuguese',
            'ro' => 'Romanian',
            'ru' => 'Russian',
            'sr' => 'Serbian',
            'sk' => 'Slovak',
            'sl' => 'Slovenian',
            'es' => 'Spanish',
            'sv' => 'Swedish',
            'th' => 'Thai',
            'tr' => 'Turkish',
            'uk' => 'Ukrainian',
            'vi' => 'Vietnamese'
        ];
    }


    /**
     * Translates a text while keeping its markup intact.
     *
     * Text whose URL-encoded form is shorter than 1800 bytes is sent in one
     * request; longer text is split into sentence-based chunks. The translated
     * chunks are joined with a single space and the result is trimmed.
     *
     * @param string $text HTML or wiki text, depending on the constructor's $html flag.
     * @return string Translated text with the original markup restored.
     */
    public function translateText($text)
    {
        $text = $this->escape_untranslatable_text($text);

        if (strlen(urlencode($text)) < 1800) {
            $chunks = [$text];
        } else {
            $chunks = $this->splitInLogicalChunksOf450CharsMax($text);
        }

        $result = "";
        foreach ($chunks as $textToTranslate) {
            $result .= $this->getTranslationFromGoogle($textToTranslate) . " ";
        }

        $result = $this->remove_notranslateTags_and_reverse_to_original_markup($result);
        return trim($result);
    }


    /**
     * Translates each sentence with its own request and concatenates the results.
     *
     * Not called from anywhere inside this class.
     *
     * @param string $text Text to segment and translate.
     * @return string Concatenated translations.
     */
    private function translateSentenceBySentence($text)
    {
        $segmentor = new Multilingual_Aligner_SentenceSegmentor();
        $sentences = $segmentor->segment($text);
        $result = "";
        foreach ($sentences as $textToTranslate) {
            $result .= $this->getTranslationFromGoogle($textToTranslate);
        }

        return $result;
    }

    /**
     * Sends one piece of text to the translation service.
     *
     * @param string $text Text to translate (already escaped by escape_untranslatable_text()).
     * @return string Concatenation of all "translatedText" entries in the response,
     *                or an empty string when the response does not have the expected
     *                data.translations list (e.g. a failed request).
     */
    private function getTranslationFromGoogle($text)
    {
        require_once 'lib/ointegratelib.php';
        $ointegrate = new OIntegrate();
        $params = [
            'key' => $this->key,
            'target' => $this->targetLang,
            'q' => $text,
            'format' => ($this->markup === self::HTML_MARKUP) ? 'html' : 'text',
        ];

        // Omitting "source" lets the service detect the language itself.
        if ($this->sourceLang != Multilingual_MachineTranslation::DETECT_LANGUAGE) {
            $params['source'] = $this->sourceLang;
        }

        $url = self::SERVICE_URL . '?' . http_build_query($params, '', '&');

        $oi_result = $ointegrate->performRequest($url);

        // A failed request does not carry data.translations; dereferencing it
        // blindly raises warnings (and a TypeError in array_map on PHP 8).
        if (! isset($oi_result->data['data']['translations'])
            || ! is_array($oi_result->data['data']['translations'])
        ) {
            return '';
        }

        $translated = '';
        foreach ($oi_result->data['data']['translations'] as $entry) {
            if (isset($entry['translatedText'])) {
                $translated .= $entry['translatedText'];
            }
        }

        return $translated;
    }

    /**
     * Groups the sentences of a text into chunks for separate requests.
     *
     * Sentences are appended to the current chunk as long as the chunk's
     * URL-encoded length is still below 450 bytes *before* appending.
     * NOTE(review): because the check happens before the append, a chunk can
     * end up longer than 450 bytes by up to one sentence, despite the method
     * name. Behavior kept as is.
     *
     * @param string $text Text to split.
     * @return array List of chunks; a single-element list holding $text when the
     *               segmentor yields no sentences.
     */
    private function splitInLogicalChunksOf450CharsMax($text)
    {
        $segmentor = new Multilingual_Aligner_SentenceSegmentor();
        $sentences = $segmentor->segment($text);

        // Nothing to group: avoid reading an undefined offset below.
        if (! isset($sentences[0])) {
            return [$text];
        }

        $chunks = [];
        $chunk = $sentences[0];
        $total = count($sentences);
        for ($ii = 1; $ii < $total; $ii++) {
            if (strlen(urlencode($chunk)) < 450) {
                $chunk .= $sentences[$ii];
            } else {
                $chunks[] = $chunk;
                $chunk = $sentences[$ii];
            }
        }
        $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Lower-cases the text of every heading matched by TITLE_TAG.
     *
     * Done in a single regex pass so that a heading whose match is a prefix of
     * another heading's match cannot leave the longer one half-converted.
     * Multibyte-aware lower-casing is used when mbstring is available and the
     * heading is valid UTF-8; otherwise the byte-wise strtolower() is used.
     *
     * @param string $text
     * @return string
     */
    private function lowercaseTitles($text)
    {
        $lowered = preg_replace_callback(
            self::TITLE_TAG,
            function ($match) {
                $title = $match[3];
                if (function_exists('mb_strtolower') && mb_check_encoding($title, 'UTF-8')) {
                    $title = mb_strtolower($title, 'UTF-8');
                } else {
                    $title = strtolower($title);
                }
                return $match[1] . $title;
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input then.
        return ($lowered === null) ? $text : $lowered;
    }

    /*
     * Google Translate works best when wiki or html markup is first replaced with
     * a unique id (here something like this is used: id169) and then those ids
     * surrounded by Google's notranslate span tag. Upon translations span tags are
     * removed and ids reversed to the original markup.
     */
    /**
     * Replaces markup with "id<N>" placeholders wrapped in notranslate spans.
     *
     * @param string $text Raw HTML or wiki text.
     * @return string Text in which every markup match has been replaced.
     */
    private function escape_untranslatable_text($text)
    {
        //Title is all between <hx> tags. Put it in lower case, so Google doesn't
        //take the capitalized words as proper names
        $text = $this->lowercaseTitles($text);

        if (! $this->translatingHTML) {
            // Line breaks become <br> tags, which the markup regex then protects.
            $text = nl2br($text);
        }

        // Register every markup string that does not have a placeholder id yet.
        if (preg_match_all($this->markup, $text, $matches)) {
            foreach ($matches[0] as $matched_markup) {
                $id = array_search($matched_markup, $this->arrayOfUntranslatableStringsAndTheirIDs, true);
                if ($id === false) {
                    $id = (int)$this->currentID + 1;
                    $this->arrayOfUntranslatableStringsAndTheirIDs[$id] = $matched_markup;
                    $this->currentID = $id;
                }
            }
        }

        // Substitute in registration order (markup registered by earlier calls included).
        foreach ($this->arrayOfUntranslatableStringsAndTheirIDs as $id => $markup) {
            $placeholder = "id" . $id;

            //adding dot after </ul> to have it segmented properly. otherwise when the html contains only lists,
            //sentence segmentor can't find where to segment the text
            // NOTE(review): this dot is not removed again when the markup is
            // restored, so the output contains "</ul>." -- confirm this is wanted.
            if ($markup === "</ul>") {
                $placeholder .= ".";
            }

            $text = str_replace($markup, $placeholder, $text);
        }

        // Wrap each run of placeholders in a notranslate span ($0 = the whole run).
        $text = preg_replace("/(id[\d]+\.?(id[\d]+)*)/", self::NO_TRANSLATE_STRING, $text);
        return $text;
    }


    /**
     * Undoes escape_untranslatable_text() on translated text.
     *
     * @param string $text Translated text still containing notranslate spans and placeholders.
     * @return string Text with spans removed and placeholders replaced by the original markup.
     */
    private function remove_notranslateTags_and_reverse_to_original_markup($text)
    {
        //Google adds spaces before and after notranslate span

        // Each alternative of NO_TRANSLATE_PATTERN has its own capture group;
        // replace the whole match (span plus added spaces) with the captured
        // content. Groups are handled in order so that the variants including
        // surrounding spaces are stripped before the bare-span variant.
        if (preg_match_all(self::NO_TRANSLATE_PATTERN, $text, $matches)) {
            $groupCount = count($matches);
            for ($group = 1; $group < $groupCount; $group++) {
                foreach ($matches[0] as $index => $found) {
                    if (! empty($matches[$group][$index])) {
                        $text = str_replace($found, $matches[$group][$index], $text);
                    }
                }
            }
        }

        // Restore the markup in one pass. strtr() inserts the markup literally
        // (no "$1" / backslash interpretation as with a regex replacement),
        // prefers the longest key (so "id170" cannot eat the start of "id1700"),
        // and never re-scans text it has already replaced.
        $placeholders = [];
        foreach ($this->arrayOfUntranslatableStringsAndTheirIDs as $id => $markup) {
            $placeholders["id" . $id] = $markup;
        }
        if (! empty($placeholders)) {
            $text = strtr($text, $placeholders);
        }

        //trimming leading spaces in each line (wiki syntax doesn't work unless)
        if (! $this->translatingHTML) {
            $textArray = preg_split('/\<br(\s*)?\/?\>/i', $text);
            $textArray = array_map('trim', $textArray);
            $text = implode("\n", $textArray);
        }
        return $text;
    }
}