#
# Copyright 2007-2010 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""Manage the Wordfast Translation Memory format

Wordfast TM format is the Translation Memory format used by the
`Wordfast <http://www.wordfast.net/>`_ computer aided translation tool.

It is a bilingual base class derived format with :class:`WordfastTMFile`
and :class:`WordfastUnit` providing file and unit level access.

Wordfast is a computer aided translation tool. It is an application
built on top of Microsoft Word and is implemented as a rather
sophisticated set of macros. Understanding that helps us understand
many of the seemingly strange choices around this format including:
encoding, escaping and file naming.

Implementation
    The implementation covers the full requirements of a Wordfast TM file.
    The files are simple Tab Separated Value (TSV) files that can be read
    by Microsoft Excel and other spreadsheet programs. They use the .txt
    extension which does make it more difficult to automatically identify
    such files.

    The dialect of the TSV files is specified by :class:`WordfastDialect`.

Encoding
    The files are UTF-16 or ISO-8859-1 (Latin1) encoded. These choices
    are most likely because Microsoft Word is the base editing tool for
    Wordfast.

    The format is tab separated so we are able to detect UTF-16 vs Latin-1
    by searching for the occurrence of a UTF-16 tab character and then
    continuing with the parsing.

Timestamps
    :class:`WordfastTime` allows for the correct management of the Wordfast
    YYYYMMDD~HHMMSS timestamps. The timestamp of an individual unit is
    refreshed whenever its source or target text is changed through
    :class:`WordfastUnit`.

Header
    :class:`WordfastHeader` provides header management support. The header
    functionality is fully implemented through observing the behaviour of the
    files in real use cases, input from the Wordfast programmers and
    public documentation.

Escaping
    Wordfast TM implements a form of escaping that covers two aspects:

    1. Placeable: bold, formatting, etc. These are left as is and ignored. It
       is up to the editor and future placeable implementation to manage these.

    2. Escapes: items that may confuse Excel or translators are escaped as
       ``&'XX;``. These are fully implemented and are converted to and from
       Unicode. By observing behaviour and reading documentation we were able
       to observe all possible escapes. Unfortunately the escaping differs
       slightly between the Windows and Mac versions. This might cause errors
       in future. Functions allow for ``<_wf_to_char>`` and back to Wordfast
       escape (``<_char_to_wf>``).

Extended Attributes
    The last 4 columns allow users to define and manage extended attributes.
    These are left as is and are not directly managed by our implementation.
"""

import csv
import time

from translate.storage import base


# strftime/strptime pattern, e.g. "20240131~235959".
WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
"""Time format used by Wordfast"""

# Column names of the first (metadata) row of a file. They are paired
# positionally with WF_FIELDNAMES when that row is read or written.
WF_FIELDNAMES_HEADER = [
    "date",
    "userlist",
    "tucount",
    "src-lang",
    "version",
    "target-lang",
    "license",
    "attr1list",
    "attr2list",
    "attr3list",
    "attr4list",
    "attr5list",
]
"""Field names for the Wordfast header"""

# Column names of every row after the header row, in file order.
WF_FIELDNAMES = [
    "date",
    "user",
    "reuse",
    "src-lang",
    "source",
    "target-lang",
    "target",
    "attr1",
    "attr2",
    "attr3",
    "attr4",
    "attr5",
]
"""Field names for a Wordfast TU"""

# Non-empty header values carry a leading "%"; the "date" entry is replaced
# with the current time when a default header is created.
WF_FIELDNAMES_HEADER_DEFAULTS = {
    "date": "%19000101~121212",
    "userlist": "%User ID,TT,TT Translate-Toolkit",
    "tucount": "%TU=00000001",
    "src-lang": "%EN-US",
    "version": "%Wordfast TM v.5.51w9/00",
    "target-lang": "",
    "license": "%---00000001",
    "attr1list": "",
    "attr2list": "",
    "attr3list": "",
    "attr4list": "",
    "attr5list": "",
}
"""Default or minimum header entries for a Wordfast file"""

# TODO Needs validation. The following need to be checked against a WF TM file
# to ensure that the correct Unicode values have been chosen for the characters.
# For now these look correct and have been taken from Windows CP1252 and
# Macintosh code points found for the respective character sets on Linux.
WF_ESCAPE_MAP = (
    ("&'26;", "\u0026"),  # & - Ampersand (must be first to prevent
    #     escaping of escapes)
    ("&'82;", "\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'85;", "\u2026"),  # … - Ellipsis
    ("&'91;", "\u2018"),  # ‘ - left single quotation mark
    ("&'92;", "\u2019"),  # ’ - right single quotation mark
    ("&'93;", "\u201C"),  # “ - left double quotation mark
    ("&'94;", "\u201D"),  # ” - right double quotation mark
    ("&'96;", "\u2013"),  # – - en dash (validate)
    ("&'97;", "\u2014"),  # — - em dash (validate)
    ("&'99;", "\u2122"),  # ™ - Trade mark
    # Windows only
    ("&'A0;", "\u00A0"),  #   - Non breaking space
    ("&'A9;", "\u00A9"),  # © - Copyright
    ("&'AE;", "\u00AE"),  # ® - Registered
    ("&'BC;", "\u00BC"),  # ¼
    ("&'BD;", "\u00BD"),  # ½
    ("&'BE;", "\u00BE"),  # ¾
    # Mac only
    ("&'A8;", "\u00AE"),  # ® - Registered
    ("&'AA;", "\u2122"),  # ™ - Trade mark
    ("&'C7;", "\u00AB"),  # « - Left-pointing double angle quotation mark
    ("&'C8;", "\u00BB"),  # » - Right-pointing double angle quotation mark
    ("&'C9;", "\u2026"),  # … - Horizontal Ellipsis
    ("&'CA;", "\u00A0"),  #   - Non breaking space
    ("&'D0;", "\u2013"),  # – - en dash (validate)
    ("&'D1;", "\u2014"),  # — - em dash (validate)
    ("&'D2;", "\u201C"),  # “ - left double quotation mark
    ("&'D3;", "\u201D"),  # ” - right double quotation mark
    ("&'D4;", "\u2018"),  # ‘ - left single quotation mark
    ("&'D5;", "\u2019"),  # ’ - right single quotation mark
    ("&'E2;", "\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'E3;", "\u201E"),  # „ - Double low-9 quotation mark
    # Other markers
    # Soft-break - XXX creates a problem with roundtripping could
    # also be represented by \u2028
    # ("&'B;", "\n"),
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""

TAB_UTF16 = b"\x00\x09"
"""The tab \\t character as it would appear in UTF-16 encoding"""


def _char_to_wf(string):
    """Char -> Wordfast &'XX; escapes

    Full roundtripping is not possible because of the escaping of
    NEWLINE \\n and TAB \\t

    :param string: text to escape; falsy values (``None``, ``""``) are
        returned unchanged
    :return: the text with every character listed in ``WF_ESCAPE_MAP``
        replaced by its escape, and newline/tab replaced by the two-character
        sequences ``\\n`` and ``\\t``
    """
    # FIXME there is no platform check to ensure that we use Mac encodings
    # when running on a Mac
    if string:
        # The map is walked in order: the ampersand entry comes first so the
        # "&" of escapes produced by later entries is not escaped again, and
        # for characters listed twice the earlier (Windows) code is the one
        # that gets emitted.
        for code, char in WF_ESCAPE_MAP:
            string = string.replace(char, code)
        string = string.replace("\n", "\\n").replace("\t", "\\t")
    return string


def _wf_to_char(string):
    """Wordfast &'XX; escapes -> Char

    :param string: Wordfast-escaped text; falsy values (``None``, ``""``)
        are returned unchanged
    :return: the text with every escape listed in ``WF_ESCAPE_MAP`` replaced
        by its character, and the two-character sequences ``\\n`` and ``\\t``
        replaced by newline and tab
    """
    if string:
        # Walk the map backwards so that the ampersand escape (the first
        # entry) is decoded LAST.  Decoding it first would turn text such as
        # "&'26;'85;" (an escaped literal "&'85;") into "&'85;" and then
        # wrongly into an ellipsis.
        for code, char in reversed(WF_ESCAPE_MAP):
            string = string.replace(code, char)
        string = string.replace("\\n", "\n").replace("\\t", "\t")
    return string


class WordfastDialect(csv.Dialect):
    """Describe the properties of a Wordfast generated TAB-delimited file."""

    delimiter = "\t"
    lineterminator = "\r\n"
    quoting = csv.QUOTE_NONE


# Registered at import time so readers/writers can use dialect="wordfast".
csv.register_dialect("wordfast", WordfastDialect)


class WordfastTime:
    """Manages time stamps in the Wordfast format of YYYYMMDD~hhmmss"""

    def __init__(self, newtime=None):
        """Create a timestamp holder.

        :param newtime: a Wordfast time string, a ``time.struct_time``, or a
            falsy value for "no time".  Any other value leaves the time unset.
        """
        self._time = None
        if not newtime:
            self.time = None
        elif isinstance(newtime, str):
            self.timestring = newtime
        elif isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Get the time in the Wordfast time format

        :return: the formatted time, or ``None`` when no time is set
        """
        if not self._time:
            return None
        return time.strftime(WF_TIMEFORMAT, self._time)

    def set_timestring(self, timestring):
        """Set the time_struct object using a Wordfast time formatted string

        :param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        :type timestring: String
        :raises ValueError: (from ``time.strptime``) if the string does not
            match ``WF_TIMEFORMAT``
        """
        self._time = time.strptime(timestring, WF_TIMEFORMAT)

    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Get the time_struct object"""
        return self._time

    def set_time(self, newtime):
        """Set the time_struct object

        Anything that is not a ``time.struct_time`` clears the stored time.

        :param newtime: a new time object
        :type newtime: time.struct_time
        """
        if newtime and isinstance(newtime, time.struct_time):
            self._time = newtime
        else:
            self._time = None

    time = property(get_time, set_time)

    def __str__(self):
        """Return the Wordfast time string, or ``""`` when no time is set."""
        return self.timestring or ""


class WordfastHeader:
    """A wordfast translation memory header"""

    def __init__(self, header=None):
        """Create a header.

        :param header: a dict of header fields keyed by
            ``WF_FIELDNAMES_HEADER`` names.  A falsy value produces the
            default header; a truthy non-dict value is ignored and leaves the
            header empty.
        """
        # Start from an empty mapping (not a list) so the item-assigning
        # setters below work even when ``header`` is ignored.
        self._header_dict = {}
        if not header:
            self.header = self._create_default_header()
        elif isinstance(header, dict):
            self.header = header

    def _create_default_header(self):
        """Create a default Wordfast header with the date set to the current
        time
        """
        defaultheader = dict(WF_FIELDNAMES_HEADER_DEFAULTS)
        # Header values carry a leading "%".
        defaultheader["date"] = "%%%s" % WordfastTime(time.localtime()).timestring
        return defaultheader

    def getheader(self):
        """Get the header dictionary"""
        return self._header_dict

    def setheader(self, newheader):
        """Replace the header dictionary (stored as given, not copied)"""
        self._header_dict = newheader

    header = property(getheader, setheader)

    def settargetlang(self, newlang):
        """Store the target language in the header, prefixed with ``%``"""
        self._header_dict["target-lang"] = "%%%s" % newlang

    # Write-only property.
    targetlang = property(None, settargetlang)

    def settucount(self, count):
        """Store the unit count as ``%TU=`` followed by 8 zero-padded digits

        :param count: number of translation units
        :type count: int
        """
        self._header_dict["tucount"] = "%%TU=%08d" % count

    # Write-only property.
    tucount = property(None, settucount)


class WordfastUnit(base.TranslationUnit):
    """A Wordfast translation memory unit"""

    def __init__(self, source=None):
        """Create a unit, optionally with its source text.

        :param source: unescaped source text, or ``None``
        """
        # The backing dict must exist before the ``source`` setter runs.
        self._dict = {}
        if source:
            self.source = source
        super().__init__(source)

    def _update_timestamp(self):
        """Refresh the timestamp for the unit"""
        self._dict["date"] = WordfastTime(time.localtime()).timestring

    def getdict(self):
        """Get the dictionary of values for a Wordfast line"""
        return self._dict

    def setdict(self, newdict):
        """Set the dictionary of values for a Wordfast line

        :param newdict: a new dictionary with Wordfast line elements
        :type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict

    dict = property(getdict, setdict)

    def _get_source_or_target(self, key):
        """Return the unescaped text stored under ``key``

        :param key: ``"source"`` or ``"target"``
        :return: ``None`` if the field is missing or ``None``, ``""`` if it
            is otherwise empty, else the text with Wordfast escapes decoded
        """
        value = self._dict.get(key)
        if value is None:
            return None
        if value:
            return _wf_to_char(value)
        return ""

    def _set_source_or_target(self, key, newvalue):
        """Store ``newvalue`` under ``key`` in Wordfast-escaped form

        The unit timestamp is refreshed only when the escaped text differs
        from what was stored; storing ``None`` never refreshes it.

        :param key: ``"source"`` or ``"target"``
        :param newvalue: unescaped text, or ``None`` to clear the field
        """
        if newvalue is None:
            self._dict[key] = None
            return
        newvalue = _char_to_wf(newvalue)
        if key not in self._dict or newvalue != self._dict[key]:
            self._dict[key] = newvalue
            self._update_timestamp()

    @property
    def source(self):
        """The unescaped source text (see :meth:`_get_source_or_target`)"""
        return self._get_source_or_target("source")

    @source.setter
    def source(self, source):
        self._rich_source = None
        self._set_source_or_target("source", source)

    @property
    def target(self):
        """The unescaped target text (see :meth:`_get_source_or_target`)"""
        return self._get_source_or_target("target")

    @target.setter
    def target(self, target):
        self._rich_target = None
        self._set_source_or_target("target", target)

    def settargetlang(self, newlang):
        """Store the target language of this unit (no ``%`` prefix)"""
        self._dict["target-lang"] = newlang

    # Write-only property.
    targetlang = property(None, settargetlang)

    def __str__(self):
        """Return the string form of the underlying field dictionary"""
        return str(self._dict)

    def istranslated(self):
        """Return True only if both the stored source and target are
        non-empty
        """
        if not self._dict.get("source", None):
            return False
        return bool(self._dict.get("target", None))


class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""

    Name = "Wordfast Translation Memory"
    Mimetypes = ["application/x-wordfast"]
    Extensions = ["txt"]
    UnitClass = WordfastUnit
    default_encoding = "iso-8859-1"

    def __init__(self, inputfile=None, **kwargs):
        """construct a Wordfast TM, optionally reading in from inputfile."""
        super().__init__(**kwargs)
        self.filename = ""
        self.header = WordfastHeader()
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """Parse the given file or file source string

        The first row is read as the header; every following row becomes a
        :class:`WordfastUnit` added to the store.

        :param input: a file-like object (it is read fully and then closed),
            the raw bytes of a file, or already-decoded text.  For bytes the
            encoding is detected (UTF-16 if the first line contains a UTF-16
            tab, otherwise ISO-8859-1) and recorded in ``self.encoding``;
            for text ``self.encoding`` is left untouched.
        :raises ValueError: if the bytes cannot be decoded with the detected
            encoding
        """
        if hasattr(input, "name"):
            self.filename = input.name
        elif not getattr(self, "filename", ""):
            self.filename = ""
        if hasattr(input, "read"):
            try:
                tmsrc = input.read()
            finally:
                # Close the file even if reading fails.
                input.close()
            input = tmsrc
        if not isinstance(input, str):
            if TAB_UTF16 in input.split(b"\n")[0]:
                self.encoding = "utf-16"
            else:
                self.encoding = "iso-8859-1"
            try:
                input = input.decode(self.encoding)
            except UnicodeError as err:
                raise ValueError(
                    "Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded"
                ) from err
        reader = csv.DictReader(
            input.split("\n"), fieldnames=WF_FIELDNAMES, dialect="wordfast"
        )
        for idx, line in enumerate(reader):
            if idx == 0:
                # The first row holds metadata: re-key its columns, by
                # position, with the header field names.
                header = dict(
                    zip(WF_FIELDNAMES_HEADER, (line[key] for key in WF_FIELDNAMES))
                )
                self.header = WordfastHeader(header)
                continue
            newunit = WordfastUnit()
            newunit.dict = line
            self.addunit(newunit)

    def serialize(self, out):
        """Write the header row and all translated units to ``out``

        Nothing is written when the store holds no translated unit.  The
        header's unit count is set to the number of translated units before
        writing.

        :param out: object with a ``write`` method accepting bytes; it
            receives the whole file encoded with ``self.encoding``
        """
        # Function-scope import: io.StringIO is the documented home of the
        # class; ``csv.StringIO`` only exists as an implementation detail.
        import io

        # Check first if there is at least one translated unit
        translated_units = [u for u in self.units if u.istranslated()]
        if not translated_units:
            return

        output = io.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="wordfast")
        # No real headers, the first line contains metadata
        self.header.tucount = len(translated_units)
        header_values = (self.header.header[key] for key in WF_FIELDNAMES_HEADER)
        writer.writerow(dict(zip(WF_FIELDNAMES, header_values)))

        for unit in translated_units:
            writer.writerow(unit.dict)
        out.write(output.getvalue().encode(self.encoding))