1#
2# Copyright 2007-2010 Zuza Software Foundation
3#
4# This file is part of the Translate Toolkit.
5#
6# This program is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation; either version 2 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, see <http://www.gnu.org/licenses/>.
18
19"""Manage the Wordfast Translation Memory format
20
21Wordfast TM format is the Translation Memory format used by the
22`Wordfast <http://www.wordfast.net/>`_ computer aided translation tool.
23
24It is a bilingual base class derived format with :class:`WordfastTMFile`
25and :class:`WordfastUnit` providing file and unit level access.
26
27Wordfast is a computer aided translation tool.  It is an application
28built on top of Microsoft Word and is implemented as a rather
29sophisticated set of macros.  Understanding that helps us understand
30many of the seemingly strange choices around this format including:
31encoding, escaping and file naming.
32
33Implementation
34    The implementation covers the full requirements of a Wordfast TM file.
35    The files are simple Tab Separated Value (TSV) files that can be read
36    by Microsoft Excel and other spreadsheet programs.  They use the .txt
37    extension which does make it more difficult to automatically identify
38    such files.
39
40    The dialect of the TSV files is specified by :class:`WordfastDialect`.
41
42Encoding
43    The files are UTF-16 or ISO-8859-1 (Latin1) encoded.  These choices
44    are most likely because Microsoft Word is the base editing tool for
45    Wordfast.
46
    The format is tab separated so we are able to detect UTF-16 vs Latin-1
    by searching for the occurrence of a UTF-16 tab character and then
    continuing with the parsing.
50
51Timestamps
52    :class:`WordfastTime` allows for the correct management of the Wordfast
53    YYYYMMDD~HHMMSS timestamps.  However, timestamps on individual units are
54    not updated when edited.
55
56Header
57    :class:`WordfastHeader` provides header management support.  The header
58    functionality is fully implemented through observing the behaviour of the
59    files in real use cases, input from the Wordfast programmers and
60    public documentation.
61
62Escaping
63    Wordfast TM implements a form of escaping that covers two aspects:
64
    1. Placeable: bold, formatting, etc.  These are left as is and ignored.  It
       is up to the editor and future placeable implementation to manage these.
67
    2. Escapes: items that may confuse Excel or translators are escaped as
       ``&'XX;``. These are fully implemented and are converted to and from
       Unicode.  By observing behaviour and reading documentation we were able
       to observe all possible escapes. Unfortunately the escaping differs
       slightly between the Windows and Mac versions.  This might cause errors
       in future.  Functions allow for ``<_wf_to_char>`` and back to Wordfast
       escape (``<_char_to_wf>``).
75
76Extended Attributes
    The last 4 columns allow users to define and manage extended attributes.
    These are left as is and are not directly managed by our implementation.
79"""
80
import csv
import io
import time

from translate.storage import base
85
86
# strftime()/strptime() pattern for Wordfast time stamps: four digit year,
# month, day, a literal "~", then hours, minutes and seconds.
WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
"""Time format used by Wordfast"""

# Names given to the columns of the first line of a file.  The list has the
# same length as WF_FIELDNAMES; the two lists are paired position by position
# (with zip) when the header line is read and written.
WF_FIELDNAMES_HEADER = [
    "date",
    "userlist",
    "tucount",
    "src-lang",
    "version",
    "target-lang",
    "license",
    "attr1list",
    "attr2list",
    "attr3list",
    "attr4list",
    "attr5list",
]
"""Field names for the Wordfast header"""

# Column names, in file order, for every line after the header.  These are
# the keys of the per-unit dictionaries produced by csv.DictReader.
WF_FIELDNAMES = [
    "date",
    "user",
    "reuse",
    "src-lang",
    "source",
    "target-lang",
    "target",
    "attr1",
    "attr2",
    "attr3",
    "attr4",
    "attr5",
]
"""Field names for a Wordfast TU"""

# Template for a freshly created header.  Every non-empty value starts with
# "%", the same prefix that the WordfastHeader setters write.  The "date"
# entry is a placeholder that _create_default_header() overwrites.
WF_FIELDNAMES_HEADER_DEFAULTS = {
    "date": "%19000101~121212",
    "userlist": "%User ID,TT,TT Translate-Toolkit",
    "tucount": "%TU=00000001",
    "src-lang": "%EN-US",
    "version": "%Wordfast TM v.5.51w9/00",
    "target-lang": "",
    "license": "%---00000001",
    "attr1list": "",
    "attr2list": "",
    "attr3list": "",
    "attr4list": "",
    "attr5list": "",
}
"""Default or minimum header entries for a Wordfast file"""
137
# TODO Needs validation.  The following need to be checked against a WF TM file
# to ensure that the correct Unicode values have been chosen for the characters.
# For now these look correct and have been taken from Windows CP1252 and
# Macintosh code points found for the respective character sets on Linux.
#
# Each entry is a (Wordfast escape code, Unicode character) pair.  The order
# matters: _char_to_wf() applies the pairs from top to bottom, so a character
# that is listed more than once is always escaped with its first (common or
# Windows) code.  The later Mac code for such a character is only ever
# consumed when unescaping.  The characters « » and „ appear only in the Mac
# section, so they are escaped with their Mac codes.
WF_ESCAPE_MAP = (
    ("&'26;", "\u0026"),  # & - Ampersand (must be first so that literal
    #     ampersands are escaped before the other codes, which themselves
    #     start with "&", are inserted)
    ("&'82;", "\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'85;", "\u2026"),  # … - Ellipsis
    ("&'91;", "\u2018"),  # ‘ - left single quotation mark
    ("&'92;", "\u2019"),  # ’ - right single quotation mark
    ("&'93;", "\u201C"),  # “ - left double quotation mark
    ("&'94;", "\u201D"),  # ” - right double quotation mark
    ("&'96;", "\u2013"),  # – - en dash (validate)
    ("&'97;", "\u2014"),  # — - em dash (validate)
    ("&'99;", "\u2122"),  # ™ - Trade mark
    # Windows only
    ("&'A0;", "\u00A0"),  #   - Non breaking space
    ("&'A9;", "\u00A9"),  # © - Copyright
    ("&'AE;", "\u00AE"),  # ® - Registered
    ("&'BC;", "\u00BC"),  # ¼ - One quarter
    ("&'BD;", "\u00BD"),  # ½ - One half
    ("&'BE;", "\u00BE"),  # ¾ - Three quarters
    # Mac only
    ("&'A8;", "\u00AE"),  # ® - Registered
    ("&'AA;", "\u2122"),  # ™ - Trade mark
    ("&'C7;", "\u00AB"),  # « - Left-pointing double angle quotation mark
    ("&'C8;", "\u00BB"),  # » - Right-pointing double angle quotation mark
    ("&'C9;", "\u2026"),  # … - Horizontal ellipsis
    ("&'CA;", "\u00A0"),  #   - Non breaking space
    ("&'D0;", "\u2013"),  # – - en dash (validate)
    ("&'D1;", "\u2014"),  # — - em dash (validate)
    ("&'D2;", "\u201C"),  # “ - left double quotation mark
    ("&'D3;", "\u201D"),  # ” - right double quotation mark
    ("&'D4;", "\u2018"),  # ‘ - left single quotation mark
    ("&'D5;", "\u2019"),  # ’ - right single quotation mark
    ("&'E2;", "\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'E3;", "\u201E"),  # „ - Double low-9 quotation mark
    # Other markers
    # Soft-break - XXX creates a problem with roundtripping could
    # also be represented by \u2028
    # ("&'B;", "\n"),
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""
182
# Byte pair searched for in the first line of raw input to decide between
# UTF-16 and ISO-8859-1.  0x00 0x09 is TAB in big-endian UTF-16.
# NOTE(review): in little-endian UTF-16 the same pair presumably still occurs
# wherever a TAB follows a character whose high byte is 0x00 - confirm
# against real Wordfast files.
TAB_UTF16 = b"\x00\x09"
"""The tab \\t character as it would appear in UTF-16 encoding"""
185
186
187def _char_to_wf(string):
188    """Char -> Wordfast &'XX; escapes
189
190    Full roundtripping is not possible because of the escaping of
191    NEWLINE \\n and TAB \\t
192    """
193    # FIXME there is no platform check to ensure that we use Mac encodings
194    # when running on a Mac
195    if string:
196        for code, char in WF_ESCAPE_MAP:
197            string = string.replace(char, code)
198        string = string.replace("\n", "\\n").replace("\t", "\\t")
199    return string
200
201
def _wf_to_char(string):
    """Convert Wordfast ``&'XX;`` escapes back to Unicode characters.

    This is the inverse of :func:`_char_to_wf`.  The escape table is applied
    in *reverse* order so that the ampersand code ``&'26;`` is decoded last.
    Decoding it first would turn the escaped form of the literal text
    ``&'85;`` (stored as ``&'26;'85;``) into ``&'85;`` and then, wrongly,
    into an ellipsis.  Decoding it last leaves every other code untouched
    because none of the replacement characters can form part of a code.

    After the table has been applied, the two-character sequences ``\\n``
    and ``\\t`` are turned back into real NEWLINE and TAB characters.

    :param string: the escaped text; falsy values (``None``, ``""``) are
        returned unchanged
    :return: the unescaped text
    """
    if not string:
        return string
    for code, char in reversed(WF_ESCAPE_MAP):
        string = string.replace(code, char)
    return string.replace("\\n", "\n").replace("\\t", "\t")
209
210
class WordfastDialect(csv.Dialect):
    """CSV dialect of the TAB-delimited files that Wordfast generates."""

    # Quote characters have no special meaning; fields are taken verbatim.
    quoting = csv.QUOTE_NONE
    # Rows are terminated with CR LF when written.
    lineterminator = "\r\n"
    # Columns are separated by a single TAB character.
    delimiter = "\t"


# Make the dialect available by name to csv readers and writers.
csv.register_dialect("wordfast", WordfastDialect)
220
221
class WordfastTime:
    """Holds a ``time.struct_time`` and converts it to and from the Wordfast
    time stamp format YYYYMMDD~hhmmss.
    """

    def __init__(self, newtime=None):
        """Create a time stamp.

        :param newtime: a Wordfast time string, a ``time.struct_time`` or a
            falsy value for "no time".  Any other value is ignored and
            leaves the time unset.
        """
        self._time = None
        if not newtime:
            self.time = None
            return
        if isinstance(newtime, str):
            self.timestring = newtime
            return
        if isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Return the time formatted as a Wordfast time string, or ``None``
        when no time is set.
        """
        if self._time:
            return time.strftime(WF_TIMEFORMAT, self._time)
        return None

    def set_timestring(self, timestring):
        """Set the struct_time object from a Wordfast formatted time string

        :param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        :type timestring: String
        """
        parsed = time.strptime(timestring, WF_TIMEFORMAT)
        self._time = parsed

    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Return the stored struct_time object (``None`` when unset)"""
        return self._time

    def set_time(self, newtime):
        """Store a struct_time object

        Anything that is not a ``time.struct_time`` clears the stored time.

        :param newtime: a new time object
        :type newtime: time.struct_time
        """
        is_valid = newtime and isinstance(newtime, time.struct_time)
        self._time = newtime if is_valid else None

    time = property(get_time, set_time)

    def __str__(self):
        """Return the Wordfast time string, or "" when no time is set."""
        return self.timestring or ""
273
274
class WordfastHeader:
    """A wordfast translation memory header

    The header is kept as a dictionary whose keys are the header column
    names (``"date"``, ``"tucount"``, ``"target-lang"``, ...).
    """

    def __init__(self, header=None):
        """Create a header.

        :param header: a dictionary of header values.  A falsy value creates
            a default header stamped with the current time.  A truthy value
            that is not a dictionary is ignored and leaves the header empty.
        """
        # Start from an empty *dict* (not a list) so that the setters below
        # keep working even when an unusable ``header`` argument was given.
        self._header_dict = {}
        if not header:
            self.header = self._create_default_header()
        elif isinstance(header, dict):
            self.header = header

    def _create_default_header(self):
        """Create a default Wordfast header with the date set to the current
        time
        """
        defaultheader = dict(WF_FIELDNAMES_HEADER_DEFAULTS)
        defaultheader["date"] = "%%%s" % WordfastTime(time.localtime()).timestring
        return defaultheader

    def getheader(self):
        """Get the header dictionary"""
        return self._header_dict

    def setheader(self, newheader):
        """Replace the header dictionary (stored by reference, not copied)"""
        self._header_dict = newheader

    header = property(getheader, setheader)

    def settargetlang(self, newlang):
        """Store the target language, prefixed with ``%``"""
        self._header_dict["target-lang"] = "%%%s" % newlang

    # Write-only property.
    targetlang = property(None, settargetlang)

    def settucount(self, count):
        """Store the translation unit count as ``%TU=`` followed by the
        number zero-padded to eight digits
        """
        self._header_dict["tucount"] = "%%TU=%08d" % count

    # Write-only property.
    tucount = property(None, settucount)
312
313
class WordfastUnit(base.TranslationUnit):
    """A single translation unit (one line) of a Wordfast translation memory

    The unit is backed by a dictionary keyed on the Wordfast column names.
    ``source`` and ``target`` are stored in escaped Wordfast form and are
    unescaped when read.
    """

    def __init__(self, source=None):
        """Create a unit, optionally with the given source text."""
        self._dict = {}
        if source:
            self.source = source
        # NOTE(review): the base initialiser presumably assigns ``source``
        # again; the call order is kept exactly as it was.
        super().__init__(source)

    def _update_timestamp(self):
        """Set the unit's "date" column to the current local time"""
        now = WordfastTime(time.localtime())
        self._dict["date"] = now.timestring

    def getdict(self):
        """Return the dictionary of column values for this Wordfast line"""
        return self._dict

    def setdict(self, newdict):
        """Replace the dictionary of column values for this Wordfast line

        :param newdict: a new dictionary with Wordfast line elements
        :type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict

    dict = property(getdict, setdict)

    def _get_source_or_target(self, key):
        """Return the unescaped text stored under ``key``

        ``None`` is returned when the column is missing or ``None``, and ""
        for any other falsy value.
        """
        stored = self._dict.get(key, None)
        if stored is None:
            return None
        return _wf_to_char(stored) if stored else ""

    def _set_source_or_target(self, key, newvalue):
        """Store ``newvalue`` under ``key`` in escaped form

        The timestamp is refreshed only when the stored value changes.
        """
        if newvalue is None:
            # Deliberately no return: the comparison below then finds the
            # value unchanged, so the timestamp is not refreshed.
            self._dict[key] = None
        escaped = _char_to_wf(newvalue)
        already_stored = key in self._dict and not (escaped != self._dict[key])
        if already_stored:
            return
        self._dict[key] = escaped
        self._update_timestamp()

    @property
    def source(self):
        """The unescaped source text"""
        return self._get_source_or_target("source")

    @source.setter
    def source(self, newsource):
        self._rich_source = None
        self._set_source_or_target("source", newsource)

    @property
    def target(self):
        """The unescaped target text"""
        return self._get_source_or_target("target")

    @target.setter
    def target(self, newtarget):
        self._rich_target = None
        self._set_source_or_target("target", newtarget)

    def settargetlang(self, newlang):
        """Store the target language code for this unit"""
        self._dict["target-lang"] = newlang

    # Write-only property.
    targetlang = property(fset=settargetlang)

    def __str__(self):
        """Return the string form of the underlying dictionary"""
        return str(self._dict)

    def istranslated(self):
        """Return True only when both source and target are non-empty"""
        has_source = bool(self._dict.get("source", None))
        has_target = bool(self._dict.get("target", None))
        return has_source and has_target
388
389
class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""

    Name = "Wordfast Translation Memory"
    Mimetypes = ["application/x-wordfast"]
    Extensions = ["txt"]
    UnitClass = WordfastUnit
    default_encoding = "iso-8859-1"

    def __init__(self, inputfile=None, **kwargs):
        """Construct a Wordfast TM, optionally reading in from inputfile."""
        super().__init__(**kwargs)
        self.filename = ""
        self.header = WordfastHeader()
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """Parse the given file object or raw file content.

        :param input: an object with a ``read()`` method returning bytes (it
            is closed after reading), or the bytes of a Wordfast TM file
        :raises ValueError: when the content cannot be decoded with the
            detected encoding
        """
        if hasattr(input, "name"):
            self.filename = input.name
        elif not getattr(self, "filename", ""):
            self.filename = ""
        if hasattr(input, "read"):
            tmsrc = input.read()
            input.close()
            input = tmsrc
        # A UTF-16 encoded TAB in the first line identifies a UTF-16 file;
        # anything else is treated as ISO-8859-1.
        first_line = input.split(b"\n", 1)[0]
        self.encoding = "utf-16" if TAB_UTF16 in first_line else "iso-8859-1"
        try:
            text = input.decode(self.encoding)
        except UnicodeError as err:
            # Keep the original decoding error attached as the cause.
            raise ValueError(
                "Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded"
            ) from err
        reader = csv.DictReader(
            text.split("\n"), fieldnames=WF_FIELDNAMES, dialect="wordfast"
        )
        # The first line is not a unit: it carries the header metadata in
        # the same column positions as the unit fields.
        header_row = next(reader, None)
        if header_row is None:
            return
        self.header = WordfastHeader(
            dict(zip(WF_FIELDNAMES_HEADER, [header_row[key] for key in WF_FIELDNAMES]))
        )
        for row in reader:
            newunit = WordfastUnit()
            newunit.dict = row
            self.addunit(newunit)

    def serialize(self, out):
        """Write the header and all translated units to ``out``.

        Nothing at all is written when the store holds no translated unit.

        :param out: an object with a ``write()`` method accepting bytes
        """
        # Check first if there is at least one translated unit
        translated_units = [u for u in self.units if u.istranslated()]
        if not translated_units:
            return

        # io.StringIO is the documented class; csv.StringIO only exists
        # because the csv module happens to import it for its own use.
        output = io.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="wordfast")
        # No real headers, the first line contains metadata
        self.header.tucount = len(translated_units)
        header_values = [self.header.header[key] for key in WF_FIELDNAMES_HEADER]
        writer.writerow(dict(zip(WF_FIELDNAMES, header_values)))

        for unit in translated_units:
            writer.writerow(unit.dict)
        out.write(output.getvalue().encode(self.encoding))
463