1#
2# Copyright 2010 Zuza Software Foundation
3#
4# This file is part of the Translate Toolkit.
5#
6# This program is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation; either version 2 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, see <http://www.gnu.org/licenses/>.
18
19r"""Manage the Trados .txt Translation Memory format
20
21A Trados file looks like this:
22
23.. code-block:: xml
24
25    <TrU>
26    <CrD>18012000, 13:18:35
27    <CrU>CAROL-ANN
28    <UsC>0
29    <Seg L=EN_GB>Association for Road Safety \endash  Conference
30    <Seg L=DE_DE>Tagung der Gesellschaft für Verkehrssicherheit
31    </TrU>
32    <TrU>
33    <CrD>18012000, 13:19:14
34    <CrU>CAROL-ANN
35    <UsC>0
36    <Seg L=EN_GB>Road Safety Education in our Schools
37    <Seg L=DE_DE>Verkehrserziehung an Schulen
38    </TrU>
39
40"""
41
42import re
43import time
44
45from translate.storage import base
46
47
48try:
49    # FIXME see if we can't use lxml
50    from bs4 import BeautifulSoup
51except ImportError:
52    raise ImportError(
53        "BeautifulSoup 4 is not installed. Support for Trados txt is disabled."
54    )
55
56
# Names exported by ``from <this module> import *``. TradosSoup is not listed.
__all__ = (
    "TRADOS_TIMEFORMAT",
    "RTF_ESCAPES",
    "escape",
    "unescape",
    "TradosTxtDate",
    "TradosUnit",
    "TradosTxtTmFile",
)


# strftime/strptime pattern: day, month and four-digit year run together,
# followed by a comma, a space and hh:mm:ss (e.g. "18012000, 13:18:35").
TRADOS_TIMEFORMAT = "%d%m%Y, %H:%M:%S"
"""Time format used by Trados .txt"""

# Keys are the escape sequences exactly as they occur in Trados text (one
# literal backslash followed by the control word or symbol); values are the
# Unicode characters they stand for. unescape() applies the map key -> value
# and escape() applies it value -> key.
RTF_ESCAPES = {
    "\\emdash": "—",  # Em dash (U+2014).
    "\\endash": "–",  # En dash (U+2013).
    # Nonbreaking space equal to width of character "m" in current font.
    "\\emspace": "\u2003",
    # Nonbreaking space equal to width of character "n" in current font.
    "\\enspace": "\u2002",
    # Not mapped (entry left disabled):
    # "\\qmspace": "",    # One-quarter em space.
    "\\bullet": "•",  # Bullet character (U+2022).
    "\\lquote": "‘",  # Left single quotation mark (U+2018).
    "\\rquote": "’",  # Right single quotation mark (U+2019).
    "\\ldblquote": "“",  # Left double quotation mark (U+201C).
    "\\rdblquote": "”",  # Right double quotation mark (U+201D).
    "\\~": "\u00a0",  # Nonbreaking space (U+00A0).
    "\\-": "\u00ad",  # Optional (soft) hyphen (U+00AD).
    "\\_": "‑",  # Nonbreaking hyphen (U+2011), not an ASCII "-".
    # A hexadecimal value, based on the specified character set (may be used to
    # identify 8-bit values). Not mapped (entry left disabled):
    # "\\'hh": "",
}
"""RTF control to Unicode map. See
http://msdn.microsoft.com/en-us/library/aa140283(v=office.10).aspx
"""
94
95
def unescape(text):
    """Expand the Trados escape sequences in *text* to Unicode characters.

    Every key of ``RTF_ESCAPES`` that occurs in the text is replaced by the
    character it maps to; all other text is left as it is.

    :param text: Text as stored in a Trados segment
    :type text: String
    :return: The text with the known escape sequences expanded
    """
    unescaped = text
    for sequence in RTF_ESCAPES:
        unescaped = unescaped.replace(sequence, RTF_ESCAPES[sequence])
    return unescaped
101
102
def escape(text):
    """Convert special Unicode characters in *text* to Trados escapes.

    This is the reverse of :func:`unescape`: every character that appears as
    a value in ``RTF_ESCAPES`` is replaced by its escape sequence.

    :param text: Plain Unicode text
    :type text: String
    :return: The text with the mapped characters written as escape sequences
    """
    escaped = text
    for sequence, character in RTF_ESCAPES.items():
        escaped = escaped.replace(character, sequence)
    return escaped
108
109
class TradosTxtDate:
    """Manages the timestamps in the Trados .txt format of DDMMYYYY, hh:mm:ss

    The value is stored as a ``time.struct_time`` (``None`` while unset) and
    is reachable through two properties: ``time`` for the struct itself and
    ``timestring`` for its Trados text representation.
    """

    def __init__(self, newtime=None):
        """Create the date, optionally initialised from *newtime*.

        :param newtime: A Trados time string or a ``time.struct_time``. Falsy
            values and values of any other type leave the date unset.
        """
        self._time = None
        if not newtime:
            return
        if isinstance(newtime, str):
            # Assigning to the property parses the string.
            self.timestring = newtime
        elif isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Get the time in the Trados time format

        :return: The formatted time, or ``None`` when no time is set
        """
        if self._time:
            return time.strftime(TRADOS_TIMEFORMAT, self._time)
        return None

    def set_timestring(self, timestring):
        """Set the struct_time object using a Trados time formatted string

        :param timestring: A Trados time string (DDMMYYYY, hh:mm:ss)
        :type timestring: String
        :raises ValueError: if ``time.strptime`` cannot parse the string
        """
        parsed = time.strptime(timestring, TRADOS_TIMEFORMAT)
        self._time = parsed

    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Get the struct_time object (``None`` when unset)"""
        return self._time

    def set_time(self, newtime):
        """Set the struct_time object

        Anything that is not a ``time.struct_time`` resets the date to unset.

        :param newtime: a new time object
        :type newtime: time.struct_time
        """
        accepted = newtime and isinstance(newtime, time.struct_time)
        self._time = newtime if accepted else None

    time = property(get_time, set_time)

    def __str__(self):
        # An unset date renders as the empty string rather than "None".
        return self.timestring or ""
160
161
class TradosUnit(base.TranslationUnit):
    """A translation unit whose text is read from parsed Trados markup.

    ``_soup`` holds the parsed markup of the unit. ``source`` is taken from
    the first ``seg`` element found in it and ``target`` from the second;
    both are passed through :func:`unescape`. Neither value can be changed
    through this class. Reading either one requires ``_soup`` to be set.
    """

    def __init__(self, source=None):
        # Define the attribute before the base constructor runs.
        self._soup = None
        super().__init__(source)

    def _segment_text(self, index):
        """Return the unescaped first child of the ``seg`` element at *index*."""
        segments = self._soup.findAll("seg")
        first_child = segments[index].contents[0]
        return unescape(first_child)

    @property
    def source(self):
        """Unescaped content of the first ``seg`` element."""
        return self._segment_text(0)

    @source.setter
    def source(self, source):
        """Accept and discard the assigned value.

        NOTE(review): presumably present so that the base-class constructor
        can assign ``source`` without raising -- confirm against
        ``base.TranslationUnit``.
        """

    def gettarget(self):
        """Return the unescaped content of the second ``seg`` element."""
        return self._segment_text(1)

    # Read-only: no setter is supplied.
    target = property(gettarget)
179
180
def _close_trados_tag(match):
    """Build ``<fulltag>content</tag>`` from the named groups of *match*."""
    return "<%(fulltag)s>%(content)s</%(tag)s>" % match.groupdict()


class TradosSoup(BeautifulSoup):
    """BeautifulSoup subclass that declares a fix-up rule for Trados lines.

    ``MARKUP_MASSAGE`` holds (pattern, replacement) pairs. The single rule
    matches an opening tag followed by content that ends in a carriage
    return, and its replacement emits the same tag and content followed by a
    matching closing tag.

    NOTE(review): ``MARKUP_MASSAGE`` looks like a BeautifulSoup 3 convention;
    confirm that the bs4 base class still consults it.
    """

    MARKUP_MASSAGE = [
        (
            re.compile("<(?P<fulltag>(?P<tag>[^\\s\\/]+).*?)>(?P<content>.+)\r"),
            _close_trados_tag,
        ),
    ]
189
190
class TradosTxtTmFile(base.TranslationStore):
    """A Trados translation memory file

    The whole input is parsed with :class:`TradosSoup`; each ``tru`` element
    found becomes one :class:`TradosUnit` in the store.
    """

    # Store metadata.
    Name = "Trados Translation Memory"
    Mimetypes = ["application/x-trados-tm"]
    Extensions = ["txt"]
    UnitClass = TradosUnit
    default_encoding = "iso-8859-1"

    def __init__(self, inputfile=None, **kwargs):
        """Construct a Trados TM, optionally reading in from inputfile.

        :param inputfile: Passed to :meth:`parse` unless it is ``None``
        :param kwargs: Forwarded unchanged to the base-class constructor
        """
        super().__init__(**kwargs)
        self.filename = ""
        if inputfile is None:
            return
        self.parse(inputfile)

    def parse(self, input):
        """Parse Trados markup and add one unit per ``tru`` element.

        :param input: The markup itself, or an object with a ``read`` method.
            Such an object is read in full and then closed. If the object
            has a ``name`` attribute it is recorded as ``self.filename``.
        """
        if hasattr(input, "name"):
            self.filename = input.name
        elif not getattr(self, "filename", ""):
            # Make sure the attribute exists, without overwriting a filename
            # that was set earlier.
            self.filename = ""

        markup = input
        if hasattr(input, "read"):
            markup = input.read()
            input.close()

        self._soup = TradosSoup(markup)
        for tru_tag in self._soup.findAll("tru"):
            unit = TradosUnit()
            # Each unit gets its own parse of just its element.
            unit._soup = TradosSoup(str(tru_tag))
            self.addunit(unit)

    def serialize(self, out):
        """Write the prettified parsed markup to *out*.

        Requires :meth:`parse` to have run, since it reads ``self._soup``.

        NOTE(review): ``prettify()`` is called without an encoding, which
        presumably yields text rather than bytes -- confirm that matches the
        kind of stream callers pass in.

        :param out: Object with a ``write`` method
        """
        # FIXME turn the lowercased tags back into mixed case
        pretty_markup = self._soup.prettify()
        out.write(pretty_markup)
225