#
# Copyright 2010 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

r"""Manage the Trados .txt Translation Memory format

A Trados file looks like this:

.. code-block:: xml

    <TrU>
    <CrD>18012000, 13:18:35
    <CrU>CAROL-ANN
    <UsC>0
    <Seg L=EN_GB>Association for Road Safety \endash Conference
    <Seg L=DE_DE>Tagung der Gesellschaft für Verkehrssicherheit
    </TrU>
    <TrU>
    <CrD>18012000, 13:19:14
    <CrU>CAROL-ANN
    <UsC>0
    <Seg L=EN_GB>Road Safety Education in our Schools
    <Seg L=DE_DE>Verkehrserziehung an Schulen
    </TrU>

"""

import re
import time

from translate.storage import base


try:
    # FIXME see if we can't use lxml
    from bs4 import BeautifulSoup
except ImportError:
    raise ImportError(
        "BeautifulSoup 4 is not installed. Support for Trados txt is disabled."
    )


__all__ = (
    "TRADOS_TIMEFORMAT",
    "RTF_ESCAPES",
    "escape",
    "unescape",
    "TradosTxtDate",
    "TradosUnit",
    "TradosTxtTmFile",
)


TRADOS_TIMEFORMAT = "%d%m%Y, %H:%M:%S"
"""Time format used by Trados .txt"""

RTF_ESCAPES = {
    "\\emdash": "—",
    "\\endash": "–",
    # Nonbreaking space equal to width of character "m" in current font.
    "\\emspace": "\u2003",
    # Nonbreaking space equal to width of character "n" in current font.
    "\\enspace": "\u2002",
    # "\\qmspace": "",    # One-quarter em space.
    "\\bullet": "•",  # Bullet character.
    "\\lquote": "‘",  # Left single quotation mark. \u2018
    "\\rquote": "’",  # Right single quotation mark. \u2019
    "\\ldblquote": "“",  # Left double quotation mark. \u201C
    "\\rdblquote": "”",  # Right double quotation mark. \u201D
    "\\~": "\u00a0",  # Nonbreaking space
    "\\-": "\u00ad",  # Optional hyphen.
    "\\_": "‑",  # Nonbreaking hyphen \U2011
    # A hexadecimal value, based on the specified character set (may be used to
    # identify 8-bit values).
    # "\\'hh": "",
}
"""RTF control to Unicode map. See
http://msdn.microsoft.com/en-us/library/aa140283(v=office.10).aspx
"""


def unescape(text):
    """Convert Trados text to a normal Unicode string.

    Every RTF control sequence listed in :data:`RTF_ESCAPES` is replaced by
    its Unicode character; all other text is left untouched.

    :param text: Text as stored in a Trados .txt file
    :return: The text with the known RTF escapes expanded
    """
    for trados_escape, char in RTF_ESCAPES.items():
        text = text.replace(trados_escape, char)
    return text


def escape(text):
    """Convert a Unicode string to Trados escapes.

    This is the reverse mapping of :func:`unescape`: every character that
    appears as a value in :data:`RTF_ESCAPES` is replaced by its RTF control
    sequence.

    :param text: Plain Unicode text
    :return: The text with the mapped characters written as RTF escapes
    """
    for trados_escape, char in RTF_ESCAPES.items():
        text = text.replace(char, trados_escape)
    return text


class TradosTxtDate:
    """Manages the timestamps in the Trados .txt format of DDMMYYYY, hh:mm:ss"""

    def __init__(self, newtime=None):
        """Create a timestamp, optionally initialised from ``newtime``.

        :param newtime: Either a Trados time string or a
            :class:`time.struct_time`; any other value (including ``None``)
            leaves the timestamp unset.
        """
        self._time = None
        if newtime:
            if isinstance(newtime, str):
                self.timestring = newtime
            elif isinstance(newtime, time.struct_time):
                self.time = newtime

    def get_timestring(self):
        """Get the time in the Trados time format

        :return: The formatted time string, or ``None`` when no time is set
        """
        if not self._time:
            return None
        return time.strftime(TRADOS_TIMEFORMAT, self._time)

    def set_timestring(self, timestring):
        """Set the struct_time object using a Trados time formatted string

        :param timestring: A Trados time string (DDMMYYYY, hh:mm:ss)
        :type timestring: String
        """
        # time.strptime raises ValueError when the string does not match
        # TRADOS_TIMEFORMAT; that error is deliberately left to the caller.
        self._time = time.strptime(timestring, TRADOS_TIMEFORMAT)

    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Get the struct_time object"""
        return self._time

    def set_time(self, newtime):
        """Set the struct_time object

        Anything that is not a :class:`time.struct_time` resets the stored
        time to ``None``.

        :param newtime: a new time object
        :type newtime: time.struct_time
        """
        if newtime and isinstance(newtime, time.struct_time):
            self._time = newtime
        else:
            self._time = None

    time = property(get_time, set_time)

    def __str__(self):
        """Return the Trados time string, or an empty string when unset."""
        return self.timestring or ""


class TradosUnit(base.TranslationUnit):
    """A translation unit backed by the parsed markup of one ``<TrU>`` entry.

    ``source`` and ``target`` are read from the first and second ``seg``
    elements of ``self._soup`` respectively. ``_soup`` starts out as ``None``
    and is assigned by :meth:`TradosTxtTmFile.parse`; reading ``source`` or
    ``target`` before that raises :class:`AttributeError`.
    """

    def __init__(self, source=None):
        # _soup must exist before the base constructor runs.
        self._soup = None
        super().__init__(source)

    @property
    def source(self):
        """The unescaped first child of the first ``seg`` element."""
        return unescape(self._soup.find_all("seg")[0].contents[0])

    @source.setter
    def source(self, source):
        # Assignments are silently ignored: the unit is read-only.
        # NOTE(review): presumably needed because the base-class constructor
        # assigns ``self.source`` -- confirm against base.TranslationUnit.
        pass

    def gettarget(self):
        """Return the unescaped first child of the second ``seg`` element."""
        return unescape(self._soup.find_all("seg")[1].contents[0])

    # No setter is provided: the target is read-only.
    target = property(gettarget, None)


class TradosSoup(BeautifulSoup):
    """BeautifulSoup subclass carrying a Trados-specific markup fix-up rule.

    The single ``MARKUP_MASSAGE`` rule rewrites a line of the form
    ``<tag attrs>content\\r`` into ``<tag attrs>content</tag>``, i.e. it adds
    the closing tag that Trados leaves out.
    """

    # NOTE(review): MARKUP_MASSAGE was a BeautifulSoup 3 hook; confirm that
    # the installed bs4 version still applies it.
    MARKUP_MASSAGE = [
        (
            re.compile("<(?P<fulltag>(?P<tag>[^\\s\\/]+).*?)>(?P<content>.+)\r"),
            lambda x: "<%(fulltag)s>%(content)s</%(tag)s>" % x.groupdict(),
        ),
    ]


class TradosTxtTmFile(base.TranslationStore):
    """A Trados translation memory file"""

    Name = "Trados Translation Memory"
    Mimetypes = ["application/x-trados-tm"]
    Extensions = ["txt"]
    UnitClass = TradosUnit
    default_encoding = "iso-8859-1"

    def __init__(self, inputfile=None, **kwargs):
        """Construct a Trados TM, optionally reading in from inputfile.

        :param inputfile: Optional file-like object or markup to parse
            immediately (see :meth:`parse`)
        :param kwargs: Passed through unchanged to the base store constructor
        """
        super().__init__(**kwargs)
        self.filename = ""
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """Parse Trados markup and add one unit per ``<TrU>`` entry.

        :param input: Either the markup itself or an object with a ``read()``
            method. A readable object is read completely and then closed. If
            it has a ``name`` attribute, that becomes ``self.filename``.
        """
        if hasattr(input, "name"):
            self.filename = input.name
        elif not getattr(self, "filename", ""):
            self.filename = ""
        if hasattr(input, "read"):
            # Close the stream even when read() fails so that the handle is
            # never leaked.
            try:
                tmsrc = input.read()
            finally:
                input.close()
            input = tmsrc
        self._soup = TradosSoup(input)
        # Tag names are matched in lower case (see the FIXME in serialize).
        for tu in self._soup.find_all("tru"):
            unit = TradosUnit()
            # Each unit gets its own soup built from the entry's markup.
            unit._soup = TradosSoup(str(tu))
            self.addunit(unit)

    def serialize(self, out):
        """Write the prettified parsed document to ``out``.

        :param out: Object with a ``write()`` method

        Requires a prior call to :meth:`parse`; ``self._soup`` is not set
        otherwise.
        """
        # FIXME turn the lowercased tags back into mixed case
        out.write(self._soup.prettify())