#!/usr/local/bin/python3.8
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai


__license__ = 'GPL v3'
__copyright__ = '2010, Gerendi Sandor Attila'
__docformat__ = 'restructuredtext en'

"""
RTF tokenizer and token parser. v.1.0 (1/17/2010)
Author: Gerendi Sandor Attila

At this point this will tokenize a RTF file then rebuild it from the tokens.
In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compliant.
"""


class tokenDelimitatorStart():
    """Opening RTF group delimiter: '{'."""

    def __init__(self):
        pass

    def toRTF(self):
        return '{'

    def __repr__(self):
        return '{'


class tokenDelimitatorEnd():
    """Closing RTF group delimiter: '}'."""

    def __init__(self):
        pass

    def toRTF(self):
        return '}'

    def __repr__(self):
        return '}'


class tokenControlWord():
    """A control word without a numeric argument, e.g. '\\ansi'.

    `separator` is the single space that delimited the word in the source
    ('' when the word was delimited by a non-alphanumeric character).
    """

    def __init__(self, name, separator=''):
        self.name = name
        self.separator = separator

    def toRTF(self):
        return self.name + self.separator

    def __repr__(self):
        return self.name + self.separator


class tokenControlWordWithNumericArgument():
    """A control word carrying a numeric argument, e.g. '\\rtf1'."""

    def __init__(self, name, argument, separator=''):
        self.name = name
        self.argument = argument
        self.separator = separator

    def toRTF(self):
        return self.name + repr(self.argument) + self.separator

    def __repr__(self):
        return self.name + repr(self.argument) + self.separator


class tokenControlSymbol():
    """A control symbol: backslash followed by one non-letter character."""

    def __init__(self, name):
        self.name = name

    def toRTF(self):
        return self.name

    def __repr__(self):
        return self.name


class tokenData():
    """A run of plain document text between control tokens."""

    def __init__(self, data):
        self.data = data

    def toRTF(self):
        return self.data

    def __repr__(self):
        return self.data


class tokenBinN():
    """A '\\binN' control word together with its raw binary payload.

    NOTE(review): `data` as produced by RtfTokenizer already begins with
    the '\\binN...' text, yet toRTF() prepends another '\\bin' + length.
    Kept exactly as in the original implementation; verify against real
    '\\bin' documents before changing.
    """

    def __init__(self, data, separator=''):
        self.data = data
        self.separator = separator

    def toRTF(self):
        return "\\bin" + repr(len(self.data)) + self.separator + self.data

    def __repr__(self):
        return "\\bin" + repr(len(self.data)) + self.separator + self.data


class token8bitChar():
    """An 8-bit escaped character: "\\'hh" (`data` holds the two hex digits)."""

    def __init__(self, data):
        self.data = data

    def toRTF(self):
        return "\\'" + self.data

    def __repr__(self):
        return "\\'" + self.data


class tokenUnicode():
    """A '\\uN' unicode escape plus its ANSI fallback tokens (`eqList`).

    `current_ucn` is the '\\uc' skip count in effect where the escape was
    found; at most that many fallback tokens are re-emitted by toRTF().
    """

    def __init__(self, data, separator='', current_ucn=1, eqList=None):
        self.data = data
        self.separator = separator
        self.current_ucn = current_ucn
        # Fix: use a fresh list per instance.  The original declared
        # `eqList=[]`, sharing one mutable default list between every
        # instance constructed without an explicit eqList.
        self.eqList = [] if eqList is None else eqList

    def toRTF(self):
        result = '\\u' + repr(self.data) + ' '
        ucn = self.current_ucn
        if len(self.eqList) < ucn:
            ucn = len(self.eqList)
        result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
        # Fix: emit at most `ucn` fallback tokens.  The original loop kept
        # a counter it never incremented, so its break never fired and the
        # entire eqList was emitted regardless of the \uc count.
        for eq in self.eqList[:ucn]:
            result = result + eq.toRTF()
        return result

    def __repr__(self):
        return '\\u' + repr(self.data)


def isAsciiLetter(value):
    return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))


def isDigit(value):
    return (value >= '0') and (value <= '9')


def isChar(value, char):
    return value == char


def isString(buffer, string):
    return buffer == string


class RtfTokenParser():
    """Post-processes a raw RtfTokenizer stream: folds "\\'" escapes into
    token8bitChar and rewrites '\\u' escapes into tokenUnicode."""

    def __init__(self, tokens):
        self.tokens = tokens
        self.process()
        self.processUnicode()

    def process(self):
        """Merge each "\\'" control symbol with the first two characters of
        the following data token into a token8bitChar.

        Raises Exception when the "\\'" is not followed by at least two
        characters of data.
        """
        i = 0
        newTokens = []
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenControlSymbol):
                if isString(self.tokens[i].name, "\\'"):
                    i = i + 1
                    # Robustness: also guard against "\\'" being the very
                    # last token (the original raised IndexError here).
                    if i >= len(self.tokens) or not isinstance(self.tokens[i], tokenData):
                        raise Exception('Error: token8bitChar without data.')
                    if len(self.tokens[i].data) < 2:
                        raise Exception('Error: token8bitChar without data.')
                    newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
                    # Anything after the two hex digits is ordinary data.
                    if len(self.tokens[i].data) > 2:
                        newTokens.append(tokenData(self.tokens[i].data[2:]))
                    i = i + 1
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)

    def processUnicode(self):
        """Replace each '\\uN' control word (and the ANSI fallback tokens
        that follow it, per the current '\\uc' skip count) with a
        tokenUnicode token."""
        i = 0
        newTokens = []
        # Stack of \uc values, one entry per open group ({ pushes a copy,
        # } pops), because \uc is scoped to its RTF group.
        ucNbStack = [1]
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenDelimitatorStart):
                ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenDelimitatorEnd):
                ucNbStack.pop()
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
                if isString(self.tokens[i].name, '\\uc'):
                    ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
                    newTokens.append(self.tokens[i])
                    i = i + 1
                    continue
                if isString(self.tokens[i].name, '\\u'):
                    x = i
                    j = 0
                    i = i + 1
                    replace = []
                    partialData = None
                    ucn = ucNbStack[len(ucNbStack) - 1]
                    # Collect the `ucn` fallback characters following \uN;
                    # a group delimiter ends the fallback run early.
                    while (i < len(self.tokens)) and (j < ucn):
                        if isinstance(self.tokens[i], tokenDelimitatorStart):
                            break
                        if isinstance(self.tokens[i], tokenDelimitatorEnd):
                            break
                        if isinstance(self.tokens[i], tokenData):
                            if len(self.tokens[i].data) >= ucn - j:
                                replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
                                # Data beyond the fallback run stays data.
                                if len(self.tokens[i].data) > ucn - j:
                                    partialData = tokenData(self.tokens[i].data[ucn - j:])
                                i = i + 1
                                break
                            else:
                                replace.append(self.tokens[i])
                                j = j + len(self.tokens[i].data)
                                i = i + 1
                                continue
                        if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
                            replace.append(self.tokens[i])
                            i = i + 1
                            j = j + 1
                            continue
                        raise Exception('Error: incorrect utf replacement.')

                    # calibre rtf2xml does not support utfreplace
                    replace = []

                    newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
                    if partialData is not None:
                        newTokens.append(partialData)
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)

    def toRTF(self):
        """Serialize the processed token stream back to RTF text."""
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)


class RtfTokenizer():
    """Splits RTF source text (a str) into the token objects above."""

    def __init__(self, rtfData):
        self.rtfData = []
        self.tokens = []
        self.rtfData = rtfData
        self.tokenize()

    def tokenize(self):
        """Scan self.rtfData into self.tokens.

        Raises Exception on a truncated control word/symbol or an
        over-long numeric argument.
        """
        i = 0
        lastDataStart = -1
        while i < len(self.rtfData):

            if isChar(self.rtfData[i], '{'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorStart())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '}'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorEnd())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '\\'):
                if i + 1 >= len(self.rtfData):
                    raise Exception('Error: Control character found at the end of the document.')

                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1

                tokenStart = i
                i = i + 1

                # Control Words
                if isAsciiLetter(self.rtfData[i]):
                    # consume <ASCII Letter Sequence>
                    consumed = False
                    while i < len(self.rtfData):
                        if not isAsciiLetter(self.rtfData[i]):
                            tokenEnd = i
                            consumed = True
                            break
                        i = i + 1

                    if not consumed:
                        raise Exception('Error (at:%d): Control Word without end.' % (tokenStart))

                    # we have numeric argument before delimiter
                    # Fix: the original never consumed the leading '-', so a
                    # negative argument (e.g. \u-3814, required by the RTF
                    # spec for codepoints > 32767) was mis-tokenized.  Enter
                    # the numeric branch only when a digit actually follows.
                    hasMinus = isChar(self.rtfData[i], '-')
                    if (hasMinus and i + 1 < len(self.rtfData) and isDigit(self.rtfData[i + 1])) or isDigit(self.rtfData[i]):
                        if hasMinus:
                            i = i + 1
                        # consume the numeric argument
                        consumed = False
                        l = 0
                        while i < len(self.rtfData):
                            if not isDigit(self.rtfData[i]):
                                consumed = True
                                break
                            l = l + 1
                            i = i + 1
                            if l > 10:
                                # Fix: the original formatted with a list
                                # ('%d' % [x]) and raised TypeError instead.
                                raise Exception('Error (at:%d): Too many digits in control word numeric argument.' % (tokenStart))

                        if not consumed:
                            raise Exception('Error (at:%d): Control Word without numeric argument end.' % (tokenStart))

                    separator = ''
                    if isChar(self.rtfData[i], ' '):
                        separator = ' '

                    controlWord = self.rtfData[tokenStart: tokenEnd]
                    if tokenEnd < i:
                        value = int(self.rtfData[tokenEnd: i])
                        if isString(controlWord, "\\bin"):
                            # NOTE(review): the payload slice starts at the
                            # current position, so any separator space is
                            # counted inside the \bin payload — kept as in
                            # the original; verify against real documents.
                            i = i + value
                            self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
                        else:
                            self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
                    else:
                        self.tokens.append(tokenControlWord(controlWord, separator))
                    # space delimiter, we should discard it
                    # (robustness: a \bin skip may have landed at EOF)
                    if i < len(self.rtfData) and self.rtfData[i] == ' ':
                        i = i + 1

                # Control Symbol
                else:
                    self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
                    i = i + 1
                continue

            if lastDataStart < 0:
                lastDataStart = i
            i = i + 1

    def toRTF(self):
        """Serialize the raw token stream back to RTF text."""
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)


if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print("Usage %prog rtfFileToConvert")
        sys.exit()
    with open(sys.argv[1], 'rb') as f:
        # Fix: the file was read as bytes, but the tokenizer compares single
        # characters against str literals ('{', '\\', ...), which is always
        # False for ints under Python 3.  Decode with latin-1 — a 1:1
        # byte<->char mapping — so tokenizing works and every byte survives
        # the round trip.
        data = f.read().decode('latin-1')

    tokenizer = RtfTokenizer(data)
    parsedTokens = RtfTokenParser(tokenizer.tokens)

    data = parsedTokens.toRTF()

    with open(sys.argv[1], 'w', encoding='latin-1') as f:
        f.write(data)