1#!/usr/local/bin/python3.8
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3
4
5__license__   = 'GPL v3'
6__copyright__ = '2010, Gerendi Sandor Attila'
7__docformat__ = 'restructuredtext en'
8
9"""
10RTF tokenizer and token parser. v.1.0 (1/17/2010)
11Author: Gerendi Sandor Attila
12
13At this point this will tokenize a RTF file then rebuild it from the tokens.
14In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compliant.
15"""
16
17
class tokenDelimitatorStart():
    """Token representing an RTF group-opening brace ('{')."""

    def toRTF(self):
        # The opening delimiter always renders as a literal brace.
        return '{'

    def __repr__(self):
        return self.toRTF()
28
29
class tokenDelimitatorEnd():
    """Token representing an RTF group-closing brace ('}')."""

    def toRTF(self):
        # The closing delimiter always renders as a literal brace.
        return '}'

    def __repr__(self):
        return self.toRTF()
40
41
class tokenControlWord():
    """An RTF control word (e.g. '\\par') plus its optional delimiter."""

    def __init__(self, name, separator=''):
        self.name = name            # control word text, backslash included
        self.separator = separator  # '' or the single-space delimiter

    def toRTF(self):
        return ''.join((self.name, self.separator))

    def __repr__(self):
        return self.toRTF()
53
54
class tokenControlWordWithNumericArgument():
    """An RTF control word carrying a numeric argument (e.g. '\\uc1')."""

    def __init__(self, name, argument, separator=''):
        self.name = name            # control word text, backslash included
        self.argument = argument    # the numeric argument (may be negative)
        self.separator = separator  # '' or the single-space delimiter

    def toRTF(self):
        return ''.join((self.name, repr(self.argument), self.separator))

    def __repr__(self):
        return self.toRTF()
67
68
class tokenControlSymbol():
    """An RTF control symbol: a backslash followed by one non-letter."""

    def __init__(self, name):
        self.name = name  # the full two-character sequence, e.g. "\\~"

    def toRTF(self):
        # Control symbols are emitted exactly as stored.
        return self.name

    def __repr__(self):
        return self.toRTF()
79
80
class tokenData():
    """Plain document text found between RTF control constructs."""

    def __init__(self, data):
        self.data = data  # raw text, emitted verbatim on output

    def toRTF(self):
        return self.data

    def __repr__(self):
        return self.toRTF()
91
92
class tokenBinN():
    """A '\\binN' control word together with its N characters of payload."""

    def __init__(self, data, separator=''):
        self.data = data            # the binary payload; N is derived from its length
        self.separator = separator  # '' or the single-space delimiter

    def toRTF(self):
        pieces = ('\\bin', repr(len(self.data)), self.separator, self.data)
        return ''.join(pieces)

    def __repr__(self):
        return self.toRTF()
104
105
class token8bitChar():
    """An escaped 8-bit character: \\'hh, where hh are two hex digits."""

    def __init__(self, data):
        self.data = data  # the two hex digits, without the \' prefix

    def toRTF(self):
        return ''.join(("\\'", self.data))

    def __repr__(self):
        return self.toRTF()
116
117
class tokenUnicode():
    """A '\\uN' Unicode token, optionally followed by fallback tokens that
    pre-Unicode RTF readers display instead (controlled by '\\ucN').

    :param data: the numeric Unicode code point argument of '\\u'.
    :param separator: '' or the single-space delimiter after the control word.
    :param current_ucn: the '\\uc' skip count in effect at this token.
    :param eqList: fallback-equivalent tokens (each must provide ``toRTF()``).
    """

    def __init__(self, data, separator='', current_ucn=1, eqList=None):
        self.data = data
        self.separator = separator
        self.current_ucn = current_ucn
        # bug fix: a mutable default argument (eqList=[]) was shared across
        # every instance constructed without an explicit list
        self.eqList = [] if eqList is None else eqList

    def toRTF(self):
        result = '\\u' + repr(self.data) + ' '
        ucn = self.current_ucn
        if len(self.eqList) < ucn:
            # fewer fallback chars than the current \uc expects: emit a
            # corrective \ucN so readers skip exactly what we provide
            ucn = len(self.eqList)
            result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
        i = 0
        for eq in self.eqList:
            if i >= ucn:
                break
            result = result + eq.toRTF()
            i = i + 1  # bug fix: the counter was never advanced, so the
                       # ucn truncation check above could never fire
        return result

    def __repr__(self):
        return '\\u' + repr(self.data)
141
142
def isAsciiLetter(value):
    """Return True when *value* is an ASCII letter (a-z or A-Z)."""
    return ('a' <= value <= 'z') or ('A' <= value <= 'Z')
145
146
def isDigit(value):
    """Return True when *value* is an ASCII decimal digit (0-9)."""
    return '0' <= value <= '9'
149
150
def isChar(value, char):
    """Return True when *value* equals the character *char*."""
    return char == value
153
154
def isString(buffer, string):
    """Return True when *buffer* equals *string*."""
    return string == buffer
157
158
class RtfTokenParser():
    """Second pass over the raw token stream.

    ``process`` converts ``\\'`` control symbols plus their following data
    into ``token8bitChar`` tokens; ``processUnicode`` folds ``\\uN`` control
    words into ``tokenUnicode`` tokens while tracking the ``\\ucN`` skip
    count per RTF group.

    :param tokens: token list produced by :class:`RtfTokenizer`.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.process()
        self.processUnicode()

    def process(self):
        """Rewrite ``\\'hh`` sequences as token8bitChar (+ leftover tokenData)."""
        i = 0
        newTokens = []
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenControlSymbol):
                if isString(self.tokens[i].name, "\\'"):
                    i = i + 1
                    # bug fix: a trailing \' at the very end of the stream
                    # used to raise a bare IndexError on the access below
                    if i >= len(self.tokens) or not isinstance(self.tokens[i], tokenData):
                        raise Exception('Error: token8bitChar without data.')
                    if len(self.tokens[i].data) < 2:
                        raise Exception('Error: token8bitChar without data.')
                    # first two characters are the hex code; the remainder of
                    # the data token is ordinary document text
                    newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
                    if len(self.tokens[i].data) > 2:
                        newTokens.append(tokenData(self.tokens[i].data[2:]))
                    i = i + 1
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)

    def processUnicode(self):
        """Fold ``\\u`` tokens and their ASCII fallback text into tokenUnicode."""
        i = 0
        newTokens = []
        ucNbStack = [1]  # \uc defaults to 1; its value is scoped per RTF group
        while i < len(self.tokens):
            if isinstance(self.tokens[i], tokenDelimitatorStart):
                # entering a group: inherit the current \uc value
                ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenDelimitatorEnd):
                # leaving a group restores the outer \uc value
                ucNbStack.pop()
                newTokens.append(self.tokens[i])
                i = i + 1
                continue
            if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
                if isString(self.tokens[i].name, '\\uc'):
                    ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
                    newTokens.append(self.tokens[i])
                    i = i + 1
                    continue
                if isString(self.tokens[i].name, '\\u'):
                    x = i  # index of the \u token itself
                    j = 0  # fallback characters consumed so far
                    i = i + 1
                    replace = []
                    partialData = None
                    ucn = ucNbStack[len(ucNbStack) - 1]
                    # consume up to ucn characters of fallback text that
                    # follows the \u token
                    while (i < len(self.tokens)) and (j < ucn):
                        if isinstance(self.tokens[i], tokenDelimitatorStart):
                            break
                        if isinstance(self.tokens[i], tokenDelimitatorEnd):
                            break
                        if isinstance(self.tokens[i], tokenData):
                            if len(self.tokens[i].data) >= ucn - j:
                                # this data token completes the fallback;
                                # keep any excess as ordinary data
                                replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
                                if len(self.tokens[i].data) > ucn - j:
                                    partialData = tokenData(self.tokens[i].data[ucn - j:])
                                i = i + 1
                                break
                            else:
                                replace.append(self.tokens[i])
                                j = j + len(self.tokens[i].data)
                                i = i + 1
                                continue
                        if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
                            # each of these counts as a single fallback char
                            replace.append(self.tokens[i])
                            i = i + 1
                            j = j + 1
                            continue
                        raise Exception('Error: incorrect utf replacement.')

                    # calibre rtf2xml does not support utfreplace
                    replace = []

                    newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
                    if partialData is not None:
                        newTokens.append(partialData)
                    continue

            newTokens.append(self.tokens[i])
            i = i + 1

        self.tokens = list(newTokens)

    def toRTF(self):
        """Serialize the processed tokens back into a single RTF string."""
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)
258
259
class RtfTokenizer():
    """First pass: split raw RTF text (a str) into delimiter, control-word,
    control-symbol, binary and plain-data tokens.

    :param rtfData: the complete RTF document as a string.
    """

    def __init__(self, rtfData):
        self.tokens = []
        self.rtfData = rtfData
        self.tokenize()

    def tokenize(self):
        """Scan ``self.rtfData`` and populate ``self.tokens``."""
        i = 0
        lastDataStart = -1  # start index of the current run of plain data, -1 if none
        while i < len(self.rtfData):

            if isChar(self.rtfData[i], '{'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorStart())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '}'):
                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1
                self.tokens.append(tokenDelimitatorEnd())
                i = i + 1
                continue

            if isChar(self.rtfData[i], '\\'):
                if i + 1 >= len(self.rtfData):
                    raise Exception('Error: Control character found at the end of the document.')

                if lastDataStart > -1:
                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
                    lastDataStart = -1

                tokenStart = i
                i = i + 1

                # Control Words
                if isAsciiLetter(self.rtfData[i]):
                    # consume <ASCII Letter Sequence>
                    consumed = False
                    while i < len(self.rtfData):
                        if not isAsciiLetter(self.rtfData[i]):
                            tokenEnd = i
                            consumed = True
                            break
                        i = i + 1

                    if not consumed:
                        raise Exception('Error (at:%d): Control Word without end.'%(tokenStart))

                    # we have numeric argument before delimiter
                    if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
                        # consume the numeric argument
                        consumed = False
                        l = 0
                        while i < len(self.rtfData):
                            if not isDigit(self.rtfData[i]):
                                consumed = True
                                break
                            l = l + 1
                            i = i + 1
                            if l > 10:
                                # bug fix: '%d' % [x] raised TypeError instead
                                # of producing the intended error message
                                raise Exception('Error (at:%d): Too many digits in control word numeric argument.' % (tokenStart))

                        if not consumed:
                            # bug fix: same '%d' % [x] TypeError as above
                            raise Exception('Error (at:%d): Control Word without numeric argument end.' % (tokenStart))

                    separator = ''
                    if isChar(self.rtfData[i], ' '):
                        separator = ' '

                    controlWord = self.rtfData[tokenStart: tokenEnd]
                    if tokenEnd < i:
                        value = int(self.rtfData[tokenEnd: i])
                        if isString(controlWord, "\\bin"):
                            # NOTE(review): this slice includes the '\binN'
                            # prefix itself in the token payload, so
                            # tokenBinN.toRTF() re-emits a doubled header —
                            # looks wrong, but \bin is rare; confirm before
                            # changing the round-trip behavior.
                            i = i + value
                            self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
                        else:
                            self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
                    else:
                        self.tokens.append(tokenControlWord(controlWord, separator))
                    # space delimiter, we should discard it
                    # (bounds check added: \binN payload may end exactly at EOF)
                    if i < len(self.rtfData) and self.rtfData[i] == ' ':
                        i = i + 1

                # Control Symbol
                else:
                    self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
                    i = i + 1
                continue

            if lastDataStart < 0:
                lastDataStart = i
            i = i + 1

        # bug fix: plain data running to the end of the input (no closing
        # token after it) was silently dropped; flush it here
        if lastDataStart > -1:
            self.tokens.append(tokenData(self.rtfData[lastDataStart:]))

    def toRTF(self):
        """Serialize the token list back into a single RTF string."""
        result = []
        for token in self.tokens:
            result.append(token.toRTF())
        return "".join(result)
364
365
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        # bug fix: '%prog' was an unexpanded optparse placeholder; show the
        # actual script name in the usage message
        print("Usage: %s rtfFileToConvert" % sys.argv[0])
        sys.exit()
    # bug fix: the file was opened in binary mode, so the tokenizer received
    # bytes — indexing bytes yields ints, which never compare equal to the
    # '{', '}' and '\\' str characters, and tokenization silently produced
    # nothing. latin-1 maps every byte 1:1 to a code point, so the
    # read-process-write cycle is lossless; newline='' disables newline
    # translation so the data round-trips byte-for-byte.
    with open(sys.argv[1], 'r', encoding='latin-1', newline='') as f:
        data = f.read()

    tokenizer = RtfTokenizer(data)
    parsedTokens = RtfTokenParser(tokenizer.tokens)

    data = parsedTokens.toRTF()

    with open(sys.argv[1], 'w', encoding='latin-1', newline='') as f:
        f.write(data)
381