1#########################################################################
2#                                                                       #
3#                                                                       #
4#   copyright 2002 Paul Henry Tremblay                                  #
5#                                                                       #
6#   This program is distributed in the hope that it will be useful,     #
7#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
8#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
9#   General Public License for more details.                            #
10#                                                                       #
11#                                                                       #
12#########################################################################
13import sys, os
14
15from calibre.ebooks.rtf2xml import copy
16from calibre.ptempfile import better_mktemp
17from . import open_for_read, open_for_write
18
19
class Paragraphs:
    """
    =================
    Purpose
    =================
    Write paragraph tags for a tokenized file. (This module won't be of any
    use to you unless you use it as part of the other modules.)
    -------------
    Method
    -------------
    RTF does not tell you when a paragraph begins. It only tells you when the
    paragraph ends.
    In order to make paragraphs out of this limited info, the parser starts
    before the body of the document and, once the body opens, assumes it is
    not in a paragraph. It looks for clues to begin a paragraph. Text starts a
    paragraph; so do an inline field, a picture, and an explicit
    paragraph-begin marker. If an end of paragraph marker (\\par) is found
    while outside a paragraph, then this indicates a blank paragraph.
    Once a paragraph is found, the state changes to 'paragraph.' In this
    state, clues are looked for that end the paragraph. The end of paragraph
    marker (\\par) ends a paragraph. So do the end of a footnote or heading;
    the end of a field-block; the start of a block field; the end of a table
    cell; the start or end of a section; and the end of the body.
    A paragraph definition (\\pard) found inside a paragraph does NOT end it;
    it is replaced by a 'bogus-pard' marker.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            write_empty_para=1,
            run_level=1,
            ):
        """
        Required:
            'in_file'--path of the tokenized file to parse (it is overwritten
            with the result by make_paragraphs)
            'bug_handler'--handed on to the copy helper in make_paragraphs
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'write_empty_para'-- whether to write an empty paragraph tag for
            an end of paragraph marker found outside of a paragraph
            'run_level'-- stored on the instance; not used by the code in
            this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        # Output is written here first and put over in_file at the very end.
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: the starting state, the marker lines, and the
        dispatch tables keyed on state and on the first 16 characters of a
        line (the token info).
        """
        self.__state = 'before_body'
        self.__start_marker =  'mi<mk<para-start\n'  # outside para tags
        self.__start2_marker = 'mi<mk<par-start_\n'  # inside para tags
        self.__end2_marker =   'mi<mk<par-end___\n'  # inside para tags
        self.__end_marker =    'mi<mk<para-end__\n'  # outside para tags
        # One handler per parser state.
        self.__state_dict = {
            'before_body'       : self.__before_body_func,
            'not_paragraph'     : self.__not_paragraph_func,
            'paragraph'         : self.__paragraph_func,
        }
        # Tokens acted on while inside a paragraph. Everything closes the
        # paragraph except a paragraph definition, which is only marked as
        # bogus and leaves the paragraph open.
        self.__paragraph_dict = {
            'cw<pf<par-end___'      : self.__close_para_func,   # end of paragraph
            'mi<mk<headi_-end'      : self.__close_para_func,   # end of header or footer
            'mi<mk<fldbk-end_'      : self.__close_para_func,   # end of field-block
            'mi<mk<body-close'      : self.__close_para_func,   # end of body
            'mi<mk<sect-close'      : self.__close_para_func,   # end of section
            'mi<mk<sect-start'      : self.__close_para_func,   # start of section
            'mi<mk<foot___clo'      : self.__close_para_func,   # end of footnote
            'cw<tb<cell______'      : self.__close_para_func,   # end of cell
            'mi<mk<par-in-fld'      : self.__close_para_func,   # start of block field
            'cw<pf<par-def___'      : self.__bogus_para__def_func,   # paragraph definition
        }
        # Tokens acted on while outside a paragraph.
        self.__not_paragraph_dict = {
            'tx<nu<__________'      : self.__start_para_func,
            'tx<hx<__________'      : self.__start_para_func,
            'tx<ut<__________'      : self.__start_para_func,
            'tx<mc<__________'      : self.__start_para_func,
            'mi<mk<inline-fld'      : self.__start_para_func,
            'mi<mk<para-beg__'      : self.__start_para_func,
            'cw<pf<par-end___'      : self.__empty_para_func,
            'mi<mk<pict-start'      : self.__start_para_func,
            'cw<pf<page-break'      : self.__empty_pgbk_func,    # page break
        }

    def __before_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines before the start of the body.
            Once the body starts, the state is switched to 'not_paragraph'.
            The line itself is always written out unchanged.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)

    def __not_paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines that are outside of the paragraph.
            It looks for clues that start a paragraph, and when found,
            switches states and writes the start tags. Whatever the handler
            wrote, the line itself is always written after it.
        """
        action = self.__not_paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines that are in the paragraph. It
            looks for clues to the end of the paragraph. When a clue is found,
            it calls on another method, which decides for itself whether the
            line is written. Lines that are no clue are written unchanged.
        """
        action = self.__paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __start_para_func(self, line):
        """
        Requires:
            line --line to parse (not written here; the caller writes it)
        Returns:
            nothing
        Logic:
            This function writes the beginning tags for a paragraph and
            changes the state to paragraph.
        """
        self.__write_obj.write(self.__start_marker)  # marker for later parsing
        self.__write_obj.write(
        'mi<tg<open______<para\n'
        )
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'

    def __empty_para_func(self, line):
        """
        Requires:
            line --line to parse (not written here; the caller writes it)
        Returns:
            nothing
        Logic:
            This function writes the empty tags for a paragraph.
            It does not do anything if self.__write_empty_para is 0.
            The state is left unchanged.
        """
        if self.__write_empty_para:
            self.__write_obj.write(self.__start_marker)  # marker for later parsing
            self.__write_obj.write(
            'mi<tg<empty_____<para\n'
            )
            self.__write_obj.write(self.__end_marker)   # marker for later parsing

    def __empty_pgbk_func(self, line):
        """
        Requires:
            line --line to parse (not written here; the caller writes it)
        Returns:
            nothing
        Logic:
            This function writes the empty tags for a page break.
        """
        self.__write_obj.write(
        'mi<tg<empty_____<page-break\n'
        )

    def __close_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the end tags for a paragraph, then the line
            that triggered the close, and changes the state to not_paragraph.
        """
        self.__write_obj.write(self.__end2_marker)  # marker for later parser
        self.__write_obj.write(
        'mi<tg<close_____<para\n'
        )
        self.__write_obj.write(self.__end_marker)  # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'

    def __bogus_para__def_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            if a \\pard occurs in a paragraph, I want to ignore it. (I believe)
            The line is dropped and a 'bogus-pard' marker is written in its
            place; the paragraph stays open.
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')

    def __report_unknown_state(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Best-effort diagnostic for a state that has no handler. Failure
            to report must never abort the conversion, so the errors a
            missing, closed or broken stderr can raise are ignored.
        """
        try:
            sys.stderr.write('no matching state in module paragraphs.py\n')
            sys.stderr.write(self.__state + '\n')
        except (AttributeError, OSError, ValueError):
            # AttributeError: sys.stderr is None; ValueError: stream closed;
            # OSError: the write itself failed.
            pass

    def __process_line(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Record the token info (first 16 characters of the line) and hand
            the line to the handler for the current state. If the state has
            no handler, report it and write the line through unchanged so
            that no input is lost (previously this case raised a TypeError
            by calling None).
        """
        self.__token_info = line[:16]
        action = self.__state_dict.get(self.__state)
        if action is None:
            self.__report_unknown_state()
            self.__write_obj.write(line)
            return
        action(line)

    def make_paragraphs(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            When the body is found, change the state to 'not_paragraph'. The
            only other state is 'paragraph'.
            The result is written to a temporary file which is then put over
            the original file and removed.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__process_line(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            # Keep a debugging copy of this stage's output.
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
        # NOTE(review): rename presumably copies rather than moves, since the
        # temporary file is still removed afterwards -- confirm in copy.Copy.
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
263