#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   #
#   General Public License for more details.                            #
#                                                                       #
#                                                                       #
#########################################################################
import sys, os

from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write


class Paragraphs:
    """
    =================
    Purpose
    =================
    Write paragraph tags for a tokenized file. (This module won't be of any
    use to you unless you use it as part of the other modules.)
    -------------
    Method
    -------------
    RTF does not tell you when a paragraph begins. It only tells you when the
    paragraph ends.
    In order to make paragraphs out of this limited info, the parser starts in
    the body of the document and assumes it is not in a paragraph. It looks
    for clues to begin a paragraph. Text starts a paragraph; so does an inline
    field, a paragraph-begin marker or the start of a picture. If an end of
    paragraph marker (\\par) is found, then this indicates a blank paragraph.
    Once a paragraph is found, the state changes to 'paragraph.' In this state,
    clues are looked for that mark the end of a paragraph. The end of a
    paragraph marker (\\par) marks the end of a paragraph. So does the end of
    a footnote or heading; the end of a field-block; the start of a block
    field; the end of a table cell; the end of the body; and the beginning or
    end of a section.
    A paragraph definition found inside a paragraph does not close it; it is
    replaced by a 'bogus-pard' marker.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            write_empty_para=1,
            run_level=1,
            ):
        """
        Required:
            'in_file'--file to parse (it is overwritten with the result)
            'bug_handler'--passed on to copy.Copy when the result is copied
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'write_empty_para'--whether to write an empty paragraph tag for
            a \\par found outside of a paragraph (default is 1)
            'run_level'--stored on the instance; not otherwise used by the
            methods of this class
        Returns:
            nothing
        Logic:
            Store the options and reserve a temporary file (better_mktemp)
            to which the output is written before it replaces 'in_file'.
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: the starting state, the marker lines written
        around paragraphs, and the dispatch tables that map a state or a
        16-character token to the method that handles it.
        """
        self.__state = 'before_body'
        self.__start_marker = 'mi<mk<para-start\n'  # outside para tags
        self.__start2_marker = 'mi<mk<par-start_\n'  # inside para tags
        self.__end2_marker = 'mi<mk<par-end___\n'  # inside para tags
        self.__end_marker = 'mi<mk<para-end__\n'  # outside para tags
        # state name => method that handles a line while in that state
        self.__state_dict = {
        'before_body'       : self.__before_body_func,
        'not_paragraph'     : self.__not_paragraph_func,
        'paragraph'         : self.__paragraph_func,
        }
        # tokens that are acted upon while inside a paragraph
        self.__paragraph_dict = {
        'cw<pf<par-end___'      : self.__close_para_func,  # end of paragraph
        'mi<mk<headi_-end'      : self.__close_para_func,  # end of header or footer
        # 'cw<pf<par-def___'      : self.__close_para_func,  # paragraph definition
        # 'mi<mk<fld-bk-end'      : self.__close_para_func,  # end of field-block
        'mi<mk<fldbk-end_'      : self.__close_para_func,  # end of field-block
        'mi<mk<body-close'      : self.__close_para_func,  # end of body
        'mi<mk<sect-close'      : self.__close_para_func,  # end of section
        'mi<mk<sect-start'      : self.__close_para_func,  # start of section
        'mi<mk<foot___clo'      : self.__close_para_func,  # end of footnote
        'cw<tb<cell______'      : self.__close_para_func,  # end of cell
        'mi<mk<par-in-fld'      : self.__close_para_func,  # start of block field
        'cw<pf<par-def___'      : self.__bogus_para__def_func,  # paragraph definition
        }
        # tokens that are acted upon while outside of a paragraph
        self.__not_paragraph_dict = {
        'tx<nu<__________'      : self.__start_para_func,
        'tx<hx<__________'      : self.__start_para_func,
        'tx<ut<__________'      : self.__start_para_func,
        'tx<mc<__________'      : self.__start_para_func,
        'mi<mk<inline-fld'      : self.__start_para_func,
        'mi<mk<para-beg__'      : self.__start_para_func,
        'cw<pf<par-end___'      : self.__empty_para_func,
        'mi<mk<pict-start'      : self.__start_para_func,
        'cw<pf<page-break'      : self.__empty_pgbk_func,  # page break
        }

    def __before_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines before the start of the body.
            Once the body starts, the state is switched to 'not_paragraph'.
            The line itself is always written out unchanged.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)

    def __not_paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines that are outside of the paragraph.
            It looks for clues that start a paragraph, and when found,
            switches states and writes the start tags. The line itself is
            always written after any tags.
        """
        action = self.__not_paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines that are in the paragraph. It
            looks for clues to the end of the paragraph. When a clue is found,
            it calls on another method to write the end of the tag and change
            the state. That method decides whether the line is written;
            lines that are not clues are written unchanged.
        """
        action = self.__paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __start_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the beginning tags for a paragraph and
            changes the state to paragraph.
        """
        self.__write_obj.write(self.__start_marker)  # marker for later parsing
        self.__write_obj.write(
            'mi<tg<open______<para\n'
            )
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'

    def __empty_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the empty tags for a paragraph.
            It does not do anything if self.__write_empty_para is 0.
        """
        if self.__write_empty_para:
            self.__write_obj.write(self.__start_marker)  # marker for later parsing
            self.__write_obj.write(
                'mi<tg<empty_____<para\n'
                )
            self.__write_obj.write(self.__end_marker)   # marker for later parsing

    def __empty_pgbk_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the empty tags for a page break.
        """
        self.__write_obj.write(
            'mi<tg<empty_____<page-break\n'
            )

    def __close_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            This function writes the end tags for a paragraph, then the line
            that triggered the close, and changes the state to not_paragraph.
        """
        self.__write_obj.write(self.__end2_marker)  # marker for later parser
        self.__write_obj.write(
            'mi<tg<close_____<para\n'
            )
        self.__write_obj.write(self.__end_marker)  # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'

    def __bogus_para__def_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            if a \\pard occurs in a paragraph, I want to ignore it. (I believe)
            The line is not written; a bogus-pard marker is written instead.
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')

    def make_paragraphs(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            When the body is found, change the state to 'not_paragraph'. The
            only other state is 'paragraph'.
            If the state has no handler, a message is written to stderr and
            the line is passed through unchanged.
            When all lines are processed, the temporary output is optionally
            copied to "paragraphs.data", written over the original file, and
            removed.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    # the first 16 characters of a line identify the token
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # Reporting is best effort: a failure to write the
                        # diagnostic must not abort the conversion.
                        try:
                            sys.stderr.write('no matching state in module paragraphs.py\n')
                            sys.stderr.write(self.__state + '\n')
                        except Exception:
                            pass
                        # There is no handler to call; keep the line in the
                        # output rather than calling None.
                        self.__write_obj.write(line)
                        continue
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)