#!/usr/local/bin/python3.8
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

# Copyright (c) 2019-2020 Kevin B. Hendricks
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
# of conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys
import re
from quickparser import QuickXHTMLParser

# Placeholder comments written into the nav source while a section is swapped out
SIGIL_REPLACE_LANDMARKS_HERE = "<!-- SIGIL_REPLACE_LANDMARKS_HERE -->"
SIGIL_REPLACE_PAGELIST_HERE = "<!-- SIGIL_REPLACE_PAGELIST_HERE -->"
SIGIL_REPLACE_TOC_HERE = "<!-- SIGIL_REPLACE_TOC_HERE -->"

# Each pattern matches a placeholder comment that sits alone on its line
NAV_TOC_PATTERN = re.compile(r'''^\s*<!--\s*SIGIL_REPLACE_TOC_HERE\s*-->\s*$''', re.M)
NAV_PAGELIST_PATTERN = re.compile(r'''^\s*<!--\s*SIGIL_REPLACE_PAGELIST_HERE\s*-->\s*$''', re.M)
NAV_LANDMARKS_PATTERN = re.compile(r'''^\s*<!--\s*SIGIL_REPLACE_LANDMARKS_HERE\s*-->\s*$''', re.M)


# encode/escape text to make it xml safe
def xmlencode(data):
    """Return *data* with &, <, > and " replaced by their XML entities.

    None yields ''.  The text is passed through xmldecode() first so that
    input which is already escaped is not escaped a second time.
    """
    if data is None:
        return ''
    newdata = xmldecode(data)
    # '&' must be handled first so the entities added below are left alone
    newdata = newdata.replace('&', '&amp;')
    newdata = newdata.replace('<', '&lt;')
    newdata = newdata.replace('>', '&gt;')
    newdata = newdata.replace('"', '&quot;')
    return newdata


# decode xml encoded/escaped strings
def xmldecode(data):
    """Return *data* with the &quot; &gt; &lt; and &amp; entities expanded.

    None yields ''.
    """
    if data is None:
        return ''
    newdata = data
    newdata = newdata.replace('&quot;', '"')
    newdata = newdata.replace('&gt;', '>')
    newdata = newdata.replace('&lt;', '<')
    # '&amp;' is expanded last so '&amp;lt;' becomes '&lt;' and not '<'
    newdata = newdata.replace('&amp;', '&')
    return newdata


class NavProcessor(object):
    """Read and rewrite the toc, landmarks and page-list sections of a nav document."""

    def __init__(self, navsrc, codec='utf-8'):
        """Store the nav source as text.

        navsrc may be str, bytes (decoded using codec) or None (treated as "").
        """
        if navsrc is None:
            navsrc = ""
        if isinstance(navsrc, bytes):
            self.content = navsrc.decode(codec)
        else:
            self.content = navsrc

    def _iter_nav_links(self, wanted):
        """Yield (nesting_level, href, epub_type, title) for every closed <a>
        found inside a <nav> whose epub:type attribute equals *wanted*.

        nesting_level counts the <ol> elements currently open inside that nav.
        href is passed through untouched because it must stay in raw url
        encoded form.  title is the text collected inside the link, xml decoded.
        """
        qp = QuickXHTMLParser()
        qp.setContent(self.content)
        lvl = 0
        title = ""
        nav_type = None
        href = None
        epubtype = None
        for txt, tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                # keep text only while somewhere inside an <a>
                # (tp looks like a dotted path of enclosing tags - TODO confirm)
                if ".a." in tp or tp.endswith(".a"):
                    title = title + txt
                else:
                    title = ""
                continue
            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                continue
            if tname == "nav" and ttype == "end":
                nav_type = None
                continue
            if nav_type != wanted:
                continue
            if tname == "ol":
                if ttype == "begin":
                    lvl += 1
                if ttype == "end":
                    lvl -= 1
                continue
            if tname == "a" and ttype == "begin":
                href = tattr.get("href", "")
                epubtype = tattr.get("epub:type", None)
                continue
            if tname == "a" and ttype == "end":
                yield (lvl, href, epubtype, xmldecode(title))
                # reset so nothing leaks into the next link
                title = ""
                href = None
                epubtype = None

    def _replace_nav(self, wanted, placeholder, pattern, replacement,
                     insert_before_body_end=False):
        """Swap the <nav> whose epub:type equals *wanted* for *replacement*.

        The nav is first replaced by *placeholder* while the document is
        re-serialized, then the first match of *pattern* is replaced by
        *replacement*.  When insert_before_body_end is true and no such nav
        exists, the placeholder is put just before </body> instead.

        Returns True on success.  Returns False, leaving self.content
        untouched, when no placeholder standing alone on a line was found.
        """
        qp = QuickXHTMLParser()
        qp.setContent(self.content)
        nav_type = None
        res = []
        skip_output = False
        found = False
        for txt, tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                if not skip_output:
                    res.append(txt)
                continue
            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                if nav_type == wanted:
                    res.append(placeholder)
                    found = True
                    skip_output = True
                    continue
            if tname == "nav" and ttype == "end" and nav_type == wanted:
                nav_type = None
                skip_output = False
                continue
            if insert_before_body_end and not found and tname == "body" and ttype == "end":
                # reuse the whitespace in front of </body> so the placeholder
                # ends up alone on its own line, as the pattern requires
                padding = res[-1] if res else ""
                if padding.strip() or "\n" not in padding:
                    # the previous chunk is not a usable line break plus indent
                    res.append("\n")
                    padding = "\n"
                res.append(placeholder)
                res.append(padding)
                found = True
            if not skip_output:
                res.append(qp.tag_info_to_xml(tname, ttype, tattr))

        navsrc = "".join(res)
        m = pattern.search(navsrc)
        if m is None:
            return False
        self.content = navsrc[:m.start()] + replacement + navsrc[m.end():]
        return True

    # returns ordered list of tuples (play_order, nesting_level, href, title)
    # href is in url encoded form (percent encodings used if needed)
    # title has been xml decoded/unescaped
    def getTOC(self):
        """Return the toc nav as a list of (play_order, nesting_level, href, title).

        play_order counts the links from 1 in document order.
        """
        return [(po, lvl, href, title)
                for po, (lvl, href, _etype, title)
                in enumerate(self._iter_nav_links("toc"), 1)]

    # replace the TOC with ordered list of tuples (play_order, nesting_level, href, title)
    # href should be url encoded (percent encodings present if needed)
    # title should be xml decoded/unescaped
    def setTOC(self, toclist):
        """Replace the toc nav with one built from toclist.

        Returns True on success, False if the toc nav could not be replaced.
        """
        return self._replace_nav("toc", SIGIL_REPLACE_TOC_HERE,
                                 NAV_TOC_PATTERN, self.buildTOC(toclist))

    # returns ordered list of tuples (epubtype, href, title)
    # href is url encoded (percent encodings present if needed)
    # title has been xml decoded/unescaped
    def getLandmarks(self):
        """Return the landmarks nav as a list of (epubtype, href, title).

        Links that carry no epub:type attribute are left out.
        """
        return [(etype, href, title)
                for _lvl, href, etype, title in self._iter_nav_links("landmarks")
                if etype is not None]

    # replace the landmarks with ordered list of tuples (epubtype, href, title)
    # href should be url encoded (percent encodings present if needed)
    # title should be xml decoded/unescaped
    def setLandmarks(self, landmarks):
        """Replace the landmarks nav with one built from landmarks.

        Returns True on success, False if the landmarks nav could not be replaced.
        """
        return self._replace_nav("landmarks", SIGIL_REPLACE_LANDMARKS_HERE,
                                 NAV_LANDMARKS_PATTERN, self.buildLandmarks(landmarks))

    # returns ordered list of tuples (page_number, href, title)
    # href is url encoded (percent encodings if needed should be present)
    # title has been xml decoded/unescaped
    def getPageList(self):
        """Return the page-list nav as a list of (page_number, href, title).

        page_number counts the links from 1 in document order.
        """
        return [(pgcnt, href, title)
                for pgcnt, (_lvl, href, _etype, title)
                in enumerate(self._iter_nav_links("page-list"), 1)]

    # replace the page-list with ordered list of tuples (page_number, href, title)
    # href should be url encoded (percent encodings present if needed)
    # title should be xml decoded/unescaped
    def setPageList(self, pagelist):
        """Replace the page-list nav with one built from pagelist.

        If the document has no page-list nav and pagelist is not empty, the
        new nav is inserted just before </body>.

        Returns True on success, False if no replacement could be made.
        """
        return self._replace_nav("page-list", SIGIL_REPLACE_PAGELIST_HERE,
                                 NAV_PAGELIST_PATTERN, self.buildPageList(pagelist),
                                 insert_before_body_end=len(pagelist) > 0)

    # toclist is an ordered list of tuples (play_order, nesting_level, href, title)
    # hrefs should be in url encoded form (percent encodings present if needed)
    def buildTOC(self, toclist):
        """Return the xhtml of a toc nav built from toclist.

        Titles are xml encoded; hrefs are written out as given.  play_order is
        ignored, entries are emitted in list order.
        """
        navres = []
        ind = ' '
        ibase = ind * 3
        incr = ind * 2
        # start with the toc
        navres.append(ind * 2 + '<nav epub:type="toc" id="toc">\n')
        navres.append(ind * 3 + '<h1>Table of Contents</h1>\n')
        navres.append(ibase + '<ol>\n')
        curlvl = 1
        initial = True
        for _po, lvl, href, lbl in toclist:
            link = '<a href="%s">%s</a>\n' % (href, xmlencode(lbl))
            if lvl > curlvl:
                # going deeper: open a nested list inside the still open <li>
                while lvl > curlvl:
                    indent = ibase + incr * curlvl
                    navres.append(indent + '<ol>\n')
                    navres.append(indent + ind + '<li>\n')
                    navres.append(indent + ind * 2 + link)
                    curlvl += 1
            elif lvl < curlvl:
                # coming back up: close the nested lists first
                while lvl < curlvl:
                    indent = ibase + incr * (curlvl - 1)
                    navres.append(indent + ind + '</li>\n')
                    navres.append(indent + '</ol>\n')
                    curlvl -= 1
                indent = ibase + incr * (lvl - 1)
                navres.append(indent + ind + '</li>\n')
                navres.append(indent + ind + '<li>\n')
                navres.append(indent + ind * 2 + link)
            else:
                indent = ibase + incr * (lvl - 1)
                if not initial:
                    navres.append(indent + ind + '</li>\n')
                navres.append(indent + ind + '<li>\n')
                navres.append(indent + ind * 2 + link)
            initial = False
            curlvl = lvl
        while curlvl > 0:
            indent = ibase + incr * (curlvl - 1)
            # with an empty toclist no <li> was ever opened, so there is
            # nothing to close
            if not initial:
                navres.append(indent + ind + "</li>\n")
            navres.append(indent + "</ol>\n")
            curlvl -= 1
        navres.append(ind * 2 + '</nav>\n')
        return "".join(navres)

    # pagelist is an ordered list of tuples (page_number, href, title)
    # href should be url encoded (percent encodings present if needed)
    def buildPageList(self, pagelist):
        """Return the xhtml of a page-list nav, or "" when pagelist is empty.

        Titles are xml encoded; hrefs are written out as given.
        """
        navres = []
        ind = ' '
        # add any existing page-list if need be
        if len(pagelist) > 0:
            navres.append(ind * 2 + '<nav epub:type="page-list" id="page-list" hidden="">\n')
            navres.append(ind * 3 + '<ol>\n')
            for _pn, href, title in pagelist:
                title = xmlencode(title)
                navres.append(ind * 4 + '<li><a href="%s">%s</a></li>\n' % (href, title))
            navres.append(ind * 3 + '</ol>\n')
            navres.append(ind * 2 + '</nav>\n')
        return "".join(navres)

    # landmarks is an ordered list of tuples (epub_type, href, title)
    # href should be url encoded (percent encodings present if needed)
    def buildLandmarks(self, landmarks):
        """Return the xhtml of a landmarks nav built from landmarks.

        Titles are xml encoded; epub types and hrefs are written out as given.
        """
        navres = []
        ind = ' '
        navres.append(ind * 2 + '<nav epub:type="landmarks" id="landmarks" hidden="">\n')
        navres.append(ind * 3 + '<h2>Guide</h2>\n')
        navres.append(ind * 3 + '<ol>\n')
        for etyp, href, title in landmarks:
            title = xmlencode(title)
            navres.append(ind * 4 + '<li>\n')
            navres.append(ind * 5 + '<a epub:type="%s" href="%s">%s</a>\n' % (etyp, href, title))
            navres.append(ind * 4 + '</li>\n')
        navres.append(ind * 3 + '</ol>\n')
        navres.append(ind * 2 + '</nav>\n')
        return "".join(navres)

    # returns the nav source code as a unicode string in its current form
    def getNavSrc(self):
        """Return the current nav source text."""
        return self.content


def main(argv=sys.argv):
    """Round-trip the nav file named in argv[1] and print every result.

    Returns 0 on success, -1 when the argument count is wrong.
    """
    if len(argv) != 2:
        print("navprocessor.py nav_file_path")
        return -1
    navpath = argv[1]
    with open(navpath, 'rb') as f:
        navsrc = f.read()
    navsrc = navsrc.decode('utf-8')
    proc = NavProcessor(navsrc)
    landmarks = proc.getLandmarks()
    pagelist = proc.getPageList()
    toclist = proc.getTOC()
    print(toclist)
    print(landmarks)
    print(pagelist)
    print(proc.setLandmarks(landmarks))
    print(proc.setPageList(pagelist))
    print(proc.setTOC(toclist))
    print(proc.getNavSrc())
    return 0


if __name__ == '__main__':
    sys.exit(main())