1#!/usr/local/bin/python3.8
2# -*- coding: utf-8 -*-
3# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4
5# Copyright (c) 2019-2020 Kevin B. Hendricks
6# All rights reserved.
7#
8# Redistribution and use in source and binary forms, with or without modification,
9# are permitted provided that the following conditions are met:
10#
11# 1. Redistributions of source code must retain the above copyright notice, this list of
12# conditions and the following disclaimer.
13#
14# 2. Redistributions in binary form must reproduce the above copyright notice, this list
15# of conditions and the following disclaimer in the documentation and/or other materials
16# provided with the distribution.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
19# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
21# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
26# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28import sys
29import re
30from quickparser import QuickXHTMLParser
31
# Placeholder comments written into the nav source where a section was removed.
SIGIL_REPLACE_LANDMARKS_HERE = "<!-- SIGIL_REPLACE_LANDMARKS_HERE -->"
SIGIL_REPLACE_PAGELIST_HERE = "<!-- SIGIL_REPLACE_PAGELIST_HERE -->"
SIGIL_REPLACE_TOC_HERE = "<!-- SIGIL_REPLACE_TOC_HERE -->"

# Each pattern matches its placeholder only when it is the sole non-whitespace
# content between a line start and a line end (MULTILINE anchors).
NAV_TOC_PATTERN = re.compile(
    r'^\s*<!--\s*SIGIL_REPLACE_TOC_HERE\s*-->\s*$',
    re.MULTILINE,
)
NAV_PAGELIST_PATTERN = re.compile(
    r'^\s*<!--\s*SIGIL_REPLACE_PAGELIST_HERE\s*-->\s*$',
    re.MULTILINE,
)
NAV_LANDMARKS_PATTERN = re.compile(
    r'^\s*<!--\s*SIGIL_REPLACE_LANDMARKS_HERE\s*-->\s*$',
    re.MULTILINE,
)
39
40# encode/escape text to make it xml safe
def xmlencode(data):
    """Return data with &, <, > and the double quote replaced by XML entities.

    None yields an empty string.  The input is first run through
    xmldecode(), so text that is already escaped is not escaped twice.
    """
    if data is None:
        return ''
    escaped = xmldecode(data)
    # '&' has to be handled first: the entities introduced by the later
    # replacements contain ampersands that must not be escaped again.
    for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'), ('"', '&quot;')):
        escaped = escaped.replace(char, entity)
    return escaped
50
51# decode xml encoded/escaped strings
def xmldecode(data):
    """Return data with &quot;, &gt;, &lt; and &amp; replaced by their characters.

    None yields an empty string.
    """
    if data is None:
        return ''
    # '&amp;' is resolved last so that e.g. '&amp;lt;' decodes one level
    # to '&lt;' instead of collapsing all the way to '<'.
    for entity, char in (('&quot;', '"'), ('&gt;', '>'), ('&lt;', '<'), ('&amp;', '&')):
        data = data.replace(entity, char)
    return data
61
62
class NavProcessor(object):
    """Read and rewrite the toc, landmarks and page-list sections of a nav document.

    The nav source is held as a unicode string in self.content.  The get*
    methods parse it with QuickXHTMLParser and return lists of tuples; the
    set* methods rebuild one section from such a list and store the
    modified source back into self.content.
    """

    def __init__(self, navsrc, codec='utf-8'):
        """Store the nav source.

        navsrc may be None (treated as ""), bytes (decoded using codec)
        or an already decoded string.
        """
        if navsrc is None:
            navsrc = ""
        if isinstance(navsrc, bytes):
            navsrc = navsrc.decode(codec)
        self.content = navsrc

    def _iterAnchors(self, nav_kind):
        """Yield (nesting_level, href, epub_type, title) for each <a> of one nav.

        Only anchors inside a <nav> whose epub:type equals nav_kind are
        reported.  nesting_level counts the <ol> elements currently open
        inside that nav, href and epub_type are the anchor's attribute values
        as delivered by the parser (href defaults to "", epub_type to None),
        and title is the text collected inside the anchor, xml decoded.
        """
        qp = QuickXHTMLParser()
        qp.setContent(self.content)
        nav_type = None
        lvl = 0
        title = ""
        href = None
        epubtype = None
        for txt, tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                # text counts towards the title only while inside an <a>
                # (tp is presumably the dotted path of enclosing tags)
                if ".a." in tp or tp.endswith(".a"):
                    title += txt
                else:
                    title = ""
                continue
            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                continue
            if tname == "nav" and ttype == "end":
                nav_type = None
                continue
            if nav_type != nav_kind:
                continue
            if tname == "ol":
                if ttype == "begin":
                    lvl += 1
                elif ttype == "end":
                    lvl -= 1
            elif tname == "a" and ttype == "begin":
                # hrefs must be left in raw url encoded form since they
                # may contain fragments
                href = tattr.get("href", "")
                epubtype = tattr.get("epub:type", None)
                # drop text left over from anchors of other nav sections;
                # without whitespace between tags nothing else resets it
                title = ""
            elif tname == "a" and ttype == "end":
                yield (lvl, href, epubtype, xmldecode(title))
                title = ""
                href = None
                epubtype = None

    def _replaceNav(self, nav_kind, placeholder, pattern, new_xhtml, add_if_missing=False):
        """Swap the nav of epub:type nav_kind for new_xhtml.

        The nav source is re-serialized with every matching nav replaced by
        placeholder; the first placeholder matched by pattern is then
        replaced by new_xhtml.  With add_if_missing set, a placeholder is
        inserted just before </body> when no matching nav was seen.

        Returns True and updates self.content on success; returns False and
        leaves self.content untouched when pattern finds no placeholder.
        """
        qp = QuickXHTMLParser()
        qp.setContent(self.content)
        nav_type = None
        res = []
        skip_output = False
        found = False
        for txt, tp, tname, ttype, tattr in qp.parse_iter():
            if txt is not None:
                if not skip_output:
                    res.append(txt)
                continue
            if tname == "nav" and ttype == "begin":
                nav_type = tattr.get("epub:type", None)
                if nav_type == nav_kind:
                    res.append(placeholder)
                    found = True
                    skip_output = True
                    continue
            if tname == "nav" and ttype == "end" and nav_type == nav_kind:
                nav_type = None
                skip_output = False
                continue
            if add_if_missing and not found and tname == "body" and ttype == "end":
                # reuse the whitespace in front of </body> as padding, but
                # only if it really is whitespace containing a line break:
                # pattern needs the placeholder alone on its line
                padding = res[-1] if res else ""
                if padding.strip() or "\n" not in padding:
                    res.append("\n")
                    padding = "\n"
                res.append(placeholder)
                res.append(padding)
                found = True
            if not skip_output:
                res.append(qp.tag_info_to_xml(tname, ttype, tattr))

        navsrc = "".join(res)
        m = pattern.search(navsrc)
        if m is None:
            return False
        self.content = navsrc[:m.start()] + new_xhtml + navsrc[m.end():]
        return True

    def getTOC(self):
        """Return the toc as an ordered list of (play_order, nesting_level, href, title).

        play_order counts the entries starting at 1.  href is left exactly
        as found in the nav (url encoded form); title is xml decoded.
        """
        return [(po, lvl, href, title)
                for po, (lvl, href, _etype, title)
                in enumerate(self._iterAnchors("toc"), 1)]

    def setTOC(self, toclist):
        """Replace the toc nav with one built from toclist.

        toclist is an ordered list of (play_order, nesting_level, href, title)
        with url encoded hrefs and xml decoded titles.  Returns True on
        success, False if the nav source has no toc nav to replace.
        """
        return self._replaceNav("toc", SIGIL_REPLACE_TOC_HERE, NAV_TOC_PATTERN,
                                self.buildTOC(toclist))

    def getLandmarks(self):
        """Return the landmarks as an ordered list of (epubtype, href, title).

        Anchors without an epub:type attribute are skipped.  href is left
        exactly as found in the nav (url encoded form); title is xml decoded.
        """
        return [(etype, href, title)
                for _lvl, href, etype, title in self._iterAnchors("landmarks")
                if etype is not None]

    def setLandmarks(self, landmarks):
        """Replace the landmarks nav with one built from landmarks.

        landmarks is an ordered list of (epubtype, href, title) with url
        encoded hrefs and xml decoded titles.  Returns True on success,
        False if the nav source has no landmarks nav to replace.
        """
        return self._replaceNav("landmarks", SIGIL_REPLACE_LANDMARKS_HERE,
                                NAV_LANDMARKS_PATTERN,
                                self.buildLandmarks(landmarks))

    def getPageList(self):
        """Return the page-list as an ordered list of (page_number, href, title).

        page_number counts the entries starting at 1.  href is left exactly
        as found in the nav (url encoded form); title is xml decoded.
        """
        return [(pgcnt, href, title)
                for pgcnt, (_lvl, href, _etype, title)
                in enumerate(self._iterAnchors("page-list"), 1)]

    def setPageList(self, pagelist):
        """Replace the page-list nav with one built from pagelist.

        pagelist is an ordered list of (page_number, href, title) with url
        encoded hrefs and xml decoded titles.  When the nav source has no
        page-list nav and pagelist is not empty, the new nav is inserted
        just before </body>.  Returns True on success, False if there was
        nothing to replace or no place to insert.
        """
        return self._replaceNav("page-list", SIGIL_REPLACE_PAGELIST_HERE,
                                NAV_PAGELIST_PATTERN,
                                self.buildPageList(pagelist),
                                add_if_missing=len(pagelist) > 0)

    def buildTOC(self, toclist):
        """Return the xhtml of a toc nav for toclist.

        toclist is an ordered list of (play_order, nesting_level, href, title);
        hrefs are written as given, titles are xml encoded.  Nesting levels
        below 1 are treated as 1.  When an entry is more than one level deeper
        than its predecessor (or the first entry is deeper than level 1), its
        link is repeated at each skipped level so that every nested <ol> sits
        inside an <li> that starts with a link.
        """
        navres = []
        ind = '  '
        ibase = ind * 3
        incr = ind * 2
        # start with the toc
        navres.append(ind * 2 + '<nav epub:type="toc" id="toc">\n')
        navres.append(ind * 3 + '<h1>Table of Contents</h1>\n')
        navres.append(ibase + '<ol>\n')
        curlvl = 1
        initial = True
        for po, lvl, href, lbl in toclist:
            link = '<a href="%s">%s</a>\n' % (href, xmlencode(lbl))
            # a level below 1 would close lists that were never opened
            lvl = max(lvl, 1)
            if initial and lvl > curlvl:
                # first entry is nested: open the level 1 item that will
                # hold the nested lists, otherwise the closing tags emitted
                # at the end have no matching <li>
                navres.append(ibase + ind + '<li>\n')
                navres.append(ibase + ind * 2 + link)
            if lvl > curlvl:
                while lvl > curlvl:
                    indent = ibase + incr * curlvl
                    navres.append(indent + '<ol>\n')
                    navres.append(indent + ind + '<li>\n')
                    navres.append(indent + ind * 2 + link)
                    curlvl += 1
            elif lvl < curlvl:
                while lvl < curlvl:
                    indent = ibase + incr * (curlvl - 1)
                    navres.append(indent + ind + '</li>\n')
                    navres.append(indent + '</ol>\n')
                    curlvl -= 1
                indent = ibase + incr * (lvl - 1)
                navres.append(indent + ind + '</li>\n')
                navres.append(indent + ind + '<li>\n')
                navres.append(indent + ind * 2 + link)
            else:
                indent = ibase + incr * (lvl - 1)
                if not initial:
                    navres.append(indent + ind + '</li>\n')
                navres.append(indent + ind + '<li>\n')
                navres.append(indent + ind * 2 + link)
            initial = False
            curlvl = lvl
        if initial:
            # empty toclist: no <li> was opened, so only the list is closed
            navres.append(ibase + '</ol>\n')
        else:
            while curlvl > 0:
                indent = ibase + incr * (curlvl - 1)
                navres.append(indent + ind + "</li>\n")
                navres.append(indent + "</ol>\n")
                curlvl -= 1
        navres.append(ind * 2 + '</nav>\n')
        return "".join(navres)

    def buildPageList(self, pagelist):
        """Return the xhtml of a page-list nav for pagelist.

        pagelist is an ordered list of (page_number, href, title); hrefs are
        written as given, titles are xml encoded.  An empty pagelist yields
        an empty string (no nav at all).
        """
        navres = []
        ind = '  '
        if len(pagelist) > 0:
            navres.append(ind * 2 + '<nav epub:type="page-list" id="page-list" hidden="">\n')
            navres.append(ind * 3 + '<ol>\n')
            for pn, href, title in pagelist:
                title = xmlencode(title)
                navres.append(ind * 4 + '<li><a href="%s">%s</a></li>\n' % (href, title))
            navres.append(ind * 3 + '</ol>\n')
            navres.append(ind * 2 + '</nav>\n')
        return "".join(navres)

    def buildLandmarks(self, landmarks):
        """Return the xhtml of a landmarks nav for landmarks.

        landmarks is an ordered list of (epub_type, href, title); epub_type
        and href are written as given, titles are xml encoded.
        """
        navres = []
        ind = '  '
        navres.append(ind * 2 + '<nav epub:type="landmarks" id="landmarks" hidden="">\n')
        navres.append(ind * 3 + '<h2>Guide</h2>\n')
        navres.append(ind * 3 + '<ol>\n')
        for etyp, href, title in landmarks:
            title = xmlencode(title)
            navres.append(ind * 4 + '<li>\n')
            navres.append(ind * 5 + '<a epub:type="%s" href="%s">%s</a>\n' % (etyp, href, title))
            navres.append(ind * 4 + '</li>\n')
        navres.append(ind * 3 + '</ol>\n')
        navres.append(ind * 2 + '</nav>\n')
        return "".join(navres)

    def getNavSrc(self):
        """Return the nav source as a unicode string in its current form."""
        return self.content
419
420
def main(argv=sys.argv):
    """Command line self test: parse a nav file, print its lists, rebuild it.

    argv must hold exactly the program name and the path of a nav file.
    The file is read as utf-8, its toc, landmarks and page-list are printed,
    each section is then rebuilt from the extracted data (printing the
    result of each set call) and finally the resulting source is printed.

    Returns 0 on success, -1 after printing a usage line when the argument
    count is wrong.
    """
    if len(argv) != 2:
        print("navprocessor.py nav_file_path")
        return -1

    with open(argv[1], 'rb') as navfile:
        raw = navfile.read()
    processor = NavProcessor(raw.decode('utf-8'))

    # extract everything first, in the same order as before
    landmarks = processor.getLandmarks()
    pagelist = processor.getPageList()
    toclist = processor.getTOC()
    for extracted in (toclist, landmarks, pagelist):
        print(extracted)

    # round trip: write each section back from what was just read
    print(processor.setLandmarks(landmarks))
    print(processor.setPageList(pagelist))
    print(processor.setTOC(toclist))
    print(processor.getNavSrc())
    return 0


if __name__ == '__main__':
    sys.exit(main())
445