1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3
4
5__license__ = 'GPL v3'
6__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
7
8from operator import itemgetter
9
10from lxml import etree
11
12from calibre.utils.icu import partition_by_first_letter, sort_key
13from polyglot.builtins import iteritems
14
15
16def get_applicable_xe_fields(index, xe_fields, XPath, expand):
17    iet = index.get('entry-type', None)
18    xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]
19
20    lr = index.get('letter-range', None)
21    if lr is not None:
22        sl, el = lr.partition('-')[0::2]
23        sl, el = sl.strip(), el.strip()
24        if sl and el:
25            def inrange(text):
26                return sl <= text[0] <= el
27            xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]
28
29    bmark = index.get('bookmark', None)
30    if bmark is None:
31        return xe_fields
32    attr = expand('w:name')
33    bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark}
34    ancestors = XPath('ancestor::w:bookmarkStart')
35
36    def contained(xe):
37        # Check if the xe field is contained inside a bookmark with the
38        # specified name
39        return bool(set(ancestors(xe['start_elem'])) & bookmarks)
40
41    return [xe for xe in xe_fields if contained(xe)]
42
43
44def make_block(expand, style, parent, pos):
45    p = parent.makeelement(expand('w:p'))
46    parent.insert(pos, p)
47    if style is not None:
48        ppr = p.makeelement(expand('w:pPr'))
49        p.append(ppr)
50        ps = ppr.makeelement(expand('w:pStyle'))
51        ppr.append(ps)
52        ps.set(expand('w:val'), style)
53    r = p.makeelement(expand('w:r'))
54    p.append(r)
55    t = r.makeelement(expand('w:t'))
56    t.set(expand('xml:space'), 'preserve')
57    r.append(t)
58    return p, t
59
60
61def add_xe(xe, t, expand):
62    run = t.getparent()
63    idx = run.index(t)
64    t.text = xe.get('text') or ' '
65    pt = xe.get('page-number-text', None)
66
67    if pt:
68        p = t.getparent().getparent()
69        r = p.makeelement(expand('w:r'))
70        p.append(r)
71        t2 = r.makeelement(expand('w:t'))
72        t2.set(expand('xml:space'), 'preserve')
73        t2.text = ' [%s]' % pt
74        r.append(t2)
75    # put separate entries on separate lines
76    run.insert(idx + 1, run.makeelement(expand('w:br')))
77    return xe['anchor'], run
78
79
80def process_index(field, index, xe_fields, log, XPath, expand):
81    '''
82    We remove all the word generated index markup and replace it with our own
83    that is more suitable for an ebook.
84    '''
85    styles = []
86    heading_text = index.get('heading', None)
87    heading_style = 'IndexHeading'
88    start_pos = None
89    for elem in field.contents:
90        if elem.tag.endswith('}p'):
91            s = XPath('descendant::pStyle/@w:val')(elem)
92            if s:
93                styles.append(s[0])
94            p = elem.getparent()
95            if start_pos is None:
96                start_pos = (p, p.index(elem))
97            p.remove(elem)
98
99    xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
100    if not xe_fields:
101        return [], []
102    if heading_text is not None:
103        groups = partition_by_first_letter(xe_fields, key=itemgetter('text'))
104        items = []
105        for key, fields in iteritems(groups):
106            items.append(key), items.extend(fields)
107        if styles:
108            heading_style = styles[0]
109    else:
110        items = sorted(xe_fields, key=lambda x:sort_key(x['text']))
111
112    hyperlinks = []
113    blocks = []
114    for item in reversed(items):
115        is_heading = not isinstance(item, dict)
116        style = heading_style if is_heading else None
117        p, t = make_block(expand, style, *start_pos)
118        if is_heading:
119            text = heading_text
120            if text.lower().startswith('a'):
121                text = item + text[1:]
122            t.text = text
123        else:
124            hyperlinks.append(add_xe(item, t, expand))
125            blocks.append(p)
126
127    return hyperlinks, blocks
128
129
130def split_up_block(block, a, text, parts, ldict):
131    prefix = parts[:-1]
132    a.text = parts[-1]
133    parent = a.getparent()
134    style = 'display:block; margin-left: %.3gem'
135    for i, prefix in enumerate(prefix):
136        m = 1.5 * i
137        span = parent.makeelement('span', style=style % m)
138        ldict[span]    = i
139        parent.append(span)
140        span.text = prefix
141    span = parent.makeelement('span', style=style % ((i + 1) * 1.5))
142    parent.append(span)
143    span.append(a)
144    ldict[span]    = len(prefix)
145
146
147"""
148The merge algorithm is a little tricky.
149We start with a list of elementary blocks. Each is an HtmlElement, a p node
150with a list of child nodes. The last child may be a link, and the earlier ones are
151just text.
152The list is in reverse order from what we want in the index.
153There is a dictionary ldict which records the level of each child node.
154
155Now we want to do a reduce-like operation, combining all blocks with the same
156top level index entry into a single block representing the structure of all
157references, subentries, etc. under that top entry.
158Here's the algorithm.
159
160Given a block p and the next block n, and the top level entries p1 and n1 in each
161block, which we assume have the same text:
162
163Start with (p, p1) and (n, n1).
164
165Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
166
167If there are no more levels in n, and we have a link in nk,
168then add the link from nk to the links for pk.
169This might be the first link for pk, or we might get a list of references.
170
171Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
172the same text, it must follow pk, it must come before we find any other p entries at
173the same level as pk, and it must have the same level as nk+1.
174
175If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
176
177If there is no matching entry, then because of the original reversed order we want
178to insert nk+1 and all following entries from n into p immediately following pk.
179"""
180
181
182def find_match(prev_block, pind, nextent, ldict):
183    curlevel = ldict.get(prev_block[pind], -1)
184    if curlevel < 0:
185        return -1
186    for p in range(pind+1, len(prev_block)):
187        trylev = ldict.get(prev_block[p], -1)
188        if trylev <= curlevel:
189            return -1
190        if trylev > (curlevel+1):
191            continue
192        if prev_block[p].text_content() == nextent.text_content():
193            return p
194    return -1
195
196
197def add_link(pent, nent, ldict):
198    na = nent.xpath('descendant::a[1]')
199    # If there is no link, leave it as text
200    if not na or len(na) == 0:
201        return
202    na = na[0]
203    pa = pent.xpath('descendant::a')
204    if pa and len(pa) > 0:
205        # Put on same line with a comma
206        pa = pa[-1]
207        pa.tail = ', '
208        p = pa.getparent()
209        p.insert(p.index(pa) + 1, na)
210    else:
211        # substitute link na for plain text in pent
212        pent.text = ""
213        pent.append(na)
214
215
216def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
217    # First elements match. Any more in next?
218    if len(next_path) == (nind + 1):
219        nextent = next_block[nind]
220        add_link(prev_block[pind], nextent, ldict)
221        return
222
223    nind = nind + 1
224    nextent = next_block[nind]
225    prevent = find_match(prev_block, pind, nextent, ldict)
226    if prevent > 0:
227        merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
228        return
229
230    # Want to insert elements into previous block
231    while nind < len(next_block):
232        # insert takes it out of old
233        pind = pind + 1
234        prev_block.insert(pind, next_block[nind])
235
236    next_block.getparent().remove(next_block)
237
238
239def polish_index_markup(index, blocks):
240    # Blocks are in reverse order at this point
241    path_map = {}
242    ldict = {}
243    for block in blocks:
244        cls = block.get('class', '') or ''
245        block.set('class', (cls + ' index-entry').lstrip())
246        a = block.xpath('descendant::a[1]')
247        text = ''
248        if a:
249            text = etree.tostring(a[0], method='text', with_tail=False, encoding='unicode').strip()
250        if ':' in text:
251            path_map[block] = parts = list(filter(None, (x.strip() for x in text.split(':'))))
252            if len(parts) > 1:
253                split_up_block(block, a[0], text, parts, ldict)
254        else:
255            # try using a span all the time
256            path_map[block] = [text]
257            parent = a[0].getparent()
258            span = parent.makeelement('span', style='display:block; margin-left: 0em')
259            parent.append(span)
260            span.append(a[0])
261            ldict[span] = 0
262
263        for br in block.xpath('descendant::br'):
264            br.tail = None
265
266    # We want a single block for each main entry
267    prev_block = blocks[0]
268    for block in blocks[1:]:
269        pp, pn = path_map[prev_block], path_map[block]
270        if pp[0] == pn[0]:
271            merge_blocks(prev_block, block, 0, 0, pn, ldict)
272        else:
273            prev_block = block
274