1#!/usr/local/bin/python3.8 2# vim:fileencoding=utf-8 3 4 5__license__ = 'GPL v3' 6__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' 7 8from operator import itemgetter 9 10from lxml import etree 11 12from calibre.utils.icu import partition_by_first_letter, sort_key 13from polyglot.builtins import iteritems 14 15 16def get_applicable_xe_fields(index, xe_fields, XPath, expand): 17 iet = index.get('entry-type', None) 18 xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet] 19 20 lr = index.get('letter-range', None) 21 if lr is not None: 22 sl, el = lr.partition('-')[0::2] 23 sl, el = sl.strip(), el.strip() 24 if sl and el: 25 def inrange(text): 26 return sl <= text[0] <= el 27 xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))] 28 29 bmark = index.get('bookmark', None) 30 if bmark is None: 31 return xe_fields 32 attr = expand('w:name') 33 bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark} 34 ancestors = XPath('ancestor::w:bookmarkStart') 35 36 def contained(xe): 37 # Check if the xe field is contained inside a bookmark with the 38 # specified name 39 return bool(set(ancestors(xe['start_elem'])) & bookmarks) 40 41 return [xe for xe in xe_fields if contained(xe)] 42 43 44def make_block(expand, style, parent, pos): 45 p = parent.makeelement(expand('w:p')) 46 parent.insert(pos, p) 47 if style is not None: 48 ppr = p.makeelement(expand('w:pPr')) 49 p.append(ppr) 50 ps = ppr.makeelement(expand('w:pStyle')) 51 ppr.append(ps) 52 ps.set(expand('w:val'), style) 53 r = p.makeelement(expand('w:r')) 54 p.append(r) 55 t = r.makeelement(expand('w:t')) 56 t.set(expand('xml:space'), 'preserve') 57 r.append(t) 58 return p, t 59 60 61def add_xe(xe, t, expand): 62 run = t.getparent() 63 idx = run.index(t) 64 t.text = xe.get('text') or ' ' 65 pt = xe.get('page-number-text', None) 66 67 if pt: 68 p = t.getparent().getparent() 69 r = p.makeelement(expand('w:r')) 70 p.append(r) 71 t2 = r.makeelement(expand('w:t')) 72 t2.set(expand('xml:space'), 'preserve') 73 t2.text = ' [%s]' % pt 74 r.append(t2) 75 # put separate entries on separate lines 76 run.insert(idx + 1, run.makeelement(expand('w:br'))) 77 return xe['anchor'], run 78 79 80def process_index(field, index, xe_fields, log, XPath, expand): 81 ''' 82 We remove all the word generated index markup and replace it with our own 83 that is more suitable for an ebook. 84 ''' 85 styles = [] 86 heading_text = index.get('heading', None) 87 heading_style = 'IndexHeading' 88 start_pos = None 89 for elem in field.contents: 90 if elem.tag.endswith('}p'): 91 s = XPath('descendant::pStyle/@w:val')(elem) 92 if s: 93 styles.append(s[0]) 94 p = elem.getparent() 95 if start_pos is None: 96 start_pos = (p, p.index(elem)) 97 p.remove(elem) 98 99 xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand) 100 if not xe_fields: 101 return [], [] 102 if heading_text is not None: 103 groups = partition_by_first_letter(xe_fields, key=itemgetter('text')) 104 items = [] 105 for key, fields in iteritems(groups): 106 items.append(key), items.extend(fields) 107 if styles: 108 heading_style = styles[0] 109 else: 110 items = sorted(xe_fields, key=lambda x:sort_key(x['text'])) 111 112 hyperlinks = [] 113 blocks = [] 114 for item in reversed(items): 115 is_heading = not isinstance(item, dict) 116 style = heading_style if is_heading else None 117 p, t = make_block(expand, style, *start_pos) 118 if is_heading: 119 text = heading_text 120 if text.lower().startswith('a'): 121 text = item + text[1:] 122 t.text = text 123 else: 124 hyperlinks.append(add_xe(item, t, expand)) 125 blocks.append(p) 126 127 return hyperlinks, blocks 128 129 130def split_up_block(block, a, text, parts, ldict): 131 prefix = parts[:-1] 132 a.text = parts[-1] 133 parent = a.getparent() 134 style = 'display:block; margin-left: %.3gem' 135 for i, prefix in enumerate(prefix): 136 m = 1.5 * i 137 span = parent.makeelement('span', style=style % m) 138 ldict[span] = i 139 parent.append(span) 140 span.text = prefix 141 span = parent.makeelement('span', style=style % ((i + 1) * 1.5)) 142 parent.append(span) 143 span.append(a) 144 ldict[span] = len(prefix) 145 146 147""" 148The merge algorithm is a little tricky. 149We start with a list of elementary blocks. Each is an HtmlElement, a p node 150with a list of child nodes. The last child may be a link, and the earlier ones are 151just text. 152The list is in reverse order from what we want in the index. 153There is a dictionary ldict which records the level of each child node. 154 155Now we want to do a reduce-like operation, combining all blocks with the same 156top level index entry into a single block representing the structure of all 157references, subentries, etc. under that top entry. 158Here's the algorithm. 159 160Given a block p and the next block n, and the top level entries p1 and n1 in each 161block, which we assume have the same text: 162 163Start with (p, p1) and (n, n1). 164 165Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge: 166 167If there are no more levels in n, and we have a link in nk, 168then add the link from nk to the links for pk. 169This might be the first link for pk, or we might get a list of references. 170 171Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have 172the same text, it must follow pk, it must come before we find any other p entries at 173the same level as pk, and it must have the same level as nk+1. 174 175If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1). 176 177If there is no matching entry, then because of the original reversed order we want 178to insert nk+1 and all following entries from n into p immediately following pk. 179""" 180 181 182def find_match(prev_block, pind, nextent, ldict): 183 curlevel = ldict.get(prev_block[pind], -1) 184 if curlevel < 0: 185 return -1 186 for p in range(pind+1, len(prev_block)): 187 trylev = ldict.get(prev_block[p], -1) 188 if trylev <= curlevel: 189 return -1 190 if trylev > (curlevel+1): 191 continue 192 if prev_block[p].text_content() == nextent.text_content(): 193 return p 194 return -1 195 196 197def add_link(pent, nent, ldict): 198 na = nent.xpath('descendant::a[1]') 199 # If there is no link, leave it as text 200 if not na or len(na) == 0: 201 return 202 na = na[0] 203 pa = pent.xpath('descendant::a') 204 if pa and len(pa) > 0: 205 # Put on same line with a comma 206 pa = pa[-1] 207 pa.tail = ', ' 208 p = pa.getparent() 209 p.insert(p.index(pa) + 1, na) 210 else: 211 # substitute link na for plain text in pent 212 pent.text = "" 213 pent.append(na) 214 215 216def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict): 217 # First elements match. Any more in next? 218 if len(next_path) == (nind + 1): 219 nextent = next_block[nind] 220 add_link(prev_block[pind], nextent, ldict) 221 return 222 223 nind = nind + 1 224 nextent = next_block[nind] 225 prevent = find_match(prev_block, pind, nextent, ldict) 226 if prevent > 0: 227 merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict) 228 return 229 230 # Want to insert elements into previous block 231 while nind < len(next_block): 232 # insert takes it out of old 233 pind = pind + 1 234 prev_block.insert(pind, next_block[nind]) 235 236 next_block.getparent().remove(next_block) 237 238 239def polish_index_markup(index, blocks): 240 # Blocks are in reverse order at this point 241 path_map = {} 242 ldict = {} 243 for block in blocks: 244 cls = block.get('class', '') or '' 245 block.set('class', (cls + ' index-entry').lstrip()) 246 a = block.xpath('descendant::a[1]') 247 text = '' 248 if a: 249 text = etree.tostring(a[0], method='text', with_tail=False, encoding='unicode').strip() 250 if ':' in text: 251 path_map[block] = parts = list(filter(None, (x.strip() for x in text.split(':')))) 252 if len(parts) > 1: 253 split_up_block(block, a[0], text, parts, ldict) 254 else: 255 # try using a span all the time 256 path_map[block] = [text] 257 parent = a[0].getparent() 258 span = parent.makeelement('span', style='display:block; margin-left: 0em') 259 parent.append(span) 260 span.append(a[0]) 261 ldict[span] = 0 262 263 for br in block.xpath('descendant::br'): 264 br.tail = None 265 266 # We want a single block for each main entry 267 prev_block = blocks[0] 268 for block in blocks[1:]: 269 pp, pn = path_map[prev_block], path_map[block] 270 if pp[0] == pn[0]: 271 merge_blocks(prev_block, block, 0, 0, pn, ldict) 272 else: 273 prev_block = block 274