import os.path
import logging

from .pdfdevice import PDFTextDevice
from .pdffont import PDFUnicodeNotDefined
from .pdftypes import LITERALS_DCT_DECODE
from .pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from .layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
from .layout import LTFigure, LTImage, LTChar, LTTextLine
from .layout import LTTextBox, LTTextBoxVertical, LTTextGroup
from .utils import apply_matrix_pt, mult_matrix
from .utils import htmlescape, bbox2str, create_bmp


logger = logging.getLogger(__name__)


class PDFLayoutAnalyzer(PDFTextDevice):
    """Device that collects rendered content into a tree of layout items.

    Each page is gathered into an ``LTPage``; figures are nested through an
    explicit stack.  When a page ends, the finished ``LTPage`` is handed to
    ``receive_layout``, which subclasses override to consume it.
    """

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        """Store the starting page number and the layout parameters.

        ``laparams`` may be None, in which case ``end_page`` skips the
        ``analyze`` step on the page.
        """
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Containers suspended while a nested figure is being filled.
        self._stack = []

    def begin_page(self, page, ctm):
        """Start a new ``LTPage`` sized from the transformed media box."""
        (x0, y0, x1, y1) = page.mediabox
        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
        # Only the extent is kept; the page box is anchored at the origin.
        mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
        self.cur_item = LTPage(self.pageno, mediabox)

    def end_page(self, page):
        """Finish the current page and pass it to ``receive_layout``.

        Runs ``analyze`` on the page when layout parameters were given, then
        increments ``pageno``.
        """
        assert not self._stack
        assert isinstance(self.cur_item, LTPage)
        if self.laparams is not None:
            self.cur_item.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix):
        """Push the current container and start a nested ``LTFigure``."""
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))

    def end_figure(self, _):
        """Close the current figure and add it to its parent container."""
        fig = self.cur_item
        assert isinstance(self.cur_item, LTFigure)
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)

    def render_image(self, name, stream):
        """Add an ``LTImage`` spanning the bounding box of the current figure."""
        assert isinstance(self.cur_item, LTFigure)
        item = LTImage(name, stream,
                       (self.cur_item.x0, self.cur_item.y0,
                        self.cur_item.x1, self.cur_item.y1))
        self.cur_item.add(item)

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Convert a painted path into an ``LTLine``, ``LTRect`` or ``LTCurve``.

        The first element of every path segment is its operator letter; the
        concatenated letters form a shape signature used to recognise
        axis-aligned lines and rectangles.  Anything else, including a line
        or rectangle that is not axis-aligned after the transform, is stored
        as a generic curve.
        """
        shape = ''.join(x[0] for x in path)
        if shape == 'ml':
            # horizontal/vertical line
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            if x0 == x1 or y0 == y1:
                self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1)))
                return
        if shape == 'mlllh':
            # rectangle
            (_, x0, y0) = path[0]
            (_, x1, y1) = path[1]
            (_, x2, y2) = path[2]
            (_, x3, y3) = path[3]
            (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
            (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
            (x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
            (x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
            # Accept either winding order of an axis-aligned rectangle.
            if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                    (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
                return
        # other shapes
        pts = []
        for p in path:
            # Operands after the operator letter are read as (x, y) pairs.
            for i in range(1, len(p), 2):
                pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
        self.cur_item.add(LTCurve(gstate.linewidth, pts))

    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
        """Add an ``LTChar`` for ``cid`` and return its advance (``item.adv``).

        When the font raises ``PDFUnicodeNotDefined`` the text comes from
        ``handle_undefined_char`` instead.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), text
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp)
        self.cur_item.add(item)
        return item.adv

    def handle_undefined_char(self, font, cid):
        """Log the unmapped character and return a ``(cid:N)`` placeholder."""
        logger.warning('undefined: %r, %r', font, cid)
        return '(cid:%d)' % cid

    def receive_layout(self, ltpage):
        """Hook called from ``end_page`` with the finished page; no-op here."""
        pass


class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page."""

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.result = None

    def receive_layout(self, ltpage):
        """Remember ``ltpage``, replacing any previously stored page."""
        self.result = ltpage

    def get_result(self):
        """Return the last page received, or None if there was none yet."""
        return self.result


##  PDFConverter
##
class PDFConverter(PDFLayoutAnalyzer):
    """Base class for converters that write page layouts to ``outfp``."""

    # outfp is an fp opened in *text* mode
    def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None):
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp = outfp

    def write_image(self, image):
        """Write the image data into ``self.outdir`` and return the file name.

        The extension depends on the stream: ``.jpg`` for a single DCT
        filter (raw data is written unchanged), ``.bmp`` for DeviceRGB and
        DeviceGray images (converted with ``create_bmp``), ``.img`` otherwise.

        ``self.outdir`` is not set by this class; subclasses that call this
        method assign it in their own ``__init__``.
        """
        stream = image.stream
        filters = stream.get_filters()
        if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
            ext = '.jpg'
            data = stream.get_rawdata()
        elif image.colorspace is LITERAL_DEVICE_RGB:
            ext = '.bmp'
            data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
        elif image.colorspace is LITERAL_DEVICE_GRAY:
            ext = '.bmp'
            data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
        else:
            ext = '.img'
            data = stream.get_data()
        name = image.name+ext
        # NOTE(review): image.name is joined into the path as-is; confirm it
        # cannot contain path separators.
        path = os.path.join(self.outdir, name)
        # open() replaces the Python 2-only file() builtin; the context
        # manager closes the handle even if the write fails.
        with open(path, 'wb') as fp:
            fp.write(data)
        return name


##  TextConverter
##
class TextConverter(PDFConverter):
    """Converter that writes only the text of each page to ``outfp``."""

    def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None,
                 showpageno=False):
        PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno

    def write_text(self, text):
        """Write ``text`` to the output stream unchanged."""
        self.outfp.write(text)

    def receive_layout(self, ltpage):
        """Write the text of ``ltpage`` followed by a form feed.

        Containers are walked recursively, a newline is written after each
        text box, and an optional ``Page N`` header precedes the page.
        """
        def render(item):
            if isinstance(item, LTContainer):
                for child in item:
                    render(child)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())
            if isinstance(item, LTTextBox):
                self.write_text('\n')
        if self.showpageno:
            self.write_text('Page %s\n' % ltpage.pageid)
        render(ltpage)
        self.write_text('\f')

    # Some dummy functions to save memory/CPU when all that is wanted is text.
    # This stops all the image and drawing output from being recorded and taking
    # up RAM.
    def render_image(self, name, stream):
        pass

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        pass


##  HTMLConverter
##
class HTMLConverter(PDFConverter):
    """Converter that writes pages as absolutely positioned HTML elements."""

    # Colours merged into the per-instance tables when debug is enabled.
    RECT_COLORS = {
        #'char': 'green',
        'figure': 'yellow',
        'textline': 'magenta',
        'textbox': 'cyan',
        'textgroup': 'red',
        'curve': 'black',
        'page': 'gray',
        }

    TEXT_COLORS = {
        'textbox': 'blue',
        'char': 'black',
        }

    def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None,
                 scale=1, fontscale=0.7, layoutmode='normal', showpageno=True,
                 pagemargin=50, outdir=None,
                 rect_colors={'curve':'black', 'page':'gray'},
                 text_colors={'char':'black'},
                 debug=False):
        """Store the rendering options and write the HTML header.

        ``rect_colors`` and ``text_colors`` map item kinds to CSS colours;
        kinds missing from a table are not drawn.  ``debug`` merges
        ``RECT_COLORS`` and ``TEXT_COLORS`` into this instance's tables.
        Images are only emitted when ``outdir`` is not None.
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)
        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.outdir = outdir
        # Copy the tables: the signature defaults are shared between calls,
        # so updating them in place (debug=True) would leak debug colours
        # into every later instance and into caller-owned dicts.
        self.rect_colors = dict(rect_colors)
        self.text_colors = dict(text_colors)
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        # Running vertical offset; grows by page height and margin per page.
        self._yoffset = self.pagemargin
        # (fontname, fontsize) of the currently open <span>, or None.
        self._font = None
        self._fontstack = []
        self.write_header()

    def write(self, text):
        """Write raw markup to the output stream."""
        self.outfp.write(text)

    def write_header(self):
        """Write the opening HTML tags, declaring ``outfp.encoding``."""
        self.write('<html><head>\n')
        self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.outfp.encoding)
        self.write('</head><body>\n')

    def write_footer(self):
        """Write links to every page processed so far and close the document."""
        self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
                   ', '.join('<a href="#%s">%s</a>' % (i,i) for i in range(1,self.pageno)))
        self.write('</body></html>\n')

    def write_text(self, text):
        """Write ``text`` after passing it through ``htmlescape``."""
        self.write(htmlescape(text, self.outfp.encoding))

    def place_rect(self, color, borderwidth, x, y, w, h):
        """Write a bordered box if ``color`` has an entry in ``rect_colors``."""
        color = self.rect_colors.get(color)
        if color is not None:
            self.write('<span style="position:absolute; border: %s %dpx solid; '
                       'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
                       (color, borderwidth,
                        x*self.scale, (self._yoffset-y)*self.scale,
                        w*self.scale, h*self.scale))

    def place_border(self, color, borderwidth, item):
        """Draw a box around ``item`` using its bounding box attributes."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(self, item, borderwidth, x, y, w, h):
        """Save the image to ``outdir`` and write an ``<img>`` tag for it.

        Nothing is written when ``outdir`` is None.
        """
        if self.outdir is not None:
            name = self.write_image(item)
            # htmlescape replaces the previously undefined enc() helper.
            self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
                       'width="%d" height="%d" />\n' %
                       (htmlescape(name), borderwidth,
                        x*self.scale, (self._yoffset-y)*self.scale,
                        w*self.scale, h*self.scale))

    def place_text(self, color, text, x, y, size):
        """Write positioned text if ``color`` has an entry in ``text_colors``."""
        color = self.text_colors.get(color)
        if color is not None:
            self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
                       (color, x*self.scale, (self._yoffset-y)*self.scale, size*self.scale*self.fontscale))
            self.write_text(text)
            self.write('</span>\n')

    def begin_textbox(self, color, borderwidth, x, y, w, h, writing_mode):
        """Open a positioned ``<div>`` for a text box and reset the font state."""
        self._fontstack.append(self._font)
        self._font = None
        self.write('<div style="position:absolute; border: %s %dpx solid; writing-mode:%s; '
                   'left:%dpx; top:%dpx; width:%dpx; height:%dpx;">' %
                   (color, borderwidth, writing_mode,
                    x*self.scale, (self._yoffset-y)*self.scale,
                    w*self.scale, h*self.scale))

    def put_text(self, text, fontname, fontsize):
        """Write ``text``, opening a new font ``<span>`` when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write('</span>')
            self.write('<span style="font-family: %s; font-size:%dpx">' %
                       (fontname, fontsize * self.scale * self.fontscale))
            self._font = font
        self.write_text(text)

    def put_newline(self):
        """Write a line break."""
        self.write('<br>')

    def end_textbox(self, color):
        """Close any open font span, restore the saved font and close the div."""
        if self._font is not None:
            self.write('</span>')
        self._font = self._fontstack.pop()
        self.write('</div>')

    def receive_layout(self, ltpage):
        """Write ``ltpage`` as HTML and advance the vertical page offset.

        In ``'exact'`` layout mode text lines, boxes and characters are
        drawn as individually positioned borders and text; in every other
        mode text boxes become ``<div>`` blocks with flowing text, and a
        ``<br>`` follows each text line unless the mode is ``'loose'``.
        """
        def show_group(item):
            if isinstance(item, LTTextGroup):
                self.place_border('textgroup', 1, item)
                for child in item:
                    show_group(child)

        def render(item):
            if isinstance(item, LTPage):
                self._yoffset += item.y1
                self.place_border('page', 1, item)
                if self.showpageno:
                    self.write('<div style="position:absolute; top:%dpx;">' %
                               ((self._yoffset-item.y1)*self.scale))
                    self.write('<a name="%s">Page %s</a></div>\n' % (item.pageid, item.pageid))
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border('curve', 1, item)
            elif isinstance(item, LTFigure):
                self.place_border('figure', 1, item)
                for child in item:
                    render(child)
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            else:
                if self.layoutmode == 'exact':
                    if isinstance(item, LTTextLine):
                        self.place_border('textline', 1, item)
                        for child in item:
                            render(child)
                    elif isinstance(item, LTTextBox):
                        self.place_border('textbox', 1, item)
                        self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20)
                        for child in item:
                            render(child)
                    elif isinstance(item, LTChar):
                        self.place_border('char', 1, item)
                        self.place_text('char', item.get_text(), item.x0, item.y1, item.size)
                else:
                    if isinstance(item, LTTextLine):
                        for child in item:
                            render(child)
                        if self.layoutmode != 'loose':
                            self.put_newline()
                    elif isinstance(item, LTTextBox):
                        self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height,
                                           item.get_writing_mode())
                        for child in item:
                            render(child)
                        self.end_textbox('textbox')
                    elif isinstance(item, LTChar):
                        self.put_text(item.get_text(), item.fontname, item.size)
                    elif isinstance(item, LTText):
                        self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self):
        """Write the HTML footer."""
        self.write_footer()


class XMLConverter(PDFConverter):
    """Converter that writes the layout tree of each page as XML."""

    def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None, outdir=None):
        """Store ``outdir`` (used for extracted images) and write the header."""
        PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)
        self.outdir = outdir
        self.write_header()

    def write_header(self):
        """Write the XML declaration and open the ``<pages>`` element."""
        self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % self.outfp.encoding)
        self.outfp.write('<pages>\n')

    def write_footer(self):
        """Close the ``<pages>`` element."""
        self.outfp.write('</pages>\n')

    def write_text(self, text):
        """Write ``text`` after passing it through ``htmlescape``."""
        self.outfp.write(htmlescape(text, self.outfp.encoding))

    def receive_layout(self, ltpage):
        """Write ``ltpage`` and all of its children as nested XML elements.

        After the page content, a ``<layout>`` element lists the text groups
        when the page has any.  An item of an unexpected type trips an
        assertion.
        """
        def show_group(item):
            if isinstance(item, LTTextBox):
                self.outfp.write('<textbox id="%d" bbox="%s" />\n' %
                                 (item.index, bbox2str(item.bbox)))
            elif isinstance(item, LTTextGroup):
                self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.outfp.write('</textgroup>\n')

        def render(item):
            if isinstance(item, LTPage):
                self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
                                 (item.pageid, bbox2str(item.bbox), item.rotate))
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.outfp.write('<layout>\n')
                    for group in item.groups:
                        show_group(group)
                    self.outfp.write('</layout>\n')
                self.outfp.write('</page>\n')
            elif isinstance(item, LTLine):
                self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
                                 (item.linewidth, bbox2str(item.bbox)))
            elif isinstance(item, LTRect):
                self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
                                 (item.linewidth, bbox2str(item.bbox)))
            elif isinstance(item, LTCurve):
                self.outfp.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
                                 (item.linewidth, bbox2str(item.bbox), item.get_pts()))
            elif isinstance(item, LTFigure):
                self.outfp.write('<figure name="%s" bbox="%s">\n' %
                                 (item.name, bbox2str(item.bbox)))
                for child in item:
                    render(child)
                self.outfp.write('</figure>\n')
            elif isinstance(item, LTTextLine):
                self.outfp.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.outfp.write('</textline>\n')
            elif isinstance(item, LTTextBox):
                wmode = ''
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                self.outfp.write('<textbox id="%d" bbox="%s"%s>\n' %
                                 (item.index, bbox2str(item.bbox), wmode))
                for child in item:
                    render(child)
                self.outfp.write('</textbox>\n')
            elif isinstance(item, LTChar):
                self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
                                 (htmlescape(item.fontname), bbox2str(item.bbox), item.size))
                self.write_text(item.get_text())
                self.outfp.write('</text>\n')
            elif isinstance(item, LTText):
                # Escaped like LTChar text so markup characters in the text
                # cannot break the XML.
                self.outfp.write('<text>')
                self.write_text(item.get_text())
                self.outfp.write('</text>\n')
            elif isinstance(item, LTImage):
                if self.outdir:
                    name = self.write_image(item)
                    # htmlescape replaces the previously undefined enc() helper.
                    self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
                                     (htmlescape(name), item.width, item.height))
                else:
                    self.outfp.write('<image width="%d" height="%d" />\n' %
                                     (item.width, item.height))
            else:
                assert 0, item

        render(ltpage)

    def close(self):
        """Write the XML footer."""
        self.write_footer()